## 1. Define the problem

For this project, our task is to design a machine learning model that predicts whether an item is new or used, and then evaluate the model on the held-out test data.

The dataset is already provided, but we have to do some preprocessing work on it.

## 2. Gather the data

In this case, a function is provided to us to build the dataset, so we'll take a look at it.

Also, we'll build the dataset into a dataframe so we can explore its data a little.

In [1]:
import json
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
def build_dataset(path: str = "../data/MLA_100k.jsonlines", test_size: int = 10000) -> tuple:
    """Load the JSON-lines dataset and split it into train and test parts.

    Each non-blank line of the file is parsed as one JSON object.  The last
    ``test_size`` records form the test split and everything before them the
    train split.  The ``"condition"`` field is removed from every record and
    returned separately as the label.

    Args:
        path: Location of the JSON-lines file.
        test_size: Number of trailing records held out for testing.  When the
            file has fewer records than this, all of them go to the test
            split and the train split is empty.

    Returns:
        A ``(X_train, y_train, X_test, y_test)`` tuple, where the ``X`` values
        are lists of dicts without the ``"condition"`` key and the ``y``
        values are lists with the matching ``"condition"`` value (``None``
        for a record that had no such key).

    Raises:
        ValueError: If ``test_size`` is negative.
    """
    if test_size < 0:
        raise ValueError("test_size must be non-negative")

    # The context manager closes the file; the explicit encoding avoids
    # depending on the platform default when decoding the JSON text.
    with open(path, encoding="utf-8") as fh:
        data = [json.loads(line) for line in fh if line.strip()]

    split = max(len(data) - test_size, 0)
    X_train = data[:split]
    X_test = data[split:]

    # pop() reads and removes the label in one step and, like the lookup with
    # .get(), tolerates records that have no "condition" key.
    y_train = [item.pop("condition", None) for item in X_train]
    y_test = [item.pop("condition", None) for item in X_test]

    return X_train, y_train, X_test, y_test

In [3]:
# Load the records and their labels for both splits.
X_train, y_train, X_test, y_test = build_dataset()

In [4]:
# Number of keys in the first training record.
len(X_train[0])

47

In [5]:
# Number of keys in the first test record.
len(X_test[0])

47

In [6]:
# Number of training records.
len(X_train)

90000

In [7]:
# Number of training labels.
len(y_train)

90000

In [8]:
# Number of test records.
len(X_test)

10000

In [9]:
# Number of test labels.
len(y_test)

10000

Now, we'll build the dataframes. We won't take into consideration pictures, some of the seller_address information, seller_contact, some of the shipping information, and many other features.

In [10]:
from collections import defaultdict

In [11]:
# Top-level record keys that are dropped before the dataframes are built.
keys_to_del = [
    "sub_status",
    "deal_ids",
    "seller_id",
    "variations",
    "location",
    "attributes",
    "tags",
    "parent_item_id",
    "coverage_areas",
    "category_id",
    "descriptions",
    "last_updated",
    "pictures",
    "id",
    "non_mercado_pago_payment_methods",
    "thumbnail",
    "date_created",
    "secure_thumbnail",
    "stop_time",
    "subtitle",
    "start_time",
    "permalink",
    "geolocation",
]

In [12]:
# list.copy() is shallow: the new list would still hold the very same dicts,
# so deleting keys from the "copy" would also strip them from X_train / X_test.
# Copy each record dict as well so the originals stay intact.
X_train_copy = [dict(item) for item in X_train]
X_test_copy = [dict(item) for item in X_test]

In [13]:
def delete_not_used_keys(dataset: list, keys=None) -> list:
    """Remove unwanted keys from every record of ``dataset`` in place.

    Args:
        dataset: List of dicts; each dict is modified in place.
        keys: Iterable of keys to remove.  Defaults to the module-level
            ``keys_to_del`` list when omitted.

    Returns:
        The same ``dataset`` list object, with the keys removed.
    """
    unwanted = keys_to_del if keys is None else keys
    for item in dataset:
        for key in unwanted:
            # pop() with a default instead of ``del`` so a record lacking the
            # key, or a second run over already-cleaned data, does not raise
            # KeyError.
            item.pop(key, None)
    return dataset

In [14]:
# Drop the keys listed in keys_to_del from every record of both splits.
X_train_copy = delete_not_used_keys(X_train_copy)
X_test_copy = delete_not_used_keys(X_test_copy)

In [15]:
def get_data(dataset: list) -> dict:
    """Turn a list of record dicts into a mapping of column name to values.

    Two keys get special handling:

    * ``"seller_address"`` is expanded into ``seller_address_country``,
      ``seller_address_city`` and ``seller_address_state``, each taken from
      the ``"name"`` entry of the corresponding nested dict.
    * ``"shipping"`` is reduced to ``shipping_free``, taken from its
      ``"free_shipping"`` entry.

    Every other key is copied as is.

    Args:
        dataset: List of record dicts.

    Returns:
        A ``defaultdict(list)`` mapping each column name to the list of
        values collected for it, in record order.
    """
    columns = defaultdict(list)
    for record in dataset:
        for field, value in record.items():
            if field == "seller_address":
                # Keep only the display name of each address component.
                for part in ("country", "city", "state"):
                    columns[f"{field}_{part}"].append(value[part]["name"])
                continue
            if field == "shipping":
                columns[f"{field}_free"].append(value["free_shipping"])
                continue
            columns[field].append(value)

    return columns

In [16]:
# Flatten the cleaned training records into per-column lists.
data_train = get_data(X_train_copy)

In [17]:
# Flatten the cleaned test records into per-column lists.
data_test = get_data(X_test_copy)

In [18]:
# Column names collected for the training split.
data_train.keys()

dict_keys(['seller_address_country', 'seller_address_city', 'seller_address_state', 'warranty', 'seller_contact', 'base_price', 'shipping_free', 'site_id', 'listing_type_id', 'price', 'buying_mode', 'listing_source', 'international_delivery_mode', 'official_store_id', 'differential_pricing', 'accepts_mercadopago', 'original_price', 'currency_id', 'title', 'automatic_relist', 'status', 'video_id', 'catalog_product_id', 'initial_quantity', 'sold_quantity', 'available_quantity'])

In [19]:
# Column names collected for the test split.
data_test.keys()

dict_keys(['seller_address_country', 'seller_address_city', 'seller_address_state', 'warranty', 'seller_contact', 'base_price', 'shipping_free', 'site_id', 'listing_type_id', 'price', 'buying_mode', 'listing_source', 'international_delivery_mode', 'official_store_id', 'differential_pricing', 'accepts_mercadopago', 'original_price', 'currency_id', 'title', 'automatic_relist', 'status', 'video_id', 'catalog_product_id', 'initial_quantity', 'sold_quantity', 'available_quantity'])

In [24]:
def review_dataset_size(data: dict, what_data: str, expected_size: "int | None" = None) -> None:
    """Print the length of every column in ``data`` and verify it.

    Args:
        data: Mapping of column name to the list of values for that column.
        what_data: Label of the split; printed upper-cased as a header.
        expected_size: Length every column must have.  When omitted, 90000 is
            used if ``what_data`` is ``"train"`` and 10000 otherwise.

    Raises:
        AssertionError: If a column's length differs from the expected size.
            The offending column's length is printed before the error is
            raised.
    """
    if expected_size is None:
        expected_size = 90000 if what_data == "train" else 10000

    print(f"{what_data.upper()}")
    print("=" * 5)
    for key, values in data.items():
        print(f"The lenght for {key.upper()} is {len(values)}")
        # Raised explicitly rather than through an ``assert`` statement so the
        # check is still performed when Python runs with -O.
        if len(values) != expected_size:
            raise AssertionError(
                f"column {key!r} has {len(values)} values, expected {expected_size}"
            )

# Check the column lengths of the training split first, then the test split.
for split_name, split_columns in (("train", data_train), ("test", data_test)):
    review_dataset_size(split_columns, what_data=split_name)

TRAIN
=====
The lenght for SELLER_ADDRESS_COUNTRY is 90000
The lenght for SELLER_ADDRESS_CITY is 90000
The lenght for SELLER_ADDRESS_STATE is 90000
The lenght for WARRANTY is 90000
The lenght for SELLER_CONTACT is 90000
The lenght for BASE_PRICE is 90000
The lenght for SHIPPING_FREE is 90000
The lenght for SITE_ID is 90000
The lenght for LISTING_TYPE_ID is 90000
The lenght for PRICE is 90000
The lenght for BUYING_MODE is 90000
The lenght for LISTING_SOURCE is 90000
The lenght for INTERNATIONAL_DELIVERY_MODE is 90000
The lenght for OFFICIAL_STORE_ID is 90000
The lenght for DIFFERENTIAL_PRICING is 90000
The lenght for ACCEPTS_MERCADOPAGO is 90000
The lenght for ORIGINAL_PRICE is 90000
The lenght for CURRENCY_ID is 90000
The lenght for TITLE is 90000
The lenght for AUTOMATIC_RELIST is 90000
The lenght for STATUS is 90000
The lenght for VIDEO_ID is 90000
The lenght for CATALOG_PRODUCT_ID is 90000
The lenght for INITIAL_QUANTITY is 90000
The lenght for SOLD_QUANTITY is 90000
The lenght for 

##### Create Dataframes

In [182]:
# Start from an empty frame whose columns are the training column names.
train_df = pd.DataFrame(columns=list(data_train.keys()))

In [183]:
# Fill the frame one column at a time, echoing each column name first.
for column_name in train_df.columns:
    print(column_name)
    train_df[column_name] = data_train[column_name]

seller_address_country
seller_address_city
seller_address_state
warranty
seller_contact
base_price
shipping_free
site_id
listing_type_id
price
buying_mode
listing_source
international_delivery_mode
official_store_id
differential_pricing
accepts_mercadopago
original_price
currency_id
title
automatic_relist
status
video_id
catalog_product_id
initial_quantity
sold_quantity
available_quantity


In [184]:
# Start from an empty frame whose columns are the test column names.
test_df = pd.DataFrame(columns=list(data_test.keys()))

In [185]:
# Fill the frame one column at a time, echoing each column name first.
for column_name in test_df.columns:
    print(column_name)
    test_df[column_name] = data_test[column_name]

seller_address_country
seller_address_city
seller_address_state
warranty
seller_contact
base_price
shipping_free
site_id
listing_type_id
price
buying_mode
listing_source
international_delivery_mode
official_store_id
differential_pricing
accepts_mercadopago
original_price
currency_id
title
automatic_relist
status
video_id
catalog_product_id
initial_quantity
sold_quantity
available_quantity


In [186]:
# Attach the labels to each frame as a "target" column.
train_df["target"] = y_train
test_df["target"] = y_test

In [31]:
def create_dataframe(data: dict, y: list) -> pd.DataFrame:
    """Build a dataframe from column lists and append the labels.

    Args:
        data: Mapping of column name to the list of values for that column.
            Column order in the frame follows the mapping's key order.
        y: Labels, one per row; stored in a column named ``"target"``.

    Returns:
        A new dataframe with one column per key of ``data`` followed by the
        ``"target"`` column.
    """
    # The constructor builds every column in one pass; there is no need to
    # create an empty frame and assign the columns one by one.
    df = pd.DataFrame(data)
    df["target"] = y

    return df

In [32]:
# Build both frames, labels included, through the helper.
train_df = create_dataframe(data_train, y_train)
test_df = create_dataframe(data_test, y_test)

In [33]:
# (rows, columns) of each frame.
train_df.shape, test_df.shape

((90000, 27), (10000, 27))

##### Save Dataframes to csv files

In [38]:
def save_dataframe(df: pd.DataFrame, data_type: str, out_dir: str = "../data") -> None:
    """Write ``df`` to ``<out_dir>/<data_type>.csv`` without the index.

    Args:
        df: Dataframe to save.
        data_type: Base name of the output file, e.g. ``"train"``.
        out_dir: Directory that receives the file.  It is created, together
            with any missing parents, if it does not exist yet.
    """
    from pathlib import Path

    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(target_dir / f"{data_type}.csv", index=False)

In [39]:
# Write the training frame first, then the test frame.
for split_name, frame in (("train", train_df), ("test", test_df)):
    save_dataframe(frame, data_type=split_name)