# 1. SETTINGS

In [None]:
# import packages
import pandas as pd
import numpy as np
import os

In [None]:
# pandas display: never truncate the column list when a frame is printed
pd.options.display.max_columns = None

In [None]:
# silence every warning category for the rest of the session
import warnings
warnings.filterwarnings(action = 'ignore', category = Warning)

In [None]:
# make sure automatic garbage collection is switched on
import gc
if not gc.isenabled():
    gc.enable()

In [None]:
# helper functions
import functions
from functions import *

# 2. IMPORT

In [None]:
# import data
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour of
# `on_bad_lines`. Pick whichever keyword the installed pandas accepts so the
# cell runs on both old and new versions. In both cases malformed rows are
# skipped (with a warning) instead of aborting the read.
import inspect
if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
    bad_lines = {'on_bad_lines': 'warn'}
else:
    bad_lines = {'error_bad_lines': False}

train = pd.read_csv('../data/X_train.csv', sep = ',', decimal = '.', **bad_lines)
y     = pd.read_csv('../data/y_train.csv', sep = ',', decimal = '.', **bad_lines)
test  = pd.read_csv('../data/X_test.csv',  sep = ',', decimal = '.', **bad_lines)

# report (rows, columns) of each table
print(train.shape)
print(y.shape)
print(test.shape)

In [None]:
# check data: preview the first rows of the training features (X_train.csv)
train.head()

In [None]:
# check data: preview the first rows of the test features (X_test.csv)
test.head()

# 3. MERGER

In [None]:
# attach the target: known values for train (aligned on the row index),
# NaN placeholders for test so both frames share the same columns
target = 'duration'
test[target]  = np.nan
train[target] = y[target]

In [None]:
# align columns: put both frames into the same alphabetical column order
# (`DataFrame.reindex_axis` was deprecated in pandas 0.21 and removed in 1.0;
#  `reindex(columns = ...)` is the supported equivalent)
train = train.reindex(columns = sorted(train.columns))
test  = test.reindex(columns = sorted(test.columns))

# check equality: element-wise comparison of the two column indexes
# (requires both frames to have the same number of columns)
train.columns == test.columns

In [None]:
# stack train on top of test and renumber the rows 0..n-1 in one step
df = pd.concat([train, test], axis = 0, ignore_index = True)

# the separate frames are no longer needed
del train
del test

print(df.shape)

In [None]:
# check distributions: summary statistics for the combined train + test frame
df.describe()

# 4. PROCESSING

### MISSING VALUES

In [None]:
# check missings
# NOTE(review): `count_missings` is not defined in this notebook; presumably it
# comes from the star import of the local `functions` module and reports the
# missing values per column -- confirm against functions.py
count_missings(df)

In [None]:
# fill missing values with 0
# The result is assigned back to the column instead of calling
# `fillna(..., inplace = True)` on `df[var]`: the in-place call operates on an
# intermediate selection, which triggers chained-assignment warnings and does
# not modify `df` at all when pandas Copy-on-Write is active.
nas = ['image_url']
for var in nas:
    df[var] = df[var].fillna(0)

### VARIABLE TYPES

In [None]:
# check data types: show the dtype of every column in df
df.dtypes

In [None]:
# columns to convert; all three lists are currently empty, so nothing is changed
to_int = []   # float columns   -> int64
bools  = []   # boolean columns -> object
dates  = []   # string columns  -> datetime

# plain dtype casts, applied in the order listed
for target_dtype, columns in (('int64', to_int), ('object', bools)):
    for name in columns:
        df[name] = df[name].astype(target_dtype)

# date parsing: cast to string first, then let pandas infer the format
for name in dates:
    as_text  = df[name].astype('str')
    df[name] = pd.to_datetime(as_text, infer_datetime_format = True)

# 5. WORKING WITH FEATURES

### EXTRACT FEATURES FROM TEXT

In [None]:
# derive features from the free-text and category columns
# (shape is printed before and after to show how many columns were added)
text_vars = [
    'product_description',
    'product_name',
    'store_name',
    'sub_category_1',
    'sub_category_2',
    'sub_category_3',
    'sub_category_4',
]
print(df.shape)
df = add_text_features(df, strings = text_vars, k = 5, keep = True)
print(df.shape)

### DROP IRRELEVANT FEATURES

In [None]:
# drop constant columns: a column is kept only if it has more or fewer than
# one distinct value, with NaN counted as a value of its own
print(df.shape)
distinct_counts = df.nunique(dropna = False)
is_informative  = distinct_counts != 1
df = df.loc[:, is_informative]
print(df.shape)

In [None]:
# delete columns that are not used as features
drops = ['image_url']
print(df.shape)   # before
for column in drops:
    del df[column]
print(df.shape)   # after

### CREATE NEW FEATURES

In [None]:
# image area: height times width, computed row by row
height, width = df['image_height'], df['image_width']
df['pixels'] = height * width

### ENCODE FACTORS

In [None]:
# convert categorical features
# NOTE(review): `encode_factors` is not defined in this notebook; presumably it
# comes from the star import of the local `functions` module and, with
# method = 'label', label-encodes the categorical columns -- confirm
df = encode_factors(df, method = 'label')
# preview the first rows of the encoded frame
df.head()

# 6. EXPORT

In [None]:
# write the processed frame to disk without the row index, then show its shape
out_path = '../data/data_v2.csv'
df.to_csv(out_path, index = False)
df.shape