# SETTINGS

In [None]:
########## LIBRARIES

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats

import os
import time
import datetime
import random
import multiprocessing
import pickle
import warnings
import gc

In [None]:
########## HELPER FUNCTIONS

# Install or upgrade the dptools helper package via pip. The leading '!' is an
# IPython shell escape, so this line only works inside Jupyter/IPython.
# NOTE(review): the version is not pinned, so each run may pick up a different
# release -- consider pinning it for reproducibility.
!pip install --upgrade dptools
# Wildcard import. The helpers called later in this notebook
# (print_factor_levels, find_constant_features, print_missings,
# split_nested_features, save_csv_version) are not defined here and presumably
# come from this import -- confirm against the dptools package.
from dptools import *

In [None]:
########## SETTINGS

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas and
# other libraries used below.
warnings.filterwarnings('ignore')
# Do not truncate the column list when a data frame is displayed.
pd.set_option('display.max_columns', None)
# Use matplotlib's dark theme for all figures.
plt.style.use('dark_background')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline
# Make sure automatic garbage collection is switched on.
gc.enable()

# DATA IMPORT

In [None]:
# Load the three raw tables in order (infos, items, orders); every file is
# pipe-delimited.
infos, items, orders = (
    pd.read_csv(csv_path, sep = '|')
    for csv_path in (
        '../data/raw/infos.csv',
        '../data/raw/items.csv',
        '../data/raw/orders.csv',
    )
)

# Report the (rows, columns) shape of each table, one per line.
print(infos.shape, items.shape, orders.shape, sep = '\n')

In [None]:
# Preview the first rows of the infos table.
infos.head()

In [None]:
# Preview the first rows of the items table.
items.head()

In [None]:
# Preview the first rows of the orders table.
orders.head()

# PROCESSING

### MERGE INFOS AND ITEMS

In [None]:
# Table sizes before the join, one per line.
print(infos.shape, items.shape, sep = '\n')

# Left join on itemID with infos as the left table, so every row of infos is
# kept; the result replaces the items table.
items = infos.merge(items, how = 'left', on = 'itemID')
print(items.shape)

# infos now lives inside items; drop the standalone reference.
del infos

### CONVERT FEATURE TYPES

In [None]:
# Column dtypes of both tables, framed and separated by 50-dash rules.
print('-' * 50, items.dtypes, '-' * 50, orders.dtypes, '-' * 50, sep = '\n')

In [None]:
# Cast identifier-like columns to strings stored in object columns so they are
# handled as labels rather than numbers.
# A bare astype('str') can turn missing values into the literal text 'nan',
# which isna()-based checks no longer recognise as missing. The .where() call
# keeps the string wherever the original value was present and restores a real
# NaN everywhere else.

# items
for var in ['itemID', 'brand', 'manufacturer', 'category1', 'category2', 'category3']:
    items[var] = items[var].astype('str').astype('object').where(items[var].notna(), np.nan)

# orders
for var in ['transactID', 'itemID']:
    orders[var] = orders[var].astype('str').astype('object').where(orders[var].notna(), np.nan)

# dates
# infer_datetime_format is no longer passed: it has been deprecated since
# pandas 2.0, where consistent format inference became the default behaviour.
orders['time'] = pd.to_datetime(orders['time'].astype('str'))

### CHECK FEATURES

In [None]:
# Inspect the categorical columns of items via the dptools helper.
# presumably `top = 3` limits the report to the 3 most frequent levels per
# column -- confirm against dptools
print_factor_levels(items, top = 3)

In [None]:
# Same inspection for the orders table.
# presumably `top = 3` limits the report to the 3 most frequent levels per
# column -- confirm against dptools
print_factor_levels(orders, top = 3)

In [None]:
# Look for constant columns in items via the dptools helper.
# presumably "constant" means a single distinct value -- confirm against dptools
find_constant_features(items)

In [None]:
# Look for constant columns in orders via the dptools helper.
# presumably "constant" means a single distinct value -- confirm against dptools
find_constant_features(orders)

### MISSING VALUES

In [None]:
# change zeros to NA where relevant
# NOTE(review): presumably 0 encodes "unknown" in the raw data for these two
# columns -- confirm against the data description.
# brand was cast to string earlier in this notebook, so its zero is the text '0'
items.loc[items['brand']          == '0', 'brand']        = np.nan
# customerRating is compared against the number 0
items.loc[items['customerRating'] == 0, 'customerRating'] = np.nan

In [None]:
# Report missing values in items via the dptools helper.
print_missings(items)

In [None]:
# Report missing values in orders via the dptools helper.
print_missings(orders)

### UNFOLD PROMOTIONS

In [None]:
# split promotion feature
# split_nested_features is a dptools helper; presumably it splits the
# comma-separated 'promotion' column into several 'promotion_*' columns (the
# following cell selects columns by that name pattern) -- TODO confirm
items = split_nested_features(items, split_vars = 'promotion', sep = ',')
# preview the result
items.head()

In [None]:
# convert date types
# Select every column whose name contains 'promotion_' and parse it as datetime.
promotion_vars = items.filter(like = 'promotion_').columns
for var in promotion_vars:
    # infer_datetime_format is no longer passed: it has been deprecated since
    # pandas 2.0, where consistent format inference became the default.
    items[var] = pd.to_datetime(items[var])

# show the resulting column dtypes
items.dtypes

# EXPORT

In [None]:
# Save both prepared tables, orders first and then items.
# save_csv_version() automatically adds version number to prevent overwriting
# NOTE(review): index = False and compression = 'gzip' are handed to the
# dptools helper; the target names end in '.csv' although the content is
# presumably gzip-compressed -- confirm how the helper names its output.
for out_path, frame in (
    ('../data/prepared/orders.csv', orders),
    ('../data/prepared/items.csv',  items),
):
    save_csv_version(out_path, frame, index = False, compression = 'gzip')

# Final table sizes, one per line.
print(orders.shape, items.shape, sep = '\n')