# Train and Test Preparation

## Load Data

In [1]:
import pandas as pd
# Read both cleaned datasets and order their rows by the 'timestamp' column,
# so that later order-preserving splits follow time.
# NOTE(review): no date parsing is requested here, so the sort presumably compares
# the timestamp strings as text - fine for ISO-formatted values, confirm for others.
df_electronics = pd.read_csv('electronics_nonull.csv').sort_values('timestamp')
df_modcloth = pd.read_csv('modcloth_nonull.csv').sort_values('timestamp')

In [3]:
# Overview of the electronics frame: column names, row count and a 5-row preview.
print("list of columns in electronics.csv", df_electronics.columns, sep="\n")
print("number of rows in electronics.csv", len(df_electronics))
print("top 5 rows")
df_electronics.head(5)

list of columns in electronics.csv
Index(['item_id', 'user_id', 'rating', 'timestamp', 'model_attr', 'category',
       'year', 'split', 'item_id_count', 'user_id_count'],
      dtype='object')
number of rows in electronics.csv 1292954
top 5 rows


Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,year,split,item_id_count,user_id_count
0,0,0,5.0,1999-06-13,Female,Portable Audio & Video,1999,0,118,1
1,0,1,5.0,1999-06-14,Female,Portable Audio & Video,1999,0,118,1
2,0,2,3.0,1999-06-17,Female,Portable Audio & Video,1999,0,118,1
3,0,3,1.0,1999-07-01,Female,Portable Audio & Video,1999,0,118,1
4,0,4,2.0,1999-07-06,Female,Portable Audio & Video,1999,0,118,1


In [4]:
# Overview of the modcloth frame: column names, row count and a 5-row preview.
print("list of columns in modcloth.csv", df_modcloth.columns, sep="\n")
print("number of rows in modcloth.csv", len(df_modcloth))
print("top 5 rows")
df_modcloth.head(5)

list of columns in modcloth.csv
Index(['item_id', 'user_id', 'rating', 'timestamp', 'model_attr', 'category',
       'year', 'split', 'size', 'fit', 'user_attr', 'item_id_count',
       'user_id_count'],
      dtype='object')
number of rows in modcloth.csv 99892
top 5 rows


Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,year,split,size,fit,user_attr,item_id_count,user_id_count
0,7443,Alex,4,2010-01-21 08:00:00+00:00,Small,Dresses,2012,0,Unknown Size,Unknown Fit,Small,1011,66
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,Small,Dresses,2012,0,Unknown Size,Unknown Fit,Unknown User Attribute,1011,1
2,7443,Robyn,4,2010-01-29 08:00:00+00:00,Small,Dresses,2012,0,Unknown Size,Unknown Fit,Small,1011,30
3,7443,De,4,2010-02-13 08:00:00+00:00,Small,Dresses,2012,0,Unknown Size,Unknown Fit,Unknown User Attribute,1011,1
4,7443,tasha,4,2010-02-18 08:00:00+00:00,Small,Dresses,2012,0,Unknown Size,Unknown Fit,Small,1011,12


## Find the count of user_ids and item_ids

In [2]:
# Rows whose item_id and/or user_id occurs only once should always be kept in the
# training data set, so record for every row how often its ids occur.
#
# Counting approach borrowed from
# https://stackoverflow.com/questions/29791785/python-pandas-add-a-column-to-my-dataframe-that-counts-a-variable
#
# NOTE(review): the counts are taken over the complete frames, i.e. they are not
# restricted to rows that later end up in the training set.

# electronics.csv first, then modcloth.csv
for ratings_frame in (df_electronics, df_modcloth):
    for id_col, count_col in (('item_id', 'item_id_count'), ('user_id', 'user_id_count')):
        # group size of each id, broadcast back onto every row of that group
        ratings_frame[count_col] = ratings_frame.groupby(id_col)[id_col].transform('count')

## Check which values are numeric

In [5]:
# check if values in a column are numeric

def numeric_check(df):
    """Print, for each column of ``df``, whether every value in it parses as a number.

    A column is reported as numeric only if ``pd.to_numeric`` can convert all of
    its values; values that cannot be converted (and values that are already
    missing) make the column non-numeric. Nothing is returned.
    """
    for c in df.columns:
        # numeric test borrowed from
        # https://stackoverflow.com/questions/54426845/how-to-check-if-a-pandas-dataframe-contains-only-numeric-column-wise/54427157
        # unparseable entries become NaN under errors='coerce'
        coerced = pd.to_numeric(df[c], errors='coerce')
        is_numeric = coerced.notna().all()
        print("Column : ", c, " , Is numeric? : ", is_numeric)
        
# Run the numeric check on electronics.csv and then on modcloth.csv,
# with one blank line between the two reports.
frames_to_check = [('electronics.csv', df_electronics), ('modcloth.csv', df_modcloth)]
for position, (label, frame) in enumerate(frames_to_check):
    if position > 0:
        print()
    print(label)
    numeric_check(frame)

electronics.csv
Column :  item_id  , Is numeric? :  True
Column :  user_id  , Is numeric? :  True
Column :  rating  , Is numeric? :  True
Column :  timestamp  , Is numeric? :  False
Column :  model_attr  , Is numeric? :  False
Column :  category  , Is numeric? :  False
Column :  year  , Is numeric? :  True
Column :  split  , Is numeric? :  True
Column :  item_id_count  , Is numeric? :  True
Column :  user_id_count  , Is numeric? :  True

modcloth.csv
Column :  item_id  , Is numeric? :  True
Column :  user_id  , Is numeric? :  False
Column :  rating  , Is numeric? :  True
Column :  timestamp  , Is numeric? :  False
Column :  model_attr  , Is numeric? :  False
Column :  category  , Is numeric? :  False
Column :  year  , Is numeric? :  True
Column :  split  , Is numeric? :  True
Column :  size  , Is numeric? :  False
Column :  fit  , Is numeric? :  False
Column :  user_attr  , Is numeric? :  False
Column :  item_id_count  , Is numeric? :  True
Column :  user_id_count  , Is numeric? :  True

## Split into training and test data set

In [50]:
# split into train and test

from sklearn.model_selection import train_test_split


def _split_keeping_ids_in_train(df, test_size, id_columns=('user_id', 'item_id')):
    """Order-preserving train/test split where every test id also occurs in train.

    The frame is first cut with ``train_test_split(..., shuffle=False)``, so the
    first rows go to train and the last rows to test (the caller is expected to
    have sorted ``df`` already). Then, for each column in ``id_columns`` in turn,
    all test rows whose id never occurs in the current training part are moved
    from test to train.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; must contain every column named in ``id_columns``.
    test_size : float or int
        Passed straight to ``train_test_split``. Because rows are moved to train
        afterwards, the effective test share can be smaller.
    id_columns : sequence of str
        Id columns to process, in this order.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``train`` gets a fresh 0..n-1 index once rows are appended; ``test``
        keeps the index labels of ``df``.
    """
    train_part, test_part = train_test_split(df, test_size=test_size, shuffle=False)

    for id_column in id_columns:
        # test rows whose id is not present anywhere in the training part
        unseen = ~test_part[id_column].isin(train_part[id_column])
        # append them to the training part ...
        train_part = pd.concat([train_part, test_part[unseen]], ignore_index=True)
        # ... and drop them from the test part
        test_part = test_part[~unseen]

    # rows are only moved between the two parts, never dropped or duplicated
    assert len(train_part) + len(test_part) == len(df)
    return train_part, test_part


def _report_split(label, df_full, df_train, df_test):
    """Print the sizes of a train/test split and their share of the full frame."""
    print(label)
    print("Size of training dataset: ", df_train.shape[0])
    print("Size of testing dataset: ", df_test.shape[0])
    print("Effective train ratio: ", df_train.shape[0] / df_full.shape[0])
    print("Effective test ratio: ", df_test.shape[0] / df_full.shape[0])


# electronics.csv (nominal test share 0.5 before unseen ids are moved to train)
df_electronics_train, df_electronics_test = _split_keeping_ids_in_train(
    df_electronics, test_size=0.5)
_report_split("electronics.csv", df_electronics, df_electronics_train, df_electronics_test)

print()

# modcloth.csv (nominal test share 0.125 before unseen ids are moved to train)
df_modcloth_train, df_modcloth_test = _split_keeping_ids_in_train(
    df_modcloth, test_size=0.125)
_report_split("modcloth.csv", df_modcloth, df_modcloth_train, df_modcloth_test)

electronics.csv
Size of training dataset:  1251918
Size of testing dataset:  41036
Effective train ratio:  0.9682618252466831
Effective test ratio:  0.031738174753316827

modcloth.csv
Size of training dataset:  92133
Size of testing dataset:  7759
Effective train ratio:  0.9223261122011772
Effective test ratio:  0.07767388779882273


## Encode the categorical data and scale the scalar data

In [60]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

def one_hot_encode(col_2Darray,ohe_pretrained=None):
    """One-hot encode a 2D array, either fitting a new encoder or reusing one.

    Parameters
    ----------
    col_2Darray : 2D array-like
        Data to encode (callers pass a single column reshaped to ``(-1, 1)``).
    ohe_pretrained : fitted encoder or None
        ``None`` fits a new ``OneHotEncoder`` (dense output, unknown categories
        ignored) on ``col_2Darray``. Otherwise the given encoder is only used to
        transform.

    Returns
    -------
    tuple
        ``(encoder, encoded_values, feature_names)`` when a new encoder was
        fitted, ``(encoded_values, feature_names)`` when ``ohe_pretrained`` was
        supplied.
    """
    # identity check: never delegate the decision to the encoder's own __eq__
    fit_new_encoder = ohe_pretrained is None

    if fit_new_encoder:
        # newer scikit-learn names the dense-output switch 'sparse_output';
        # older releases only know 'sparse'
        try:
            ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
        # fit one hot encoder and transform
        col_2Darray_ohe = ohe.fit_transform(col_2Darray)
    else:
        # use pretrained one hot encoder, transform only
        ohe = ohe_pretrained
        col_2Darray_ohe = ohe.transform(col_2Darray)

    # get feature names: prefer the current API, fall back to the legacy method
    # on encoders that do not provide it
    names_getter = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
    col_2Darray_feat_names = names_getter()

    if fit_new_encoder:
        return ohe,col_2Darray_ohe,col_2Darray_feat_names
    return col_2Darray_ohe,col_2Darray_feat_names


def standard_scale(col_2Darray,scaler_pretrained=None):
    """Standardise a 2D array, either fitting a new scaler or reusing one.

    Parameters
    ----------
    col_2Darray : 2D array-like
        Data to scale (callers pass a single column reshaped to ``(-1, 1)``).
    scaler_pretrained : fitted scaler or None
        ``None`` fits a new ``StandardScaler`` on ``col_2Darray``. Otherwise the
        given scaler is only used to transform.

    Returns
    -------
    ``(scaler, scaled_values)`` when a new scaler was fitted, otherwise just
    ``scaled_values``.
    """
    # identity check: never delegate the decision to the scaler's own __eq__
    if scaler_pretrained is not None:
        # use pretrained scaler, transform only
        return scaler_pretrained.transform(col_2Darray)

    # init scaler, then fit and transform in one go
    scaler = StandardScaler()
    col_2Darray_scaled = scaler.fit_transform(col_2Darray)
    return scaler,col_2Darray_scaled


def ordinal_encode(col_2Darray,ordenc_pretrained=None):
    """Ordinal-encode a 2D array, either fitting a new encoder or reusing one.

    Parameters
    ----------
    col_2Darray : 2D array-like
        Data to encode (callers pass a single column reshaped to ``(-1, 1)``).
    ordenc_pretrained : fitted encoder or None
        ``None`` fits a new ``OrdinalEncoder(dtype=np.int32)`` on
        ``col_2Darray``. Otherwise the given encoder is only used to transform.

    Returns
    -------
    ``(encoder, encoded_values)`` when a new encoder was fitted, otherwise just
    ``encoded_values``.
    """
    # identity check: never delegate the decision to the encoder's own __eq__
    if ordenc_pretrained is not None:
        # use pretrained ordinal encoder, transform only
        return ordenc_pretrained.transform(col_2Darray)

    # init ordinal encoder producing int32 codes
    ordenc = OrdinalEncoder(dtype=np.int32)
    # fit ordinal encoder and transform
    col_2Darray_ordenc = ordenc.fit_transform(col_2Darray)
    return ordenc,col_2Darray_ordenc


### Electronics dataset

In [54]:
# electronics.csv - training dataset
# Build an all-numeric version of the training frame: categorical fields are
# one hot encoded and 'year' is standardized. The encoders and the scaler fitted
# here are kept in model_attr_ohe, category_ohe and year_scaler.
df_electronics_train_numonly = pd.DataFrame()

# item_id, user_id and rating are already numeric => copy them unchanged
# (timestamp is not carried over: not needed after ordering and split)
for numeric_col in ['item_id', 'user_id', 'rating']:
    df_electronics_train_numonly[numeric_col] = df_electronics_train[numeric_col]

# model_attr => one hot encoding, one column named 'model_attr_<feat_name>' per feature
model_attr_ohe, model_attr_ohe_values, model_attr_ohe_feat_names = one_hot_encode(
    df_electronics_train[['model_attr']].to_numpy()
)
for feat_name, feat_values in zip(model_attr_ohe_feat_names, model_attr_ohe_values.T):
    df_electronics_train_numonly['model_attr_' + str(feat_name)] = feat_values

# category => one hot encoding, one column named 'category_<feat_name>' per feature
category_ohe, category_ohe_values, category_ohe_feat_names = one_hot_encode(
    df_electronics_train[['category']].to_numpy()
)
for feat_name, feat_values in zip(category_ohe_feat_names, category_ohe_values.T):
    df_electronics_train_numonly['category_' + str(feat_name)] = feat_values

# year => standardize to have 0 mean and variance as 1
year_scaler, year_scaled = standard_scale(df_electronics_train[['year']].to_numpy())
df_electronics_train_numonly['year'] = year_scaled

# remaining numeric fields => copy them unchanged
for numeric_col in ['split', 'user_id_count', 'item_id_count']:
    df_electronics_train_numonly[numeric_col] = df_electronics_train[numeric_col]



In [55]:
# electronics.csv - testing dataset
# Build an all-numeric version of the test frame using the already fitted
# model_attr_ohe, category_ohe and year_scaler (transform only, no refitting).
df_electronics_test_numonly = pd.DataFrame()

# item_id, user_id and rating are already numeric => copy them unchanged
# (timestamp is not carried over: not needed after ordering and split)
for numeric_col in ['item_id', 'user_id', 'rating']:
    df_electronics_test_numonly[numeric_col] = df_electronics_test[numeric_col]

# model_attr => one hot encoding, one column named 'model_attr_<feat_name>' per feature
model_attr_ohe_values, model_attr_ohe_feat_names = one_hot_encode(
    df_electronics_test[['model_attr']].to_numpy(),
    ohe_pretrained=model_attr_ohe,
)
for feat_name, feat_values in zip(model_attr_ohe_feat_names, model_attr_ohe_values.T):
    df_electronics_test_numonly['model_attr_' + str(feat_name)] = feat_values

# category => one hot encoding, one column named 'category_<feat_name>' per feature
category_ohe_values, category_ohe_feat_names = one_hot_encode(
    df_electronics_test[['category']].to_numpy(),
    ohe_pretrained=category_ohe,
)
for feat_name, feat_values in zip(category_ohe_feat_names, category_ohe_values.T):
    df_electronics_test_numonly['category_' + str(feat_name)] = feat_values

# year => scale with the scaler held in year_scaler
year_scaled = standard_scale(
    df_electronics_test[['year']].to_numpy(),
    scaler_pretrained=year_scaler,
)
df_electronics_test_numonly['year'] = year_scaled

# remaining numeric fields => copy them unchanged
for numeric_col in ['split', 'user_id_count', 'item_id_count']:
    df_electronics_test_numonly[numeric_col] = df_electronics_test[numeric_col]



In [56]:
# Overview of the encoded electronics training frame: columns, shape, 5-row preview.
print("list of columns in df_electronics_train_numonly", df_electronics_train_numonly.columns, sep="\n")
print("number of rows in df_electronics_train_numonly", len(df_electronics_train_numonly))
print("number of columns in df_electronics_train_numonly", len(df_electronics_train_numonly.columns))
print("top 5 rows")
df_electronics_train_numonly.head(5)

list of columns in df_electronics_train_numonly
Index(['item_id', 'user_id', 'rating', 'model_attr_x0_Female',
       'model_attr_x0_Female&Male', 'model_attr_x0_Male',
       'category_x0_Accessories & Supplies', 'category_x0_Camera & Photo',
       'category_x0_Car Electronics & GPS',
       'category_x0_Computers & Accessories', 'category_x0_Headphones',
       'category_x0_Home Audio', 'category_x0_Portable Audio & Video',
       'category_x0_Security & Surveillance', 'category_x0_Television & Video',
       'category_x0_Wearable Technology', 'year', 'split', 'user_id_count',
       'item_id_count'],
      dtype='object')
number of rows in df_electronics_train_numonly 1251918
number of columns in df_electronics_train_numonly 20
top 5 rows


Unnamed: 0,item_id,user_id,rating,model_attr_x0_Female,model_attr_x0_Female&Male,model_attr_x0_Male,category_x0_Accessories & Supplies,category_x0_Camera & Photo,category_x0_Car Electronics & GPS,category_x0_Computers & Accessories,category_x0_Headphones,category_x0_Home Audio,category_x0_Portable Audio & Video,category_x0_Security & Surveillance,category_x0_Television & Video,category_x0_Wearable Technology,year,split,user_id_count,item_id_count
0,0,0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-5.247198,0,1,118
1,0,1,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-5.247198,0,1,118
2,0,2,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-5.247198,0,1,118
3,0,3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-5.247198,0,1,118
4,0,4,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-5.247198,0,1,118


In [57]:
# Overview of the encoded electronics test frame: columns, shape, 5-row preview.
print("list of columns in df_electronics_test_numonly", df_electronics_test_numonly.columns, sep="\n")
print("number of rows in df_electronics_test_numonly", len(df_electronics_test_numonly))
print("number of columns in df_electronics_test_numonly", len(df_electronics_test_numonly.columns))
print("top 5 rows")
df_electronics_test_numonly.head(5)

list of columns in df_electronics_test_numonly
Index(['item_id', 'user_id', 'rating', 'model_attr_x0_Female',
       'model_attr_x0_Female&Male', 'model_attr_x0_Male',
       'category_x0_Accessories & Supplies', 'category_x0_Camera & Photo',
       'category_x0_Car Electronics & GPS',
       'category_x0_Computers & Accessories', 'category_x0_Headphones',
       'category_x0_Home Audio', 'category_x0_Portable Audio & Video',
       'category_x0_Security & Surveillance', 'category_x0_Television & Video',
       'category_x0_Wearable Technology', 'year', 'split', 'user_id_count',
       'item_id_count'],
      dtype='object')
number of rows in df_electronics_test_numonly 41036
number of columns in df_electronics_test_numonly 20
top 5 rows


Unnamed: 0,item_id,user_id,rating,model_attr_x0_Female,model_attr_x0_Female&Male,model_attr_x0_Male,category_x0_Accessories & Supplies,category_x0_Camera & Photo,category_x0_Car Electronics & GPS,category_x0_Computers & Accessories,category_x0_Headphones,category_x0_Home Audio,category_x0_Portable Audio & Video,category_x0_Security & Surveillance,category_x0_Television & Video,category_x0_Wearable Technology,year,split,user_id_count,item_id_count
647254,4964,412146,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.412227,2,2,155
647311,7256,181973,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.789522,1,4,3688
647354,2340,460569,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.342363,2,2,5790
647375,7256,332161,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.789522,2,2,3688
647352,6759,491112,5.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789522,2,2,486


### Modcloth dataset

In [58]:
# modcloth.csv - training dataset
# Build an all-numeric version of the training frame: user_id strings are ordinal
# encoded, categorical fields are one hot encoded and 'year' is standardized.
# The fitted encoders and the scaler are kept in user_id_ordenc, model_attr_ohe,
# category_ohe, year_scaler, size_ohe, fit_ohe and user_attr_ohe.
df_modcloth_train_numonly = pd.DataFrame()

# item_id is already numeric => copy it unchanged
df_modcloth_train_numonly['item_id'] = df_modcloth_train['item_id']

# user_id => convert from strings to ordinal encoding
user_id_ordenc, user_id_ordenc_values = ordinal_encode(
    df_modcloth_train[['user_id']].to_numpy()
)
df_modcloth_train_numonly['user_id'] = user_id_ordenc_values

# rating is already numeric => copy it unchanged
# (timestamp is not carried over: not needed after ordering and split)
df_modcloth_train_numonly['rating'] = df_modcloth_train['rating']

# model_attr => one hot encoding, one column named 'model_attr_<feat_name>' per feature
model_attr_ohe, model_attr_ohe_values, model_attr_ohe_feat_names = one_hot_encode(
    df_modcloth_train[['model_attr']].to_numpy()
)
for feat_name, feat_values in zip(model_attr_ohe_feat_names, model_attr_ohe_values.T):
    df_modcloth_train_numonly['model_attr_' + str(feat_name)] = feat_values

# category => one hot encoding, one column named 'category_<feat_name>' per feature
category_ohe, category_ohe_values, category_ohe_feat_names = one_hot_encode(
    df_modcloth_train[['category']].to_numpy()
)
for feat_name, feat_values in zip(category_ohe_feat_names, category_ohe_values.T):
    df_modcloth_train_numonly['category_' + str(feat_name)] = feat_values

# year => standardize to have 0 mean and variance as 1
year_scaler, year_scaled = standard_scale(df_modcloth_train[['year']].to_numpy())
df_modcloth_train_numonly['year'] = year_scaled
df_modcloth_train_numonly['split'] = df_modcloth_train['split']

# size => one hot encoding, one column named 'size_<feat_name>' per feature
size_ohe, size_ohe_values, size_ohe_feat_names = one_hot_encode(
    df_modcloth_train[['size']].to_numpy()
)
for feat_name, feat_values in zip(size_ohe_feat_names, size_ohe_values.T):
    df_modcloth_train_numonly['size_' + str(feat_name)] = feat_values

# fit => one hot encoding, one column named 'fit_<feat_name>' per feature
fit_ohe, fit_ohe_values, fit_ohe_feat_names = one_hot_encode(
    df_modcloth_train[['fit']].to_numpy()
)
for feat_name, feat_values in zip(fit_ohe_feat_names, fit_ohe_values.T):
    df_modcloth_train_numonly['fit_' + str(feat_name)] = feat_values

# user_attr => one hot encoding, one column named 'user_attr_<feat_name>' per feature
user_attr_ohe, user_attr_ohe_values, user_attr_ohe_feat_names = one_hot_encode(
    df_modcloth_train[['user_attr']].to_numpy()
)
for feat_name, feat_values in zip(user_attr_ohe_feat_names, user_attr_ohe_values.T):
    df_modcloth_train_numonly['user_attr_' + str(feat_name)] = feat_values

# id occurrence counts are already numeric => copy them unchanged
for numeric_col in ['user_id_count', 'item_id_count']:
    df_modcloth_train_numonly[numeric_col] = df_modcloth_train[numeric_col]



In [62]:
# modcloth.csv - testing dataset
# Build an all-numeric version of the test frame using the already fitted
# user_id_ordenc, model_attr_ohe, category_ohe, year_scaler, size_ohe, fit_ohe
# and user_attr_ohe (transform only, no refitting).
df_modcloth_test_numonly = pd.DataFrame()

# item_id is already numeric => copy it unchanged
df_modcloth_test_numonly['item_id'] = df_modcloth_test['item_id']

# user_id => convert from strings to ordinal encoding
user_id_ordenc_values = ordinal_encode(
    df_modcloth_test[['user_id']].to_numpy(),
    ordenc_pretrained=user_id_ordenc,
)
df_modcloth_test_numonly['user_id'] = user_id_ordenc_values

# rating is already numeric => copy it unchanged
# (timestamp is not carried over: not needed after ordering and split)
df_modcloth_test_numonly['rating'] = df_modcloth_test['rating']

# model_attr => one hot encoding, one column named 'model_attr_<feat_name>' per feature
model_attr_ohe_values, model_attr_ohe_feat_names = one_hot_encode(
    df_modcloth_test[['model_attr']].to_numpy(),
    ohe_pretrained=model_attr_ohe,
)
for feat_name, feat_values in zip(model_attr_ohe_feat_names, model_attr_ohe_values.T):
    df_modcloth_test_numonly['model_attr_' + str(feat_name)] = feat_values

# category => one hot encoding, one column named 'category_<feat_name>' per feature
category_ohe_values, category_ohe_feat_names = one_hot_encode(
    df_modcloth_test[['category']].to_numpy(),
    ohe_pretrained=category_ohe,
)
for feat_name, feat_values in zip(category_ohe_feat_names, category_ohe_values.T):
    df_modcloth_test_numonly['category_' + str(feat_name)] = feat_values

# year => scale with the scaler held in year_scaler
year_scaled = standard_scale(
    df_modcloth_test[['year']].to_numpy(),
    scaler_pretrained=year_scaler,
)
df_modcloth_test_numonly['year'] = year_scaled
df_modcloth_test_numonly['split'] = df_modcloth_test['split']

# size => one hot encoding, one column named 'size_<feat_name>' per feature
size_ohe_values, size_ohe_feat_names = one_hot_encode(
    df_modcloth_test[['size']].to_numpy(),
    ohe_pretrained=size_ohe,
)
for feat_name, feat_values in zip(size_ohe_feat_names, size_ohe_values.T):
    df_modcloth_test_numonly['size_' + str(feat_name)] = feat_values

# fit => one hot encoding, one column named 'fit_<feat_name>' per feature
fit_ohe_values, fit_ohe_feat_names = one_hot_encode(
    df_modcloth_test[['fit']].to_numpy(),
    ohe_pretrained=fit_ohe,
)
for feat_name, feat_values in zip(fit_ohe_feat_names, fit_ohe_values.T):
    df_modcloth_test_numonly['fit_' + str(feat_name)] = feat_values

# user_attr => one hot encoding, one column named 'user_attr_<feat_name>' per feature
user_attr_ohe_values, user_attr_ohe_feat_names = one_hot_encode(
    df_modcloth_test[['user_attr']].to_numpy(),
    ohe_pretrained=user_attr_ohe,
)
for feat_name, feat_values in zip(user_attr_ohe_feat_names, user_attr_ohe_values.T):
    df_modcloth_test_numonly['user_attr_' + str(feat_name)] = feat_values

# id occurrence counts are already numeric => copy them unchanged
for numeric_col in ['user_id_count', 'item_id_count']:
    df_modcloth_test_numonly[numeric_col] = df_modcloth_test[numeric_col]



In [63]:
# Overview of the encoded modcloth training frame: columns, shape, 5-row preview.
print("list of columns in df_modcloth_train_numonly", df_modcloth_train_numonly.columns, sep="\n")
print("number of rows in df_modcloth_train_numonly", len(df_modcloth_train_numonly))
print("number of columns in df_modcloth_train_numonly", len(df_modcloth_train_numonly.columns))
print("top 5 rows")
df_modcloth_train_numonly.head(5)

list of columns in df_modcloth_train_numonly
Index(['item_id', 'user_id', 'rating', 'model_attr_x0_Small',
       'model_attr_x0_Small&Large', 'category_x0_Bottoms',
       'category_x0_Dresses', 'category_x0_Outerwear', 'category_x0_Tops',
       'year', 'split', 'size_x0_0.0', 'size_x0_1.0', 'size_x0_2.0',
       'size_x0_3.0', 'size_x0_4.0', 'size_x0_5.0', 'size_x0_6.0',
       'size_x0_7.0', 'size_x0_8.0', 'size_x0_Unknown Size',
       'fit_x0_Just right', 'fit_x0_Slightly large', 'fit_x0_Slightly small',
       'fit_x0_Unknown Fit', 'fit_x0_Very large', 'fit_x0_Very small',
       'user_attr_x0_Large', 'user_attr_x0_Small',
       'user_attr_x0_Unknown User Attribute', 'user_id_count',
       'item_id_count'],
      dtype='object')
number of rows in df_modcloth_train_numonly 92133
number of columns in df_modcloth_train_numonly 32
top 5 rows


Unnamed: 0,item_id,user_id,rating,model_attr_x0_Small,model_attr_x0_Small&Large,category_x0_Bottoms,category_x0_Dresses,category_x0_Outerwear,category_x0_Tops,year,...,fit_x0_Slightly large,fit_x0_Slightly small,fit_x0_Unknown Fit,fit_x0_Very large,fit_x0_Very small,user_attr_x0_Large,user_attr_x0_Small,user_attr_x0_Unknown User Attribute,user_id_count,item_id_count
0,7443,309,4,1.0,0.0,0.0,1.0,0.0,0.0,-1.727919,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,66,1011
1,7443,13009,3,1.0,0.0,0.0,1.0,0.0,0.0,-1.727919,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1,1011
2,7443,5534,4,1.0,0.0,0.0,1.0,0.0,0.0,-1.727919,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,30,1011
3,7443,1716,4,1.0,0.0,0.0,1.0,0.0,0.0,-1.727919,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1,1011
4,7443,42071,4,1.0,0.0,0.0,1.0,0.0,0.0,-1.727919,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,12,1011


In [64]:
# Overview of the encoded modcloth test frame: columns, shape, 5-row preview.
print("list of columns in df_modcloth_test_numonly", df_modcloth_test_numonly.columns, sep="\n")
print("number of rows in df_modcloth_test_numonly", len(df_modcloth_test_numonly))
print("number of columns in df_modcloth_test_numonly", len(df_modcloth_test_numonly.columns))
print("top 5 rows")
df_modcloth_test_numonly.head(5)

list of columns in df_modcloth_test_numonly
Index(['item_id', 'user_id', 'rating', 'model_attr_x0_Small',
       'model_attr_x0_Small&Large', 'category_x0_Bottoms',
       'category_x0_Dresses', 'category_x0_Outerwear', 'category_x0_Tops',
       'year', 'split', 'size_x0_0.0', 'size_x0_1.0', 'size_x0_2.0',
       'size_x0_3.0', 'size_x0_4.0', 'size_x0_5.0', 'size_x0_6.0',
       'size_x0_7.0', 'size_x0_8.0', 'size_x0_Unknown Size',
       'fit_x0_Just right', 'fit_x0_Slightly large', 'fit_x0_Slightly small',
       'fit_x0_Unknown Fit', 'fit_x0_Very large', 'fit_x0_Very small',
       'user_attr_x0_Large', 'user_attr_x0_Small',
       'user_attr_x0_Unknown User Attribute', 'user_id_count',
       'item_id_count'],
      dtype='object')
number of rows in df_modcloth_test_numonly 7759
number of columns in df_modcloth_test_numonly 32
top 5 rows


Unnamed: 0,item_id,user_id,rating,model_attr_x0_Small,model_attr_x0_Small&Large,category_x0_Bottoms,category_x0_Dresses,category_x0_Outerwear,category_x0_Tops,year,...,fit_x0_Slightly large,fit_x0_Slightly small,fit_x0_Unknown Fit,fit_x0_Very large,fit_x0_Very small,user_attr_x0_Large,user_attr_x0_Small,user_attr_x0_Unknown User Attribute,user_id_count,item_id_count
87423,80427,37440,3,0.0,1.0,0.0,0.0,0.0,1.0,-0.191453,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,8,539
87410,153726,411,5,0.0,1.0,0.0,0.0,0.0,1.0,0.832858,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,204,98
87408,76049,38327,4,0.0,1.0,1.0,0.0,0.0,0.0,0.832858,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,59,573
87396,135555,1139,5,0.0,1.0,0.0,0.0,1.0,0.0,0.320702,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,11,117
87398,69630,37810,1,1.0,0.0,1.0,0.0,0.0,0.0,-0.191453,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,7,256


## Save the data to disk

In [65]:
# Write the four encoded frames to CSV files; the row index is not written.
frames_to_save = [
    ('electronics_train.csv', df_electronics_train_numonly),
    ('modcloth_train.csv', df_modcloth_train_numonly),
    ('electronics_test.csv', df_electronics_test_numonly),
    ('modcloth_test.csv', df_modcloth_test_numonly),
]
for csv_name, encoded_frame in frames_to_save:
    encoded_frame.to_csv(csv_name, index=False)