# Null Values Imputation

## Load dataset

In [1]:
import pandas as pd
# Load the two raw datasets from CSV files (paths are relative to the
# current working directory).
df_electronics = pd.read_csv('electronics.csv')
df_modcloth = pd.read_csv('modcloth.csv')

## Impute the data

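For the prototype, we drop the features that have a high percentage of null values. For the remaining features, we replace the null values with a placeholder category, such as "Unknown Size" for the `size` feature. Rows that still contain a null value afterwards (in practice, only those with a missing `user_id`) are dropped.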

In [4]:
def null_percentage(df):
    """Print the percentage of null values in every column of ``df``.

    One line is printed per column, in column order. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    """
    n_rows = len(df.index)
    for column in df:
        # Count the missing entries of this column, then express the count
        # as a share of all rows.
        n_missing = df[column].isna().sum()
        pct_missing = (n_missing / n_rows) * 100
        print("Column : ", column, " , Null values percentage : ", pct_missing)

### Electronics dataset

In [5]:
# Print the per-column null percentages of the raw electronics data.
null_percentage(df_electronics)

Column :  item_id  , Null values percentage :  0.0
Column :  user_id  , Null values percentage :  0.0
Column :  rating  , Null values percentage :  0.0
Column :  timestamp  , Null values percentage :  0.0
Column :  model_attr  , Null values percentage :  0.0
Column :  category  , Null values percentage :  0.0
Column :  brand  , Null values percentage :  74.39042688293628
Column :  year  , Null values percentage :  0.0
Column :  user_attr  , Null values percentage :  86.53285422373882
Column :  split  , Null values percentage :  0.0


In [2]:
# electronics.csv
# Keep only the columns that have no null values. `brand` (~74% null) and
# `user_attr` (~87% null) are dropped for now; a possible later alternative is
# filling them with "Unknown Brand" / "Unknown Gender".
df_electronics_cleaned = df_electronics[
    ['item_id', 'user_id', 'rating', 'timestamp',
     'model_attr', 'category', 'year', 'split']
].copy()

In [5]:
# Re-check the cleaned electronics frame: every kept column should report 0% nulls.
null_percentage(df_electronics_cleaned)

Column :  item_id  , Null values percentage :  0.0
Column :  user_id  , Null values percentage :  0.0
Column :  rating  , Null values percentage :  0.0
Column :  timestamp  , Null values percentage :  0.0
Column :  model_attr  , Null values percentage :  0.0
Column :  category  , Null values percentage :  0.0
Column :  year  , Null values percentage :  0.0
Column :  split  , Null values percentage :  0.0


### Clothing dataset

In [9]:
# Print the per-column null percentages of the raw ModCloth data.
null_percentage(df_modcloth)

Column :  item_id  , Null values percentage :  0.0
Column :  user_id  , Null values percentage :  0.0010010711461263552
Column :  rating  , Null values percentage :  0.0
Column :  timestamp  , Null values percentage :  0.0
Column :  size  , Null values percentage :  21.78330813970949
Column :  fit  , Null values percentage :  18.52582263021433
Column :  user_attr  , Null values percentage :  8.375962279639214
Column :  model_attr  , Null values percentage :  0.0
Column :  category  , Null values percentage :  0.0
Column :  brand  , Null values percentage :  74.05924339042775
Column :  year  , Null values percentage :  0.0
Column :  split  , Null values percentage :  0.0


In [6]:
# modcloth.csv
# Start from the columns that need no imputation. `user_id` is kept here too;
# its few null rows are removed by the final dropna(). `brand` (~74% null) is
# dropped for now; a possible later alternative is filling it with
# "Unknown Brand".
df_modcloth_cleaned = (
    df_modcloth[
        ['item_id', 'user_id', 'rating', 'timestamp',
         'model_attr', 'category', 'year', 'split']
    ]
    # size / fit / user_attr: replace null values with a new placeholder value
    .assign(
        size=df_modcloth['size'].fillna("Unknown Size"),
        fit=df_modcloth['fit'].fillna("Unknown Fit"),
        user_attr=df_modcloth['user_attr'].fillna("Unknown User Attribute"),
    )
    # Drop every row that still contains a null in any column. After the fills
    # above, the only column with nulls left is `user_id`.
    .dropna()
)

In [7]:
# Re-check the cleaned ModCloth frame: every kept column should report 0% nulls.
null_percentage(df_modcloth_cleaned)

Column :  item_id  , Null values percentage :  0.0
Column :  user_id  , Null values percentage :  0.0
Column :  rating  , Null values percentage :  0.0
Column :  timestamp  , Null values percentage :  0.0
Column :  model_attr  , Null values percentage :  0.0
Column :  category  , Null values percentage :  0.0
Column :  year  , Null values percentage :  0.0
Column :  split  , Null values percentage :  0.0
Column :  size  , Null values percentage :  0.0
Column :  fit  , Null values percentage :  0.0
Column :  user_attr  , Null values percentage :  0.0


## Save datasets

In [8]:
# Save the cleaned datasets to CSV files in the current working directory.
# index=False leaves the DataFrame index out of the written files.
df_electronics_cleaned.to_csv('electronics_nonull.csv',index=False)
df_modcloth_cleaned.to_csv('modcloth_nonull.csv',index=False)