In [21]:
import pandas as pd
import numpy as np

In [None]:
def _lower_if_str(value):
    """Return ``value`` lower-cased if it is a ``str``; otherwise return it unchanged."""
    return value.lower() if isinstance(value, str) else value


# The first CSV column is used as the row index.
data = pd.read_csv('WCE-CR.csv', index_col=0)

# Normalise the headers: lower-case, spaces replaced by underscores.
data.columns = data.columns.str.lower().str.replace(' ', '_')

# Lower-case every string cell; non-string cells (numbers, NaN) pass through.
# Column-wise Series.map replaces DataFrame.applymap, which is deprecated
# since pandas 2.1, and works on both older and newer pandas releases.
data = data.apply(lambda column: column.map(_lower_if_str))

In [None]:
# Peek at the first three rows to sanity-check the load and the renamed columns.
data.head(n=3)

Unnamed: 0,clothing_id,age,title,review_text,rating,recommended_ind,positive_feedback_count,division_name,department_name,class_name
0,767,33,,absolutely wonderful - silky and sexy and comf...,4,1,0,initmates,intimate,intimates
1,1080,34,,love this dress! it's sooo pretty. i happene...,5,1,4,general,dresses,dresses
2,1077,60,some major design flaws,i had such high hopes for this dress and reall...,3,0,0,general,dresses,dresses


In [None]:
# (rows, columns) of the loaded frame, before any rows or columns are dropped.
data.shape

(23486, 10)

In [None]:
# Count the missing values in each column.
data.isna().sum()

clothing_id                   0
age                           0
title                      3810
review_text                 845
rating                        0
recommended_ind               0
positive_feedback_count       0
division_name                14
department_name              14
class_name                   14
dtype: int64

In [None]:
# The model uses the review text, so the title column is dropped, and then
# every row that still contains a null is removed.
# The chain is not in-place and uses errors='ignore', so re-running this cell
# is a no-op instead of raising KeyError on the already-removed 'title' column.
data = data.drop(columns='title', errors='ignore').dropna()
print(data.shape)
data.isnull().sum()

(22628, 9)


clothing_id                0
age                        0
review_text                0
rating                     0
recommended_ind            0
positive_feedback_count    0
division_name              0
department_name            0
class_name                 0
dtype: int64

## Setting up the train / validation / test split

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the cleaned data as the test set; the remaining 80% is the
# "full train" portion that is split again into train and validation.
# NOTE(review): the split is not stratified on recommended_ind - consider
# whether stratify= is wanted given the class imbalance.
df_full_train, df_test = train_test_split(
    data,
    test_size=0.20,
    random_state=1,
)

In [25]:
# Carve the validation set out of df_full_train. It should be 20% of the whole
# dataset, and df_full_train holds 80% of it, so the fraction to request here
# is 20% / 80% = 0.25.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [27]:
# Row counts of the train, validation and test partitions, in that order.
tuple(len(part) for part in (df_train, df_val, df_test))

(13576, 4526, 4526)

In [28]:
# The split shuffles the rows, so each partition keeps scattered original
# index labels; replace them with a clean 0..n-1 range.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [29]:
# Extract the target variable (recommended_ind) of each partition as a NumPy array.
y_train = df_train['recommended_ind'].to_numpy()
y_val = df_val['recommended_ind'].to_numpy()
y_test = df_test['recommended_ind'].to_numpy()

In [30]:
# Remove the target column from every feature frame so it is not used as an input.
# drop(..., errors='ignore') returns a new frame and does nothing when the
# column is already gone, so this cell can be re-run safely; the previous
# `del frame[col]` form raised KeyError on a second run.
df_train = df_train.drop(columns='recommended_ind', errors='ignore')
df_val = df_val.drop(columns='recommended_ind', errors='ignore')
df_test = df_test.drop(columns='recommended_ind', errors='ignore')

## EDA

In [31]:
# Give the full-train frame a clean 0..n-1 index before exploring it.
df_full_train = df_full_train.reset_index(drop=True)

In [32]:
# Share of each class of the target in the full-train data.
df_full_train['recommended_ind'].value_counts(normalize=True)

1    0.819909
0    0.180091
Name: recommended_ind, dtype: float64

In [None]:
## The target is imbalanced (about 82% recommended vs. 18% not recommended), so we rebalance the training data by oversampling
from imblearn.over_sampling import RandomOverSampler

In [36]:
# Oversample the training partition only; validation and test are left untouched.
# random_state is fixed so the resampling is reproducible.
ros = RandomOverSampler(random_state=42)
# Returns the resampled feature frame and the matching resampled target array.
X_train_resampled, y_train_resampled = ros.fit_resample(df_train, y_train)

In [40]:
# Check the class balance after oversampling: distinct labels and how often each occurs.
class_labels, class_counts = np.unique(y_train_resampled, return_counts=True)
class_labels, class_counts

(array([0, 1]), array([11119, 11119]))

In [46]:
# Show every row whose review text occurs more than once (keep=False flags all
# copies, not just the repeats), ordered by clothing_id.
is_repeated_review = X_train_resampled['review_text'].duplicated(keep=False)
X_train_resampled.loc[is_repeated_review].sort_values(by='clothing_id')

Unnamed: 0,clothing_id,age,review_text,rating,positive_feedback_count,division_name,department_name,class_name
13584,8,38,"this suit was high quality, and a cute design....",1,0,initmates,intimate,swim
16823,8,38,"this suit was high quality, and a cute design....",1,0,initmates,intimate,swim
18767,8,70,"the suit is lovely, but it is very long in the...",2,0,initmates,intimate,swim
10562,8,70,"the suit is lovely, but it is very long in the...",2,0,initmates,intimate,swim
17292,8,38,"this suit was high quality, and a cute design....",1,0,initmates,intimate,swim
...,...,...,...,...,...,...,...,...
18010,1203,29,"great material, and i like the idea of a scoop...",4,0,initmates,intimate,layering
17099,1203,29,"great material, and i like the idea of a scoop...",4,0,initmates,intimate,layering
19077,1203,29,"great material, and i like the idea of a scoop...",4,0,initmates,intimate,layering
15758,1203,29,"great material, and i like the idea of a scoop...",4,0,initmates,intimate,layering


In [None]:
## next steps: identify the categorical and numerical variables
# feature importance
# one-hot encoding