In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the product-review CSV and discard the 'index' and 'price' columns.
products_csv = '/content/products.csv'
df = pd.read_csv(products_csv).drop(columns=['index', 'price'])
df.head()

Unnamed: 0,product,user_id,age,date,review,ratings,repurchase,packaging
0,Blush - Orgasm,Leahmoriah4,44-55,a month ago,"I'm 48, Caucasian with olive undertones so tru...",5.0,65%,4.1
1,Blush - Orgasm,Avalonian1172,30-35,2 months ago,I have to mention that the case is fabulous. A...,4.0,65%,4.1
2,Blush - Orgasm,summerseabreeze,30-35,4 months ago,A coral-pink peach blush with a very fine gold...,5.0,65%,4.1
3,Blush - Orgasm,miransees,30-35,5 months ago,"I'm not sure if it's a ""Universal"" color. It's...",3.0,65%,4.1
4,Blush - Orgasm,outofherhead,56 & Over,6 months ago,The blush that works for everyone does not wor...,2.0,65%,4.1


Data Preprocessing

In [3]:
## drop reviews dated more than 13 years ago
# `date` holds relative-age labels such as '14 years ago'; show their distribution first.
print(df.date.value_counts())

# Labels to exclude: '14 years ago' through '20 years ago' inclusive.
drop_years = [f'{years} years ago' for years in range(14, 21)]

is_too_old = df['date'].isin(drop_years)
# Keep the recent rows, drop any row with a missing value, and renumber the index from 0.
reviews = df[~is_too_old].dropna().reset_index(drop=True)
reviews

11 years ago     1281
12 years ago     1248
10 years ago     1143
13 years ago      860
9 years ago       759
17 years ago      759
16 years ago      746
14 years ago      689
15 years ago      654
18 years ago      575
8 years ago       537
7 years ago       368
6 years ago       238
5 years ago       134
19 years ago      121
4 years ago        98
3 years ago        52
2 years ago        33
20 years ago       20
a year ago         17
6 months ago        4
10 months ago       3
9 months ago        2
7 months ago        2
a month ago         2
2 months ago        1
5 months ago        1
4 months ago        1
Name: date, dtype: int64


Unnamed: 0,product,user_id,age,date,review,ratings,repurchase,packaging
0,Blush - Orgasm,Leahmoriah4,44-55,a month ago,"I'm 48, Caucasian with olive undertones so tru...",5.0,65%,4.1
1,Blush - Orgasm,Avalonian1172,30-35,2 months ago,I have to mention that the case is fabulous. A...,4.0,65%,4.1
2,Blush - Orgasm,summerseabreeze,30-35,4 months ago,A coral-pink peach blush with a very fine gold...,5.0,65%,4.1
3,Blush - Orgasm,miransees,30-35,5 months ago,"I'm not sure if it's a ""Universal"" color. It's...",3.0,65%,4.1
4,Blush - Orgasm,outofherhead,56 & Over,6 months ago,The blush that works for everyone does not wor...,2.0,65%,4.1
...,...,...,...,...,...,...,...,...
6706,Blush - Torrid,Steph0891,25-29,13 years ago,wow this blush is GORGEOUS! i like this alot m...,5.0,86%,4.2
6707,Blush - Torrid,MissPurple,19-24,13 years ago,I think I even like Torrid more than my belove...,5.0,86%,4.2
6708,Blush - Torrid,CraxFactor,25-29,13 years ago,All too often I've flipped over color and igno...,1.0,86%,4.2
6709,Blush - Torrid,gohgoomah,19-24,13 years ago,"this is a beautiful coral shimmer blush, but f...",4.0,86%,4.2


In [4]:
# Target label: 1 (would repurchase) for ratings of 4 or 5, otherwise 0.
# This overwrites the existing 'repurchase' column; 'ratings' is removed afterwards.
is_low_rating = reviews['ratings'] < 4
reviews['repurchase'] = (~is_low_rating).astype('int64')
reviews = reviews.drop(columns=['ratings'])
reviews.head()

Unnamed: 0,product,user_id,age,date,review,repurchase,packaging
0,Blush - Orgasm,Leahmoriah4,44-55,a month ago,"I'm 48, Caucasian with olive undertones so tru...",1,4.1
1,Blush - Orgasm,Avalonian1172,30-35,2 months ago,I have to mention that the case is fabulous. A...,1,4.1
2,Blush - Orgasm,summerseabreeze,30-35,4 months ago,A coral-pink peach blush with a very fine gold...,1,4.1
3,Blush - Orgasm,miransees,30-35,5 months ago,"I'm not sure if it's a ""Universal"" color. It's...",0,4.1
4,Blush - Orgasm,outofherhead,56 & Over,6 months ago,The blush that works for everyone does not wor...,0,4.1


In [5]:
# TODO: decide whether the class imbalance in the target needs to be accounted for.
reviews['repurchase'].value_counts()

1    5078
0    1633
Name: repurchase, dtype: int64

Text Preprocessing

In [6]:
import nltk

# Fetch the NLTK resources in order; the cell displays the result of the last download.
NLTK_PACKAGES = ('stopwords', 'wordnet')
download_results = [nltk.download(package) for package in NLTK_PACKAGES]
download_results[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# mentioned in proposal "by converting them to lowercase, removing any links, stopwords, symbols or digits, followed by stemming or lemmatizing"
# common words removal? spelling correction? contraction expansion? check if legit word?
import re

from nltk.corpus import stopwords
from textblob import Word

stop = stopwords.words('english')
# Frozen copy for O(1) membership tests; `stop` itself stays a plain list.
_stop_lookup = frozenset(stop)


def _clean_review(text):
    """Turn one raw review string into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase -> strip punctuation/symbols -> strip digits ->
    strip links -> drop English stopwords -> lemmatize each remaining token.

    Every pattern goes through ``re.sub`` with a raw-string pattern, so the
    result does not depend on the pandas version: ``Series.str.replace``
    treats its pattern as a literal string by default from pandas 2.0 on,
    which would leave punctuation in place without raising any error.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; an empty string if no token
        survives the filters.
    """
    ## convert to lowercase
    text = text.lower()

    ## removing punctuations (anything that is neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    ## removing digits
    text = re.sub(r'\d+', '', text)

    ## remove website links
    # Punctuation is already gone at this point, so a URL has collapsed into a
    # single 'http...' token, which is removed whole.
    text = re.sub(r'http\S+', '', text)

    ## removing stopwords (split() also discards leading/trailing/repeated whitespace)
    # NOTE(review): apostrophes are stripped before this filter, so contractions
    # survive as tokens such as 'im' / 'dont' (visible in the displayed output).
    # Confirm whether that is intended or whether contractions should be expanded first.
    tokens = [token for token in text.split() if token not in _stop_lookup]

    ## lemmatization
    return " ".join(Word(token).lemmatize() for token in tokens)


reviews['review_processed'] = reviews['review'].apply(_clean_review)

## top 10 common words - remove??
## pd.Series(' '.join(reviews['review_processed']).split()).value_counts()[:10]

## stemming (alternative to lemmatization, currently disabled)
# from nltk.stem import PorterStemmer
# st = PorterStemmer()
# reviews['review_processed'] = reviews['review_processed'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

reviews.head()

Unnamed: 0,product,user_id,age,date,review,repurchase,packaging,review_processed
0,Blush - Orgasm,Leahmoriah4,44-55,a month ago,"I'm 48, Caucasian with olive undertones so tru...",1,4.1,im caucasian olive undertone true pink mauve l...
1,Blush - Orgasm,Avalonian1172,30-35,2 months ago,I have to mention that the case is fabulous. A...,1,4.1,mention case fabulous blush great case break t...
2,Blush - Orgasm,summerseabreeze,30-35,4 months ago,A coral-pink peach blush with a very fine gold...,1,4.1,coralpink peach blush fine gold shimmer medium...
3,Blush - Orgasm,miransees,30-35,5 months ago,"I'm not sure if it's a ""Universal"" color. It's...",0,4.1,im sure universal color probably light make sk...
4,Blush - Orgasm,outofherhead,56 & Over,6 months ago,The blush that works for everyone does not wor...,0,4.1,blush work everyone work got mini size dont kn...


In [11]:
# Raise pandas' column-width display limit to 500 characters (global option).
pd.set_option('display.max_colwidth', 500)
# pd.reset_option('display.max_colwidth')

# Show the fully processed text of the first review (index label 0).
reviews.loc[0, 'review_processed']

'im caucasian olive undertone true pink mauve look terrible coral perfect love blush love shimmer cheek bone temple use'

Text Vectorization
- Bag-of-Words
- TF-IDF
- Word embeddings