In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
%matplotlib inline

In [2]:
#Required text pre-processing libraries are imported
import string
import nltk
import re

# download the stopwords and wordnet corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
# import tokenize from nltk library
from nltk import tokenize
# import WordNetLemmatizer from nltk library
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer

#Required data visualisation libraries are imported
import plotly.express as px
import seaborn as sns 
import matplotlib.pyplot as plt



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jillian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jillian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jillian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jillian/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
df = pd.read_csv('./data/Womens Clothing E-Commerce Reviews 2.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [5]:
#performing same initial data cleaning

In [6]:
df = df[df['Review Text'].isna()==False]

In [7]:
df.drop(columns='Unnamed: 0', inplace=True)

In [8]:
df[df.duplicated() == True]

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
21888,1022,37,,"Love, love these jeans. being short they come ...",5,1,0,General,Bottoms,Jeans


In [9]:
df.drop(21888, inplace=True)

In [10]:
df_text = pd.DataFrame(df['Review Text'])

In [11]:
df_text

Unnamed: 0,Review Text
0,Absolutely wonderful - silky and sexy and comf...
1,Love this dress! it's sooo pretty. i happene...
2,I had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl..."
4,This shirt is very flattering to all due to th...
...,...
23481,I was very happy to snag this dress at such a ...
23482,"It reminds me of maternity clothes. soft, stre..."
23483,"This fit well, but the top was very see throug..."
23484,I bought this dress for a wedding i have this ...


In [12]:
##adding in polarity and subjectivity of text and seeing how it evolves when text is cleaned

In [13]:
from textblob import TextBlob
df_text['polarity'] = df_text['Review Text'].apply(lambda x: TextBlob(x).polarity)
df_text['subjective'] = df_text['Review Text'].apply(lambda x: TextBlob(x).subjectivity)

In [14]:
df_text

Unnamed: 0,Review Text,polarity,subjective
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725000
2,I had such high hopes for this dress and reall...,0.073675,0.356294
3,"I love, love, love this jumpsuit. it's fun, fl...",0.550000,0.625000
4,This shirt is very flattering to all due to th...,0.512891,0.568750
...,...,...,...
23481,I was very happy to snag this dress at such a ...,0.552667,0.710000
23482,"It reminds me of maternity clothes. soft, stre...",0.091667,0.708333
23483,"This fit well, but the top was very see throug...",0.414286,0.596429
23484,I bought this dress for a wedding i have this ...,0.322222,0.577778


In [15]:
df_text['Clean'] = df_text["Review Text"].astype(str)

In [16]:
# Need to remove capitalization, apostrophes, and numbers from the reviews

# Create a tokenizer that will take all words with three or more letters
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
tokenizer = RegexpTokenizer(pattern)

# Create list of stopwords in English (language of the reviews)
# Remove "very" from the list of stopwords 
stopwords_list = stopwords.words("english")
# Remove "very" from the list of stopwords 
stopwords_list.remove("very")

# Create an instance of nltk's WordNetLemmatizer with the variable name `lemmatizer`
lemmatizer = WordNetLemmatizer()

In [17]:
def preprocess_text(text, tokenizer, stopwords_list, lemmatizer):
    # Standardize case (lowercase the text)
    lowered_text = text.lower()
    
    # Tokenize text using `tokenizer`
    tokens = tokenizer.tokenize(lowered_text)
    
    # Remove stopwords using `stopwords_list and removing punctuation`
    stopped_tokens = [word for word in tokens if word not in stopwords_list or word not in string.punctuation]
    
    # Stem the tokenized text using `stemmer`
    lemmatized_text = [lemmatizer.lemmatize(token) for token in stopped_tokens]
    
    # Return the preprocessed text
    return lemmatized_text

In [18]:
# Apply the preprocess function to the entire review text
df_text['Clean'] = df_text['Review Text'].apply(lambda x: preprocess_text(x, tokenizer, stopwords_list, lemmatizer))


In [19]:
df_text.head()

Unnamed: 0,Review Text,polarity,subjective,Clean
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, and, sexy, and,..."
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, this, dress, it's, sooo, pretty, i, hap..."
2,I had such high hopes for this dress and reall...,0.073675,0.356294,"[i, had, such, high, hope, for, this, dress, a..."
3,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[i, love, love, love, this, jumpsuit, it's, fu..."
4,This shirt is very flattering to all due to th...,0.512891,0.56875,"[this, shirt, is, very, flattering, to, all, d..."


In [20]:
# create new column from df_clean that converts list of tokens to a string.
df_text['String']=df_text["Clean"].str.join(" ")

In [21]:
df_text['polarity_clean'] = df_text['String'].apply(lambda x: TextBlob(x).polarity)
df_text['subjective_clean'] = df_text['String'].apply(lambda x: TextBlob(x).subjectivity)

In [22]:
df_text.head()

Unnamed: 0,Review Text,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, and, sexy, and,...",absolutely wonderful silky and sexy and comfor...,0.633333,0.933333
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, this, dress, it's, sooo, pretty, i, hap...",love this dress it's sooo pretty i happened to...,0.31875,0.725
2,I had such high hopes for this dress and reall...,0.073675,0.356294,"[i, had, such, high, hope, for, this, dress, a...",i had such high hope for this dress and really...,0.076392,0.356294
3,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[i, love, love, love, this, jumpsuit, it's, fu...",i love love love this jumpsuit it's fun flirty...,0.5,0.625
4,This shirt is very flattering to all due to th...,0.512891,0.56875,"[this, shirt, is, very, flattering, to, all, d...",this shirt is very flattering to all due to th...,0.39375,0.56875


In [23]:
df_text.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22640 entries, 0 to 23485
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Review Text       22640 non-null  object 
 1   polarity          22640 non-null  float64
 2   subjective        22640 non-null  float64
 3   Clean             22640 non-null  object 
 4   String            22640 non-null  object 
 5   polarity_clean    22640 non-null  float64
 6   subjective_clean  22640 non-null  float64
dtypes: float64(4), object(3)
memory usage: 1.4+ MB


## Bigrams using CountVectorizer

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
c_vec = CountVectorizer(ngram_range=(2,2), max_features=2000)
# matrix of ngrams
ngrams = c_vec.fit_transform(df_text['String'])
# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# list of ngrams
vocab = c_vec.vocabulary_
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram'})


In [25]:
df_ngram

Unnamed: 0,frequency,bigram
0,7184,in the
1,6174,it is
2,5635,and the
3,4765,this dress
4,4342,on the
...,...,...
1995,80,out with
1996,80,of room
1997,80,is tt
1998,80,color for


In [26]:
df_ngram['polarity'] = df_ngram['bigram'].apply(lambda x: TextBlob(x).polarity)
df_ngram['subjective'] = df_ngram['bigram'].apply(lambda x: TextBlob(x).subjectivity)

In [27]:
df_ngram

Unnamed: 0,frequency,bigram,polarity,subjective
0,7184,in the,0.0,0.0
1,6174,it is,0.0,0.0
2,5635,and the,0.0,0.0
3,4765,this dress,0.0,0.0
4,4342,on the,0.0,0.0
...,...,...,...,...
1995,80,out with,0.0,0.0
1996,80,of room,0.0,0.0
1997,80,is tt,0.0,0.0
1998,80,color for,0.0,0.0


## Ngrams using TfdifVectorizer

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
t_vec = TfidfVectorizer(ngram_range=(2,2), max_features=2000)
# matrix of ngrams
ngrams = t_vec.fit_transform(df_text['String'])
# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# list of ngrams
vocab = t_vec.vocabulary_
df_ngram_tvec = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram'})


In [29]:
df_ngram_tvec

Unnamed: 0,frequency,bigram
0,625.499501,in the
1,598.985126,it is
2,539.402345,and the
3,522.127249,this dress
4,445.547138,on the
...,...,...
1995,18.117305,me than
1996,18.095996,reading the
1997,18.028343,saw the
1998,18.027948,said the


In [30]:
df_ngram_tvec['polarity'] = df_ngram_tvec['bigram'].apply(lambda x: TextBlob(x).polarity)
df_ngram_tvec['subjective'] = df_ngram_tvec['bigram'].apply(lambda x: TextBlob(x).subjectivity)

In [31]:
df_ngram_tvec

Unnamed: 0,frequency,bigram,polarity,subjective
0,625.499501,in the,0.0,0.0
1,598.985126,it is,0.0,0.0
2,539.402345,and the,0.0,0.0
3,522.127249,this dress,0.0,0.0
4,445.547138,on the,0.0,0.0
...,...,...,...,...
1995,18.117305,me than,0.0,0.0
1996,18.095996,reading the,0.0,0.0
1997,18.028343,saw the,0.0,0.0
1998,18.027948,said the,0.0,0.0


In [32]:
df_ngram_tvec.sort_values('polarity')[:20]

Unnamed: 0,frequency,bigram,polarity,subjective
1231,29.40408,very disappointed,-0.975,0.975
1819,21.279894,wa disappointed,-0.75,0.75
1365,26.816153,so disappointed,-0.75,0.75
902,36.626114,too bad,-0.7,0.666667
954,35.252222,very thin,-0.52,1.0
767,40.47719,for casual,-0.5,0.866667
1567,24.146754,but unfortunately,-0.5,1.0
1099,31.750565,unfortunately it,-0.5,1.0
1696,22.723396,difficult to,-0.5,1.0
1406,26.098153,unfortunately the,-0.5,1.0


In [33]:
#20 most "negative" sentiments

In [34]:
df_ngram_tvec.sort_values('polarity')[-25:]

Unnamed: 0,frequency,bigram,polarity,subjective
608,47.341781,beautiful dress,0.85,1.0
1169,30.38752,it beautiful,0.85,1.0
765,40.560074,beautiful color,0.85,1.0
732,42.010214,are beautiful,0.85,1.0
1779,21.788216,very good,0.91,0.78
897,36.754921,perfect the,1.0,1.0
633,46.00546,very happy,1.0,1.0
1353,26.997856,perfect with,1.0,1.0
70,159.086462,perfect for,1.0,1.0
555,50.936815,it perfect,1.0,1.0


In [35]:
#25 most "positive" sentiments to capture all sentiments with polarity of 1.0

In [36]:
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [37]:
df_text.head()

Unnamed: 0,Review Text,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, and, sexy, and,...",absolutely wonderful silky and sexy and comfor...,0.633333,0.933333
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, this, dress, it's, sooo, pretty, i, hap...",love this dress it's sooo pretty i happened to...,0.31875,0.725
2,I had such high hopes for this dress and reall...,0.073675,0.356294,"[i, had, such, high, hope, for, this, dress, a...",i had such high hope for this dress and really...,0.076392,0.356294
3,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[i, love, love, love, this, jumpsuit, it's, fu...",i love love love this jumpsuit it's fun flirty...,0.5,0.625
4,This shirt is very flattering to all due to th...,0.512891,0.56875,"[this, shirt, is, very, flattering, to, all, d...",this shirt is very flattering to all due to th...,0.39375,0.56875


In [38]:
different_cols = df.columns.difference(df_text.columns)

In [39]:
different_cols

Index(['Age', 'Class Name', 'Clothing ID', 'Department Name', 'Division Name',
       'Positive Feedback Count', 'Rating', 'Recommended IND', 'Title'],
      dtype='object')

In [40]:
df_text

Unnamed: 0,Review Text,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, and, sexy, and,...",absolutely wonderful silky and sexy and comfor...,0.633333,0.933333
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725000,"[love, this, dress, it's, sooo, pretty, i, hap...",love this dress it's sooo pretty i happened to...,0.318750,0.725000
2,I had such high hopes for this dress and reall...,0.073675,0.356294,"[i, had, such, high, hope, for, this, dress, a...",i had such high hope for this dress and really...,0.076392,0.356294
3,"I love, love, love this jumpsuit. it's fun, fl...",0.550000,0.625000,"[i, love, love, love, this, jumpsuit, it's, fu...",i love love love this jumpsuit it's fun flirty...,0.500000,0.625000
4,This shirt is very flattering to all due to th...,0.512891,0.568750,"[this, shirt, is, very, flattering, to, all, d...",this shirt is very flattering to all due to th...,0.393750,0.568750
...,...,...,...,...,...,...,...
23481,I was very happy to snag this dress at such a ...,0.552667,0.710000,"[i, wa, very, happy, to, snag, this, dress, at...",i wa very happy to snag this dress at such a g...,0.512667,0.710000
23482,"It reminds me of maternity clothes. soft, stre...",0.091667,0.708333,"[it, reminds, me, of, maternity, clothes, soft...",it reminds me of maternity clothes soft stretc...,0.091667,0.708333
23483,"This fit well, but the top was very see throug...",0.414286,0.596429,"[this, fit, well, but, the, top, wa, very, see...",this fit well but the top wa very see through ...,0.414286,0.596429
23484,I bought this dress for a wedding i have this ...,0.322222,0.577778,"[i, bought, this, dress, for, a, wedding, i, h...",i bought this dress for a wedding i have this ...,0.325000,0.560000


In [41]:
#creating df_new to prevent multiple of the same columns being combined in joined df (Review_Text)

In [42]:
df_new = df[different_cols]

In [43]:
df_new

Unnamed: 0,Age,Class Name,Clothing ID,Department Name,Division Name,Positive Feedback Count,Rating,Recommended IND,Title
0,33,Intimates,767,Intimate,Initmates,0,4,1,
1,34,Dresses,1080,Dresses,General,4,5,1,
2,60,Dresses,1077,Dresses,General,0,3,0,Some major design flaws
3,50,Pants,1049,Bottoms,General Petite,0,5,1,My favorite buy!
4,47,Blouses,847,Tops,General,6,5,1,Flattering shirt
...,...,...,...,...,...,...,...,...,...
23481,34,Dresses,1104,Dresses,General Petite,0,5,1,Great dress for many occasions
23482,48,Knits,862,Tops,General Petite,0,3,1,Wish it was made of cotton
23483,31,Dresses,1104,Dresses,General Petite,1,3,0,"Cute, but see through"
23484,28,Dresses,1084,Dresses,General,2,3,1,"Very cute dress, perfect for summer parties an..."


In [44]:
joined_df = pd.concat([df_new, df_text], axis=1, join='inner')
joined_df

Unnamed: 0,Age,Class Name,Clothing ID,Department Name,Division Name,Positive Feedback Count,Rating,Recommended IND,Title,Review Text,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,33,Intimates,767,Intimate,Initmates,0,4,1,,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, and, sexy, and,...",absolutely wonderful silky and sexy and comfor...,0.633333,0.933333
1,34,Dresses,1080,Dresses,General,4,5,1,,Love this dress! it's sooo pretty. i happene...,0.339583,0.725000,"[love, this, dress, it's, sooo, pretty, i, hap...",love this dress it's sooo pretty i happened to...,0.318750,0.725000
2,60,Dresses,1077,Dresses,General,0,3,0,Some major design flaws,I had such high hopes for this dress and reall...,0.073675,0.356294,"[i, had, such, high, hope, for, this, dress, a...",i had such high hope for this dress and really...,0.076392,0.356294
3,50,Pants,1049,Bottoms,General Petite,0,5,1,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",0.550000,0.625000,"[i, love, love, love, this, jumpsuit, it's, fu...",i love love love this jumpsuit it's fun flirty...,0.500000,0.625000
4,47,Blouses,847,Tops,General,6,5,1,Flattering shirt,This shirt is very flattering to all due to th...,0.512891,0.568750,"[this, shirt, is, very, flattering, to, all, d...",this shirt is very flattering to all due to th...,0.393750,0.568750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23481,34,Dresses,1104,Dresses,General Petite,0,5,1,Great dress for many occasions,I was very happy to snag this dress at such a ...,0.552667,0.710000,"[i, wa, very, happy, to, snag, this, dress, at...",i wa very happy to snag this dress at such a g...,0.512667,0.710000
23482,48,Knits,862,Tops,General Petite,0,3,1,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",0.091667,0.708333,"[it, reminds, me, of, maternity, clothes, soft...",it reminds me of maternity clothes soft stretc...,0.091667,0.708333
23483,31,Dresses,1104,Dresses,General Petite,1,3,0,"Cute, but see through","This fit well, but the top was very see throug...",0.414286,0.596429,"[this, fit, well, but, the, top, wa, very, see...",this fit well but the top wa very see through ...,0.414286,0.596429
23484,28,Dresses,1084,Dresses,General,2,3,1,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,0.322222,0.577778,"[i, bought, this, dress, for, a, wedding, i, h...",i bought this dress for a wedding i have this ...,0.325000,0.560000


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
t_vec = TfidfVectorizer(ngram_range=(1,2), max_features=2000)
# matrix of ngrams
ngrams = t_vec.fit_transform(joined_df[joined_df['Rating'] == 1]['String'])
# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# list of ngrams
vocab = t_vec.vocabulary_
ngram_1 = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'ngram'})

In [46]:
#showing n_grams for items rated 1 with ngram range (1,2)

In [47]:
ngram_1[:20]

Unnamed: 0,frequency,ngram
0,96.497071,the
1,64.453134,it
2,53.08353,and
3,43.465043,is
4,37.747822,this
5,36.974899,wa
6,35.768199,to
7,33.075865,in
8,29.405706,of
9,29.166091,on


In [48]:
#interesting, even though I listed ngram range of (1,2), the top values are all bigrams

In [49]:
joined_df['Rating'].value_counts()

5    12539
4     4908
3     2823
2     1549
1      821
Name: Rating, dtype: int64

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
t_vec = TfidfVectorizer(ngram_range=(1,2), max_features=2000)
# matrix of ngrams
ngrams = t_vec.fit_transform(joined_df[joined_df['Rating'] == 5]['String'])
# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# list of ngrams
vocab = t_vec.vocabulary_
ngram_5 = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'ngram'})

In [51]:
#showing n_grams for items rated 5 with n_gram range (1,2)

In [52]:
ngram_5[:20]

Unnamed: 0,frequency,ngram
0,1243.473367,the
1,990.386261,it
2,947.103321,and
3,692.016002,is
4,579.773783,this
5,569.425519,to
6,513.408402,in
7,426.636788,with
8,426.610905,dress
9,415.661073,for


In [53]:
##will save this dataset: Note this does NOT add additional stop words like other data set AND removed very from stop words

In [54]:
joined_df = joined_df.rename(columns={'Clothing ID' : 'Clothing_ID', 'Review Text': 'Review_Text', 'Recommended IND': 'Recommended_IND', 'Positive Feedback Count':'Positive_Feedback_Count', 'Division Name': 'Division_Name', 'Class Name': 'Class_Name', 'polarity': 'Polarity', 'subjective': 'Subjective', 'polarity_clean': 'Polarity_Clean', 'subjective_clean': 'Subjective_Clean'})

In [55]:
joined_df.head()

Unnamed: 0,Age,Class_Name,Clothing_ID,Department Name,Division_Name,Positive_Feedback_Count,Rating,Recommended_IND,Title,Review_Text,Polarity,Subjective,Clean,String,Polarity_Clean,Subjective_Clean
0,33,Intimates,767,Intimate,Initmates,0,4,1,,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, and, sexy, and,...",absolutely wonderful silky and sexy and comfor...,0.633333,0.933333
1,34,Dresses,1080,Dresses,General,4,5,1,,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, this, dress, it's, sooo, pretty, i, hap...",love this dress it's sooo pretty i happened to...,0.31875,0.725
2,60,Dresses,1077,Dresses,General,0,3,0,Some major design flaws,I had such high hopes for this dress and reall...,0.073675,0.356294,"[i, had, such, high, hope, for, this, dress, a...",i had such high hope for this dress and really...,0.076392,0.356294
3,50,Pants,1049,Bottoms,General Petite,0,5,1,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[i, love, love, love, this, jumpsuit, it's, fu...",i love love love this jumpsuit it's fun flirty...,0.5,0.625
4,47,Blouses,847,Tops,General,6,5,1,Flattering shirt,This shirt is very flattering to all due to th...,0.512891,0.56875,"[this, shirt, is, very, flattering, to, all, d...",this shirt is very flattering to all due to th...,0.39375,0.56875


In [56]:
#adding Positive_Rating Columnn

In [57]:
def target_label(row):
    if row['Rating'] == 4:
      return 1
    elif row['Rating'] == 5:
        return 1
    else:
        return 0

In [58]:
joined_df['Positive_Rating'] = df.apply(lambda row: target_label(row), axis=1)

In [59]:
joined_df.head()

Unnamed: 0,Age,Class_Name,Clothing_ID,Department Name,Division_Name,Positive_Feedback_Count,Rating,Recommended_IND,Title,Review_Text,Polarity,Subjective,Clean,String,Polarity_Clean,Subjective_Clean,Positive_Rating
0,33,Intimates,767,Intimate,Initmates,0,4,1,,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, and, sexy, and,...",absolutely wonderful silky and sexy and comfor...,0.633333,0.933333,1
1,34,Dresses,1080,Dresses,General,4,5,1,,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, this, dress, it's, sooo, pretty, i, hap...",love this dress it's sooo pretty i happened to...,0.31875,0.725,1
2,60,Dresses,1077,Dresses,General,0,3,0,Some major design flaws,I had such high hopes for this dress and reall...,0.073675,0.356294,"[i, had, such, high, hope, for, this, dress, a...",i had such high hope for this dress and really...,0.076392,0.356294,0
3,50,Pants,1049,Bottoms,General Petite,0,5,1,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[i, love, love, love, this, jumpsuit, it's, fu...",i love love love this jumpsuit it's fun flirty...,0.5,0.625,1
4,47,Blouses,847,Tops,General,6,5,1,Flattering shirt,This shirt is very flattering to all due to th...,0.512891,0.56875,"[this, shirt, is, very, flattering, to, all, d...",this shirt is very flattering to all due to th...,0.39375,0.56875,1


In [60]:
#save dataset used for bigrams

In [61]:
joined_df.to_csv('./data/bigram_data.csv', index=False)