In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
%matplotlib inline

In [2]:
#Required text pre-processing libraries are imported
import string
import nltk
import re

# download the stopwords and wordnet corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
# import tokenize from nltk library
from nltk import tokenize
# import WordNetLemmatizer from nltk library
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer

#Required data visualisation libraries are imported
import plotly.express as px
import seaborn as sns 
import matplotlib.pyplot as plt



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jillian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jillian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jillian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jillian/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Load the raw review data from a CSV file (path is relative, not absolute).
df = pd.read_csv('../data/Womens Clothing E-Commerce Reviews 2.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [5]:
#performing same initial data cleaning

In [6]:
# Keep only the rows that actually have review text; the later scoring and
# tokenizing steps call string methods on every value in this column.
# `.copy()` makes the result an independent frame, so modifying it afterwards
# cannot trigger pandas' SettingWithCopyWarning.
df = df[df['Review Text'].notna()].copy()

In [7]:
# Drop the 'Unnamed: 0' column (presumably a leftover index column from an
# earlier export -- it mirrors the row index in the preview above).
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell re-runnable: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [8]:
# Show every row that is an exact repeat of an earlier row
# (duplicated() already returns a boolean mask, so it can index directly).
df[df.duplicated()]

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
21888,1022,37,,"Love, love these jeans. being short they come ...",5,1,0,General,Bottoms,Jeans


In [9]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# This drops the same row the duplicated() check above reported (label 21888)
# without hard-coding its index label, so the cell still works if the input
# file changes and does not raise KeyError when it is run a second time.
df = df.drop_duplicates()

In [10]:
# One-column frame holding only the review text; the sentiment and cleaning
# columns are added to this frame in the cells below.
df_text = df['Review Text'].to_frame()

In [11]:
df_text

Unnamed: 0,Review Text
0,Absolutely wonderful - silky and sexy and comf...
1,Love this dress! it's sooo pretty. i happene...
2,I had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl..."
4,This shirt is very flattering to all due to th...
...,...
23481,I was very happy to snag this dress at such a ...
23482,"It reminds me of maternity clothes. soft, stre..."
23483,"This fit well, but the top was very see throug..."
23484,I bought this dress for a wedding i have this ...


In [12]:
##adding in polarity and subjectivity of text and seeing how it evolves when text is cleaned

In [13]:
from textblob import TextBlob

# Score each raw review once. `TextBlob(...).sentiment` carries both the
# polarity and the subjectivity, so the text no longer has to be analysed
# twice (once per property) as it was with two separate TextBlob objects.
raw_sentiment = [TextBlob(review).sentiment for review in df_text['Review Text']]
df_text['polarity'] = [score.polarity for score in raw_sentiment]
df_text['subjective'] = [score.subjectivity for score in raw_sentiment]

In [14]:
df_text

Unnamed: 0,Review Text,polarity,subjective
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725000
2,I had such high hopes for this dress and reall...,0.073675,0.356294
3,"I love, love, love this jumpsuit. it's fun, fl...",0.550000,0.625000
4,This shirt is very flattering to all due to th...,0.512891,0.568750
...,...,...,...
23481,I was very happy to snag this dress at such a ...,0.552667,0.710000
23482,"It reminds me of maternity clothes. soft, stre...",0.091667,0.708333
23483,"This fit well, but the top was very see throug...",0.414286,0.596429
23484,I bought this dress for a wedding i have this ...,0.322222,0.577778


In [15]:
# Seed the 'Clean' column with the review text cast to str.
# NOTE(review): the cell that applies preprocess_text further down assigns
# 'Clean' again from 'Review Text', so this value looks like a placeholder only.
df_text['Clean'] = df_text["Review Text"].astype(str)

In [16]:
# Set up the tools used to normalise the reviews: a tokenizer, a stopword
# list and a lemmatizer.

# Tokenizer pattern: a run of one or more ASCII letters, optionally followed
# by an apostrophe and lowercase letters (e.g. "it's"). Digits and other
# punctuation never become part of a token.
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
tokenizer = RegexpTokenizer(pattern)

# Create list of stopwords in English (language of the reviews)
stopwords_list = stopwords.words("english")
# Remove "very" from the list of stopwords so it is kept in the cleaned text
stopwords_list.remove("very")

# Create an instance of nltk's WordNetLemmatizer with the variable name `lemmatizer`
lemmatizer = WordNetLemmatizer()

In [17]:
def preprocess_text(text, tokenizer, stopwords_list, lemmatizer):
    """Turn one raw review into a list of lowercased, lemmatized tokens.

    Parameters
    ----------
    text : str
        The raw review text.
    tokenizer : object
        Anything with a ``tokenize(str)`` method returning a list of tokens.
    stopwords_list : iterable of str
        Words to discard; they are compared against the lowercased tokens.
    lemmatizer : object
        Anything with a ``lemmatize(str)`` method returning a string.

    Returns
    -------
    list of str
        The surviving tokens, lemmatized, in their original order.
    """
    # Build the lookup once per call: membership in a set is a hash lookup,
    # whereas testing against the list scans it for every single token.
    stopword_lookup = set(stopwords_list)

    # Standardize case first so stopword matching is case-insensitive
    tokens = tokenizer.tokenize(text.lower())

    # Keep purely alphabetic, non-stopword tokens. isalpha() already rejects
    # punctuation, digits and tokens containing an apostrophe, so a separate
    # punctuation test would never remove anything extra.
    kept_tokens = [
        token for token in tokens
        if token not in stopword_lookup and token.isalpha()
    ]

    # Lemmatize (not stem) each remaining token
    return [lemmatizer.lemmatize(token) for token in kept_tokens]

In [18]:
# Run every review through preprocess_text and store the resulting token lists
df_text['Clean'] = df_text['Review Text'].apply(
    preprocess_text,
    args=(tokenizer, stopwords_list, lemmatizer),
)


In [19]:
df_text.head()

Unnamed: 0,Review Text,polarity,subjective,Clean
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]"
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, dress, sooo, pretty, happened, find, st..."
2,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init..."
3,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[love, love, love, jumpsuit, fun, flirty, fabu..."
4,This shirt is very flattering to all due to th...,0.512891,0.56875,"[shirt, very, flattering, due, adjustable, fro..."


In [20]:
# Create a new column from df_text['Clean'] that joins each list of tokens
# into a single space-separated string.
df_text['String']=df_text["Clean"].str.join(" ")

In [21]:
# Score each cleaned review once; `.sentiment` provides polarity and
# subjectivity together, so the string is not analysed twice.
clean_sentiment = [TextBlob(cleaned).sentiment for cleaned in df_text['String']]
df_text['polarity_clean'] = [score.polarity for score in clean_sentiment]
df_text['subjective_clean'] = [score.subjectivity for score in clean_sentiment]

In [22]:
df_text.head()

Unnamed: 0,Review Text,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store gla...,0.31875,0.725
2,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762
3,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.5,0.625
4,This shirt is very flattering to all due to th...,0.512891,0.56875,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.39375,0.56875


In [23]:
df_text.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22640 entries, 0 to 23485
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Review Text       22640 non-null  object 
 1   polarity          22640 non-null  float64
 2   subjective        22640 non-null  float64
 3   Clean             22640 non-null  object 
 4   String            22640 non-null  object 
 5   polarity_clean    22640 non-null  float64
 6   subjective_clean  22640 non-null  float64
dtypes: float64(4), object(3)
memory usage: 1.4+ MB


## Bigrams using CountVectorizer

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Count bigrams only, with the vocabulary capped at 2000 terms
c_vec = CountVectorizer(ngram_range=(2,2), max_features=2000)
# sparse document-term matrix of bigram counts
ngrams = c_vec.fit_transform(df_text['String'])
# Total count per bigram. Summing the sparse matrix directly avoids
# materialising the dense (n_reviews x 2000) array that .toarray() builds.
count_values = np.asarray(ngrams.sum(axis=0)).ravel()
# mapping of bigram -> column index in the matrix
vocab = c_vec.vocabulary_
# Rows ordered by (count, bigram), largest first
df_ngram = pd.DataFrame(
    sorted(((count_values[i], k) for k, i in vocab.items()), reverse=True),
    columns=['frequency', 'bigram'],
)


In [25]:
df_ngram

Unnamed: 0,frequency,bigram
0,1348,true size
1,1090,fit perfectly
2,902,look great
3,883,very flattering
4,806,look like
...,...,...
1995,29,bit see
1996,29,bit low
1997,29,beautiful look
1998,29,arm fit


In [26]:
# Score each bigram once; `.sentiment` provides polarity and subjectivity
# together, so every bigram is analysed a single time.
bigram_sentiment = [TextBlob(bigram).sentiment for bigram in df_ngram['bigram']]
df_ngram['polarity'] = [score.polarity for score in bigram_sentiment]
df_ngram['subjective'] = [score.subjectivity for score in bigram_sentiment]

In [27]:
df_ngram

Unnamed: 0,frequency,bigram,polarity,subjective
0,1348,true size,0.35,0.65
1,1090,fit perfectly,0.70,0.70
2,902,look great,0.80,0.75
3,883,very flattering,0.20,0.30
4,806,look like,0.00,0.00
...,...,...,...,...
1995,29,bit see,0.00,0.00
1996,29,bit low,0.00,0.30
1997,29,beautiful look,0.85,1.00
1998,29,arm fit,0.40,0.40


## Ngrams using TfidfVectorizer

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tf-idf weights for bigrams only, with the vocabulary capped at 2000 terms
t_vec = TfidfVectorizer(ngram_range=(2,2), max_features=2000)
# sparse document-term matrix of tf-idf weights
ngrams = t_vec.fit_transform(df_text['String'])
# Summed tf-idf weight per bigram (a score, not a raw count -- the column
# keeps the name 'frequency' so the cells below continue to work).
# Summing the sparse matrix directly avoids the dense array .toarray() builds;
# the result can differ from the dense sum only by floating-point rounding.
count_values = np.asarray(ngrams.sum(axis=0)).ravel()
# mapping of bigram -> column index in the matrix
vocab = t_vec.vocabulary_
# Rows ordered by (weight, bigram), largest first
df_ngram_tvec = pd.DataFrame(
    sorted(((count_values[i], k) for k, i in vocab.items()), reverse=True),
    columns=['frequency', 'bigram'],
)


In [29]:
df_ngram_tvec

Unnamed: 0,frequency,bigram
0,319.881671,true size
1,266.091141,fit perfectly
2,240.854591,look great
3,240.509140,very flattering
4,216.166458,look like
...,...,...
1995,10.900752,neckline very
1996,10.837878,medium usually
1997,10.806020,lb dress
1998,10.553096,large retailer


In [30]:
# Score each bigram once; `.sentiment` provides polarity and subjectivity
# together, so every bigram is analysed a single time.
tvec_sentiment = [TextBlob(bigram).sentiment for bigram in df_ngram_tvec['bigram']]
df_ngram_tvec['polarity'] = [score.polarity for score in tvec_sentiment]
df_ngram_tvec['subjective'] = [score.subjectivity for score in tvec_sentiment]

In [31]:
df_ngram_tvec

Unnamed: 0,frequency,bigram,polarity,subjective
0,319.881671,true size,0.350000,0.650000
1,266.091141,fit perfectly,0.700000,0.700000
2,240.854591,look great,0.800000,0.750000
3,240.509140,very flattering,0.200000,0.300000
4,216.166458,look like,0.000000,0.000000
...,...,...,...,...
1995,10.900752,neckline very,0.200000,0.300000
1996,10.837878,medium usually,-0.250000,0.250000
1997,10.806020,lb dress,0.000000,0.000000
1998,10.553096,large retailer,0.214286,0.428571


In [32]:
df_ngram_tvec.sort_values('polarity')[:20]

Unnamed: 0,frequency,bigram,polarity,subjective
191,48.173141,very disappointed,-0.975,0.975
1577,14.462974,disappointed quality,-0.75,0.75
531,27.468028,cold water,-0.6,1.0
1813,13.096874,washed cold,-0.6,1.0
144,55.168411,very thin,-0.52,1.0
1347,16.022185,dress casual,-0.5,0.866667
579,26.189895,casual dress,-0.5,0.866667
1236,16.833782,casual wear,-0.5,0.866667
568,26.371354,casual look,-0.5,0.866667
1355,15.948621,casual day,-0.5,0.866667


In [33]:
#20 most "negative" sentiments

In [34]:
df_ngram_tvec.sort_values('polarity')[-25:]

Unnamed: 0,frequency,bigram,polarity,subjective
966,19.520276,very beautiful,1.0,1.0
452,29.712709,perfect dress,1.0,1.0
941,19.796405,perfect work,1.0,1.0
317,36.324116,would perfect,1.0,1.0
1628,14.162816,jean perfect,1.0,1.0
1116,17.897952,look best,1.0,0.3
1780,13.274753,fabric perfect,1.0,1.0
268,39.140408,size perfect,1.0,1.0
532,27.445083,perfect spring,1.0,1.0
1542,14.666039,perfect amount,1.0,1.0


In [35]:
#25 most "positive" sentiments to capture all sentiments with polarity of 1.0

In [36]:
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [37]:
df_text.head()

Unnamed: 0,Review Text,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store gla...,0.31875,0.725
2,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762
3,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.5,0.625
4,This shirt is very flattering to all due to th...,0.512891,0.56875,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.39375,0.56875


In [38]:
# Columns that exist in df but not in df_text (returned as a sorted Index)
different_cols = df.columns.difference(df_text.columns)

In [39]:
different_cols

Index(['Age', 'Class Name', 'Clothing ID', 'Department Name', 'Division Name',
       'Positive Feedback Count', 'Rating', 'Recommended IND', 'Title'],
      dtype='object')

In [40]:
df_text

Unnamed: 0,Review Text,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725000,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store gla...,0.318750,0.725000
2,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762
3,"I love, love, love this jumpsuit. it's fun, fl...",0.550000,0.625000,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.500000,0.625000
4,This shirt is very flattering to all due to th...,0.512891,0.568750,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.393750,0.568750
...,...,...,...,...,...,...,...
23481,I was very happy to snag this dress at such a ...,0.552667,0.710000,"[very, happy, snag, dress, great, price, very,...",very happy snag dress great price very easy sl...,0.640833,0.762500
23482,"It reminds me of maternity clothes. soft, stre...",0.091667,0.708333,"[reminds, maternity, clothes, soft, stretchy, ...",reminds maternity clothes soft stretchy shiny ...,0.191667,0.708333
23483,"This fit well, but the top was very see throug...",0.414286,0.596429,"[fit, well, top, very, see, never, would, work...",fit well top very see never would worked glad ...,0.414286,0.596429
23484,I bought this dress for a wedding i have this ...,0.322222,0.577778,"[bought, dress, wedding, summer, cute, unfortu...",bought dress wedding summer cute unfortunately...,0.416667,0.511111


In [41]:
#creating df_new from only the columns df_text does not already have, so that shared columns (Review Text) are not duplicated in the joined df

In [42]:
# Restrict df to the columns selected in different_cols
df_new = df[different_cols]

In [43]:
df_new

Unnamed: 0,Age,Class Name,Clothing ID,Department Name,Division Name,Positive Feedback Count,Rating,Recommended IND,Title
0,33,Intimates,767,Intimate,Initmates,0,4,1,
1,34,Dresses,1080,Dresses,General,4,5,1,
2,60,Dresses,1077,Dresses,General,0,3,0,Some major design flaws
3,50,Pants,1049,Bottoms,General Petite,0,5,1,My favorite buy!
4,47,Blouses,847,Tops,General,6,5,1,Flattering shirt
...,...,...,...,...,...,...,...,...,...
23481,34,Dresses,1104,Dresses,General Petite,0,5,1,Great dress for many occasions
23482,48,Knits,862,Tops,General Petite,0,3,1,Wish it was made of cotton
23483,31,Dresses,1104,Dresses,General Petite,1,3,0,"Cute, but see through"
23484,28,Dresses,1084,Dresses,General,2,3,1,"Very cute dress, perfect for summer parties an..."


In [44]:
# Place the two frames side by side, aligned on the row index;
# join='inner' keeps only index labels that appear in both frames.
joined_df = pd.concat([df_new, df_text], axis=1, join='inner')
joined_df

Unnamed: 0,Age,Class Name,Clothing ID,Department Name,Division Name,Positive Feedback Count,Rating,Recommended IND,Title,Review Text,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,33,Intimates,767,Intimate,Initmates,0,4,1,,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333
1,34,Dresses,1080,Dresses,General,4,5,1,,Love this dress! it's sooo pretty. i happene...,0.339583,0.725000,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store gla...,0.318750,0.725000
2,60,Dresses,1077,Dresses,General,0,3,0,Some major design flaws,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762
3,50,Pants,1049,Bottoms,General Petite,0,5,1,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",0.550000,0.625000,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.500000,0.625000
4,47,Blouses,847,Tops,General,6,5,1,Flattering shirt,This shirt is very flattering to all due to th...,0.512891,0.568750,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.393750,0.568750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23481,34,Dresses,1104,Dresses,General Petite,0,5,1,Great dress for many occasions,I was very happy to snag this dress at such a ...,0.552667,0.710000,"[very, happy, snag, dress, great, price, very,...",very happy snag dress great price very easy sl...,0.640833,0.762500
23482,48,Knits,862,Tops,General Petite,0,3,1,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",0.091667,0.708333,"[reminds, maternity, clothes, soft, stretchy, ...",reminds maternity clothes soft stretchy shiny ...,0.191667,0.708333
23483,31,Dresses,1104,Dresses,General Petite,1,3,0,"Cute, but see through","This fit well, but the top was very see throug...",0.414286,0.596429,"[fit, well, top, very, see, never, would, work...",fit well top very see never would worked glad ...,0.414286,0.596429
23484,28,Dresses,1084,Dresses,General,2,3,1,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,0.322222,0.577778,"[bought, dress, wedding, summer, cute, unfortu...",bought dress wedding summer cute unfortunately...,0.416667,0.511111


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tf-idf weights for unigrams and bigrams, vocabulary capped at 2000 terms
t_vec = TfidfVectorizer(ngram_range=(1,2), max_features=2000)
# sparse document-term matrix built from the reviews rated 1 only
ngrams = t_vec.fit_transform(joined_df.loc[joined_df['Rating'] == 1, 'String'])
# Summed tf-idf weight per n-gram (a score, not a raw count -- the column
# keeps the name 'frequency'). Summing the sparse matrix directly avoids the
# dense array .toarray() builds; the result can differ from the dense sum
# only by floating-point rounding.
count_values = np.asarray(ngrams.sum(axis=0)).ravel()
# mapping of n-gram -> column index in the matrix
vocab = t_vec.vocabulary_
# Rows ordered by (weight, n-gram), largest first
ngram_1 = pd.DataFrame(
    sorted(((count_values[i], k) for k, i in vocab.items()), reverse=True),
    columns=['frequency', 'ngram'],
)

In [46]:
#showing n_grams for items rated 1 with ngram range (1,2)

In [47]:
ngram_1[:20]

Unnamed: 0,frequency,ngram
0,35.317672,dress
1,28.746217,very
2,28.135372,like
3,27.207925,look
4,26.517836,top
5,22.981598,fit
6,22.336542,fabric
7,20.460916,size
8,20.173883,shirt
9,19.072373,back


In [48]:
#interesting, even though I listed ngram range of (1,2), the top values are all unigrams (single words)

In [49]:
joined_df['Rating'].value_counts()

5    12539
4     4908
3     2823
2     1549
1      821
Name: Rating, dtype: int64

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tf-idf weights for unigrams and bigrams, vocabulary capped at 2000 terms
t_vec = TfidfVectorizer(ngram_range=(1,2), max_features=2000)
# sparse document-term matrix built from the reviews rated 5 only
ngrams = t_vec.fit_transform(joined_df.loc[joined_df['Rating'] == 5, 'String'])
# Summed tf-idf weight per n-gram (a score, not a raw count -- the column
# keeps the name 'frequency'). Summing the sparse matrix directly avoids the
# dense array .toarray() builds; the result can differ from the dense sum
# only by floating-point rounding.
count_values = np.asarray(ngrams.sum(axis=0)).ravel()
# mapping of n-gram -> column index in the matrix
vocab = t_vec.vocabulary_
# Rows ordered by (weight, n-gram), largest first
ngram_5 = pd.DataFrame(
    sorted(((count_values[i], k) for k, i in vocab.items()), reverse=True),
    columns=['frequency', 'ngram'],
)

In [51]:
#showing n_grams for items rated 5 with n_gram range (1,2)

In [52]:
ngram_5[:20]

Unnamed: 0,frequency,ngram
0,543.754137,dress
1,506.388118,love
2,455.177814,fit
3,415.750002,size
4,404.449984,great
5,401.797037,top
6,386.889436,very
7,370.276214,color
8,357.469516,wear
9,325.887982,look


In [53]:
##will save this dataset. Note: unlike the other data set, this one does NOT add additional stop words, AND "very" was removed from the stop words (so it is kept in the text)

In [54]:
# Give the columns underscore-separated, capitalised names.
# NOTE(review): 'Department Name' is not in this mapping, so it keeps its
# space while the other multi-word columns do not -- confirm whether that is
# intentional before relying on the saved column names.
joined_df = joined_df.rename(
    columns={
        'Clothing ID' : 'Clothing_ID',
        'Review Text': 'Review_Text',
        'Recommended IND': 'Recommended_IND',
        'Positive Feedback Count':'Positive_Feedback_Count',
        'Division Name': 'Division_Name',
        'Class Name': 'Class_Name',
        'polarity': 'Polarity',
        'subjective': 'Subjective',
        'polarity_clean': 'Polarity_Clean',
        'subjective_clean': 'Subjective_Clean',
    }
)

In [55]:
joined_df.head()

Unnamed: 0,Age,Class_Name,Clothing_ID,Department Name,Division_Name,Positive_Feedback_Count,Rating,Recommended_IND,Title,Review_Text,Polarity,Subjective,Clean,String,Polarity_Clean,Subjective_Clean
0,33,Intimates,767,Intimate,Initmates,0,4,1,,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333
1,34,Dresses,1080,Dresses,General,4,5,1,,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store gla...,0.31875,0.725
2,60,Dresses,1077,Dresses,General,0,3,0,Some major design flaws,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762
3,50,Pants,1049,Bottoms,General Petite,0,5,1,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.5,0.625
4,47,Blouses,847,Tops,General,6,5,1,Flattering shirt,This shirt is very flattering to all due to th...,0.512891,0.56875,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.39375,0.56875


In [56]:
#adding Positive_Rating column

In [57]:
def target_label(row):
    """Return 1 when the row's 'Rating' is 4 or 5, otherwise 0."""
    return 1 if row['Rating'] in (4, 5) else 0

In [58]:
# Label each row from its own 'Rating' value. Applying to joined_df itself
# (rather than to the separate df frame) means the labels come from the frame
# being labelled and do not depend on the two frames sharing an index.
joined_df['Positive_Rating'] = joined_df.apply(target_label, axis=1)

In [59]:
joined_df.head()

Unnamed: 0,Age,Class_Name,Clothing_ID,Department Name,Division_Name,Positive_Feedback_Count,Rating,Recommended_IND,Title,Review_Text,Polarity,Subjective,Clean,String,Polarity_Clean,Subjective_Clean,Positive_Rating
0,33,Intimates,767,Intimate,Initmates,0,4,1,,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333,1
1,34,Dresses,1080,Dresses,General,4,5,1,,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store gla...,0.31875,0.725,1
2,60,Dresses,1077,Dresses,General,0,3,0,Some major design flaws,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762,0
3,50,Pants,1049,Bottoms,General Petite,0,5,1,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.5,0.625,1
4,47,Blouses,847,Tops,General,6,5,1,Flattering shirt,This shirt is very flattering to all due to th...,0.512891,0.56875,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.39375,0.56875,1


In [60]:
#save dataset used for bigrams

In [61]:
# Write the labelled frame to disk without the row index.
# NOTE(review): the 'Clean' column holds Python lists; a CSV stores them as
# text, so a reader gets strings back -- confirm downstream code expects that.
joined_df.to_csv('../data/bigram_data.csv', index=False)