In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
%matplotlib inline

In [2]:
#Required text pre-processing libraries are imported
import string
import nltk
import re

# download the stopwords and wordnet corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
# import tokenize from nltk library
from nltk import tokenize
# import WordNetLemmatizer from nltk library
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist

#Required data visualisation libraries are imported
import plotly.express as px
import seaborn as sns 
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jillian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jillian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jillian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jillian/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
df = pd.read_csv('./data/Womens Clothing E-Commerce Reviews 2.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [5]:
df = df[df['Review Text'].isna()==False]

In [6]:
df.drop(columns='Unnamed: 0', inplace=True)

In [7]:
df[df.duplicated() == True]

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
21888,1022,37,,"Love, love these jeans. being short they come ...",5,1,0,General,Bottoms,Jeans


In [8]:
df.drop(21888, inplace=True)

In [9]:
df_text = pd.DataFrame(df['Review Text'])

In [10]:
df_text

Unnamed: 0,Review Text
0,Absolutely wonderful - silky and sexy and comf...
1,Love this dress! it's sooo pretty. i happene...
2,I had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl..."
4,This shirt is very flattering to all due to th...
...,...
23481,I was very happy to snag this dress at such a ...
23482,"It reminds me of maternity clothes. soft, stre..."
23483,"This fit well, but the top was very see throug..."
23484,I bought this dress for a wedding i have this ...


In [11]:
from textblob import TextBlob
df_text['polarity'] = df_text['Review Text'].apply(lambda x: TextBlob(x).polarity)
df_text['subjective'] = df_text['Review Text'].apply(lambda x: TextBlob(x).subjectivity)

In [12]:
df_text

Unnamed: 0,Review Text,polarity,subjective
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725000
2,I had such high hopes for this dress and reall...,0.073675,0.356294
3,"I love, love, love this jumpsuit. it's fun, fl...",0.550000,0.625000
4,This shirt is very flattering to all due to th...,0.512891,0.568750
...,...,...,...
23481,I was very happy to snag this dress at such a ...,0.552667,0.710000
23482,"It reminds me of maternity clothes. soft, stre...",0.091667,0.708333
23483,"This fit well, but the top was very see throug...",0.414286,0.596429
23484,I bought this dress for a wedding i have this ...,0.322222,0.577778


In [13]:
#remove regular expressions

# write a regular expression to identify urls in text
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# write a regular expression to identify non-ascii characters in text
non_ascii_regex = r'[^\x00-\x7F]+'

In [14]:
# write a function to regular expression cleaning

def clean_regex(text_variable):
    
    # use library re to replace urls by token - urlplaceholder
    text_variable = re.sub(url_regex, 'urlplaceholder', text_variable)    
    
    # use library re to replace non ascii characters by a space
    text_variable = re.sub(non_ascii_regex, ' ', text_variable)

    return text_variable  

In [15]:
# Created a new column by vectorizing function to the raw review text
df_text['Clean'] = np.vectorize(clean_regex)(df["Review Text"])

In [16]:
#Convert all texts to lowercase
df_text.Clean = df_text.Clean.str.lower()

In [17]:
#Remove punctuation

def punctuation_removal(punc):
    rem_punc = [i for i in punc if i not in string.punctuation]
    after_punc = ''.join(rem_punc)
    return after_punc

In [18]:
df_text.Clean = df_text.Clean.apply(punctuation_removal)
df_text.head()

Unnamed: 0,Review Text,polarity,subjective,Clean
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,absolutely wonderful silky and sexy and comfo...
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,love this dress its sooo pretty i happened t...
2,I had such high hopes for this dress and reall...,0.073675,0.356294,i had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,i love love love this jumpsuit its fun flirty ...
4,This shirt is very flattering to all due to th...,0.512891,0.56875,this shirt is very flattering to all due to th...


In [19]:
# Instantiate a word net lemmatizer object and create helper function to apply it to the dataframe.
lemmatizer = WordNetLemmatizer()

def tokenize_lem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in tokenize.sent_tokenize(text) for word in tokenize.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return [lemmatizer.lemmatize(token) for token in filtered_tokens]


In [20]:
# Apply the lemmatizer function to df.clean, lemmatizing the words and creating a list.
df_text['Clean'] = df_text['Clean'].apply(tokenize_lem)

In [21]:
sw = stopwords.words('english')

In [22]:
len(sw)

179

In [23]:
sw.pop(sw.index('very'))

'very'

In [24]:
len(sw)

178

In [25]:
# Helper function to remove stopwords
def remove_stopwords(token_list):
    """
    Given a list of tokens, return a list where the tokens
    that are also present in sw have been removed
    """
    stopwords_r = [token for token in token_list if token not in sw]
    return stopwords_r

In [26]:
# Apply the removal function to df.clean
df_text.Clean=df_text.Clean.apply(remove_stopwords)

# create new column from df_clean that converts list of tokens to a string.
df_text['String']=df_text.Clean.str.join(" ")

In [27]:
df_text.head()

Unnamed: 0,Review Text,polarity,subjective,Clean,String
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store im ...
2,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...
3,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...
4,This shirt is very flattering to all due to th...,0.512891,0.56875,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...


In [28]:
df_text['polarity_clean'] = df_text['String'].apply(lambda x: TextBlob(x).polarity)
df_text['subjective_clean'] = df_text['String'].apply(lambda x: TextBlob(x).subjectivity)

In [29]:
df_text.head()

Unnamed: 0,Review Text,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store im ...,0.31875,0.725
2,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762
3,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.5,0.625
4,This shirt is very flattering to all due to th...,0.512891,0.56875,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.39375,0.56875


In [30]:
df_text.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22640 entries, 0 to 23485
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Review Text       22640 non-null  object 
 1   polarity          22640 non-null  float64
 2   subjective        22640 non-null  float64
 3   Clean             22640 non-null  object 
 4   String            22640 non-null  object 
 5   polarity_clean    22640 non-null  float64
 6   subjective_clean  22640 non-null  float64
dtypes: float64(4), object(3)
memory usage: 1.4+ MB


## Bigrams using CountVectorizer

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
c_vec = CountVectorizer(ngram_range=(2,2), max_features=2000)
# matrix of ngrams
ngrams = c_vec.fit_transform(df_text['String'])
# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# list of ngrams
vocab = c_vec.vocabulary_
df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram'})


In [32]:
df_ngram

Unnamed: 0,frequency,bigram
0,1299,true size
1,1075,fit perfectly
2,895,look great
3,870,very flattering
4,799,look like
...,...,...
1995,31,also ordered
1996,31,across back
1997,30,love very
1998,30,fabric doe


In [33]:
df_ngram['polarity'] = df_ngram['bigram'].apply(lambda x: TextBlob(x).polarity)
df_ngram['subjective'] = df_ngram['bigram'].apply(lambda x: TextBlob(x).subjectivity)

In [34]:
df_ngram

Unnamed: 0,frequency,bigram,polarity,subjective
0,1299,true size,0.35,0.65
1,1075,fit perfectly,0.70,0.70
2,895,look great,0.80,0.75
3,870,very flattering,0.20,0.30
4,799,look like,0.00,0.00
...,...,...,...,...
1995,31,also ordered,0.00,0.00
1996,31,across back,0.00,0.00
1997,30,love very,0.35,0.45
1998,30,fabric doe,0.00,0.00


In [35]:
df_ngram.polarity.value_counts()

 0.000000    1030
 0.500000     182
 0.400000      89
 0.200000      82
 0.800000      54
             ... 
-0.066667       1
 0.400000       1
 0.563333       1
-0.118750       1
 0.130000       1
Name: polarity, Length: 113, dtype: int64

In [36]:
df_ngram.subjective.value_counts()

0.000000    948
0.500000    142
0.400000    126
1.000000    122
0.750000     77
           ... 
0.816667      1
0.808333      1
0.525000      1
0.325000      1
0.833333      1
Name: subjective, Length: 97, dtype: int64

## Bigrams using TfdifVectorizer

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
t_vec = TfidfVectorizer(ngram_range=(2,2), max_features=2000)
# matrix of ngrams
ngrams = t_vec.fit_transform(df_text['String'])
# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# list of ngrams
vocab = c_vec.vocabulary_
df_ngram_tvec = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram'})


In [38]:
df_ngram_tvec

Unnamed: 0,frequency,bigram
0,311.599498,true size
1,261.873806,fit perfectly
2,238.825697,look great
3,234.957061,very flattering
4,207.449807,look like
...,...,...
1995,11.270358,regular small
1996,11.258578,wear xl
1997,11.254175,said run
1998,11.036504,hit inch


In [39]:
df_ngram_tvec['polarity'] = df_ngram_tvec['bigram'].apply(lambda x: TextBlob(x).polarity)
df_ngram_tvec['subjective'] = df_ngram_tvec['bigram'].apply(lambda x: TextBlob(x).subjectivity)

In [40]:
df_ngram_tvec

Unnamed: 0,frequency,bigram,polarity,subjective
0,311.599498,true size,0.350,0.650000
1,261.873806,fit perfectly,0.700,0.700000
2,238.825697,look great,0.800,0.750000
3,234.957061,very flattering,0.200,0.300000
4,207.449807,look like,0.000,0.000000
...,...,...,...,...
1995,11.270358,regular small,-0.125,0.238462
1996,11.258578,wear xl,0.000,0.000000
1997,11.254175,said run,0.000,0.000000
1998,11.036504,hit inch,0.000,0.000000


In [41]:
df_ngram_tvec.sort_values('polarity')[:20]

Unnamed: 0,frequency,bigram,polarity,subjective
240,44.166572,very disappointed,-0.975,0.975
189,50.001526,wa disappointed,-0.75,0.75
611,26.638104,cold water,-0.6,1.0
537,28.488818,wa afraid,-0.6,0.9
167,52.593276,very thin,-0.52,1.0
685,24.931255,casual look,-0.5,0.866667
1557,14.960843,casual day,-0.5,0.866667
1571,14.907083,dress casual,-0.5,0.866667
717,24.277257,casual dress,-0.5,0.866667
1401,16.008785,casual wear,-0.5,0.866667


In [42]:
#20 most "negative" sentiments

In [43]:
df_ngram_tvec.sort_values('polarity')[-25:]

Unnamed: 0,frequency,bigram,polarity,subjective
28,114.327579,wa perfect,1.0,1.0
1457,15.58937,perfect wear,1.0,1.0
718,24.242341,perfect spring,1.0,1.0
1510,15.207509,look awesome,1.0,1.0
780,22.956844,size perfect,1.0,1.0
567,27.625908,color perfect,1.0,1.0
481,29.825247,perfect dress,1.0,1.0
1628,14.618173,one best,1.0,0.3
1057,19.101428,perfect weight,1.0,1.0
370,35.106826,would perfect,1.0,1.0


In [44]:
#25 most "positive" sentiments to capture all sentiments with polarity of 1.0

In [45]:
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [46]:
df_text.head()

Unnamed: 0,Review Text,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333
1,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store im ...,0.31875,0.725
2,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762
3,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.5,0.625
4,This shirt is very flattering to all due to th...,0.512891,0.56875,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.39375,0.56875


In [47]:
joined_df = pd.concat([df, df_text], axis=1, join='inner')
joined_df

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Review Text.1,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happene...,0.339583,0.725000,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store im ...,0.318750,0.725000
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"I love, love, love this jumpsuit. it's fun, fl...",0.550000,0.625000,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.500000,0.625000
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,This shirt is very flattering to all due to th...,0.512891,0.568750,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.393750,0.568750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23481,1104,34,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,General Petite,Dresses,Dresses,I was very happy to snag this dress at such a ...,0.552667,0.710000,"[wa, very, happy, snag, dress, great, price, v...",wa very happy snag dress great price very easy...,0.640833,0.762500
23482,862,48,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3,1,0,General Petite,Tops,Knits,"It reminds me of maternity clothes. soft, stre...",0.091667,0.708333,"[reminds, maternity, clothes, soft, stretchy, ...",reminds maternity clothes soft stretchy shiny ...,0.191667,0.708333
23483,1104,31,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,General Petite,Dresses,Dresses,"This fit well, but the top was very see throug...",0.414286,0.596429,"[fit, well, top, wa, very, see, never, would, ...",fit well top wa very see never would worked im...,0.414286,0.596429
23484,1084,28,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,General,Dresses,Dresses,I bought this dress for a wedding i have this ...,0.322222,0.577778,"[bought, dress, wedding, summer, cute, unfortu...",bought dress wedding summer cute unfortunately...,0.416667,0.511111


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
t_vec = TfidfVectorizer(ngram_range=(2,2), max_features=2000)
# matrix of ngrams
ngrams = t_vec.fit_transform(joined_df[joined_df['Rating'] == 1]['String'])
# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# list of ngrams
vocab = c_vec.vocabulary_
ngram_1 = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram'})

In [49]:
#showing n_grams for items rated 1

In [50]:
ngram_1[:20]

Unnamed: 0,frequency,bigram
0,15.139443,look fit
1,10.766442,gorgeous color
2,10.558724,wear skirt
3,10.03273,looser fit
4,8.293502,wa unflattering
5,7.976065,wa boxy
6,6.449168,fit much
7,6.385442,dress ordered
8,5.686029,wear cami
9,5.636209,size sold


In [51]:
joined_df['Rating'].value_counts()

5    12539
4     4908
3     2823
2     1549
1      821
Name: Rating, dtype: int64

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
t_vec = TfidfVectorizer(ngram_range=(2,2), max_features=2000)
# matrix of ngrams
ngrams = t_vec.fit_transform(joined_df[joined_df['Rating'] == 5]['String'])
# count frequency of ngrams
count_values = ngrams.toarray().sum(axis=0)
# list of ngrams
vocab = c_vec.vocabulary_
ngram_5 = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'frequency', 1:'bigram'})

In [53]:
#showing n_grams for items rated 5

In [54]:
ngram_5[:20]

Unnamed: 0,frequency,bigram
0,207.406141,usually buy
1,192.141776,fit im
2,165.485679,very pretty
3,153.123107,look really
4,131.481958,love ha
5,118.086668,lovely dress
6,115.084638,very large
7,113.088408,fit color
8,96.848483,hip area
9,95.050854,fit great


In [55]:
pd.DataFrame(ngrams)

Unnamed: 0,0
0,"(0, 1965)\t0.3742377295296265\n (0, 850)\t0..."
1,"(0, 1865)\t0.4302110168564908\n (0, 1638)\t..."
2,"(0, 1004)\t0.38116136417011426\n (0, 1179)\..."
3,"(0, 1186)\t0.48616987697553543\n (0, 1825)\..."
4,"(0, 1462)\t0.4314045709506289\n (0, 1160)\t..."
...,...
12534,"(0, 1029)\t0.29233870019959923\n (0, 1619)\..."
12535,"(0, 397)\t0.2529644585888968\n (0, 1830)\t0..."
12536,"(0, 1608)\t0.3386386494667569\n (0, 1226)\t..."
12537,"(0, 362)\t0.33070457416087845\n (0, 173)\t0..."


In [57]:
##will save this dataset for utilizing bigrams: Note this does NOT add additional
#stop words like other data set AND removed very from stop words

In [58]:
joined_df

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Review Text.1,polarity,subjective,Clean,String,polarity_clean,subjective_clean
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happene...,0.339583,0.725000,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store im ...,0.318750,0.725000
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"I love, love, love this jumpsuit. it's fun, fl...",0.550000,0.625000,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.500000,0.625000
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,This shirt is very flattering to all due to th...,0.512891,0.568750,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.393750,0.568750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23481,1104,34,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,General Petite,Dresses,Dresses,I was very happy to snag this dress at such a ...,0.552667,0.710000,"[wa, very, happy, snag, dress, great, price, v...",wa very happy snag dress great price very easy...,0.640833,0.762500
23482,862,48,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3,1,0,General Petite,Tops,Knits,"It reminds me of maternity clothes. soft, stre...",0.091667,0.708333,"[reminds, maternity, clothes, soft, stretchy, ...",reminds maternity clothes soft stretchy shiny ...,0.191667,0.708333
23483,1104,31,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,General Petite,Dresses,Dresses,"This fit well, but the top was very see throug...",0.414286,0.596429,"[fit, well, top, wa, very, see, never, would, ...",fit well top wa very see never would worked im...,0.414286,0.596429
23484,1084,28,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,General,Dresses,Dresses,I bought this dress for a wedding i have this ...,0.322222,0.577778,"[bought, dress, wedding, summer, cute, unfortu...",bought dress wedding summer cute unfortunately...,0.416667,0.511111


In [59]:
joined_df = joined_df.rename(columns={'Clothing ID' : 'Clothing_ID', 'Review Text': 'Review_Text', 'Recommended IND': 'Recommended_IND', 'Positive Feedback Count':'Positive_Feedback_Count', 'Division Name': 'Division_Name', 'Class Name': 'Class_Name', 'polarity': 'Polarity', 'subjective': 'Subjective', 'polarity_clean': 'Polarity_Clean', 'subjective_clean': 'Subjective_Clean'})

In [60]:
joined_df.head()

Unnamed: 0,Clothing_ID,Age,Title,Review_Text,Rating,Recommended_IND,Positive_Feedback_Count,Division_Name,Department Name,Class_Name,Review_Text.1,Polarity,Subjective,Clean,String,Polarity_Clean,Subjective_Clean
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store im ...,0.31875,0.725
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.5,0.625
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,This shirt is very flattering to all due to th...,0.512891,0.56875,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.39375,0.56875


In [61]:
#adding Positive_Rating Columnn

In [62]:
def target_label(row):
    if row['Rating'] == 4:
      return 1
    elif row['Rating'] == 5:
        return 1
    else:
        return 0

In [63]:
joined_df['Positive_Rating'] = df.apply(lambda row: target_label(row), axis=1)

In [64]:
joined_df.head()

Unnamed: 0,Clothing_ID,Age,Title,Review_Text,Rating,Recommended_IND,Positive_Feedback_Count,Division_Name,Department Name,Class_Name,Review_Text.1,Polarity,Subjective,Clean,String,Polarity_Clean,Subjective_Clean,Positive_Rating
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and comf...,0.633333,0.933333,"[absolutely, wonderful, silky, sexy, comfortable]",absolutely wonderful silky sexy comfortable,0.633333,0.933333,1
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happene...,0.339583,0.725,"[love, dress, sooo, pretty, happened, find, st...",love dress sooo pretty happened find store im ...,0.31875,0.725,1
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,I had such high hopes for this dress and reall...,0.073675,0.356294,"[high, hope, dress, really, wanted, work, init...",high hope dress really wanted work initially o...,0.079865,0.349762,0
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"I love, love, love this jumpsuit. it's fun, fl...",0.55,0.625,"[love, love, love, jumpsuit, fun, flirty, fabu...",love love love jumpsuit fun flirty fabulous ev...,0.5,0.625,1
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,This shirt is very flattering to all due to th...,0.512891,0.56875,"[shirt, very, flattering, due, adjustable, fro...",shirt very flattering due adjustable front tie...,0.39375,0.56875,1


In [65]:
#save dataset used for bigrams

In [66]:
df.to_csv('./data/bigram_data.csv', index=False)