# Import Libraries

In [104]:
import pandas as pd
import plotly as py
import cufflinks as cf
from textblob import TextBlob
import plotly.graph_objs as go
from sklearn.feature_extraction.text import CountVectorizer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 

In [2]:
# Initialise plotly's notebook mode and switch cufflinks to offline plotting
# so the `.iplot` calls below can render inside the notebook.
init_notebook_mode(connected=True)
cf.go_offline()
# NOTE(review): this stores offline=False in the cufflinks config right after
# go_offline() was called — confirm the two settings are meant to differ.
cf.set_config_file(offline=False, world_readable=True)

# Load Dataset

In [3]:
# Read the reviews CSV, using its first column as the DataFrame index.
df = pd.read_csv('clothing_reviews.csv', index_col=0)
# Preview the first five rows.
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [4]:
# The 'Title' column is not used by the analysis, so remove it.
df = df.drop(columns='Title')

In [5]:
# Discard every row that has a missing value in any remaining column.
df = df.dropna(axis=0, how='any')

In [6]:
def preprocess(ReviewText):
    """Strip every character that is not an ASCII letter or a space.

    Parameters
    ----------
    ReviewText : pandas.Series
        Series of review strings.

    Returns
    -------
    pandas.Series
        Series with the same index in which digits, punctuation and any other
        character outside ``a-z``, ``A-Z`` and the space have been removed.
    """
    # regex=True must be explicit: from pandas 2.0 on, Series.str.replace
    # treats the pattern as a literal string by default, which would leave
    # the reviews untouched.
    return ReviewText.str.replace('[^a-zA-Z ]', '', regex=True)

# Clean the review text, then derive one feature column per measurement.
df['Review Text'] = preprocess(df['Review Text'])
# TextBlob sentiment polarity of each cleaned review.
df['Polarity'] = df['Review Text'].apply(lambda review: TextBlob(review).sentiment.polarity)
# Number of characters in each cleaned review.
df['Review Length'] = df['Review Text'].astype(str).map(len)
# Number of whitespace-separated tokens in each cleaned review.
df['Word Count'] = df['Review Text'].astype(str).str.split().map(len)

# Data after Preprocessing

In [36]:
# Preview the first five rows, now including Polarity, Review Length and Word Count.
df.head()

Unnamed: 0,Clothing ID,Age,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Polarity,Review Length,Word Count
0,767,33,Absolutely wonderful silky and sexy and comfo...,4,1,0,Initmates,Intimate,Intimates,0.633333,52,7
1,1080,34,Love this dress its sooo pretty i happened t...,5,1,4,General,Dresses,Dresses,0.31875,288,61
2,1077,60,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,0.076392,486,97
3,1049,50,I love love love this jumpsuit its fun flirty ...,5,1,0,General Petite,Bottoms,Pants,0.5,115,22
4,847,47,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,0.39375,187,36


# Top Five Positive Comments with highest Polarity

In [79]:
# Largest polarity score found among the reviews.
print('Maximum Polarity : ',df['Polarity'].max())

Maximum Polarity :  1.0


In [77]:
# First five reviews (in DataFrame order) whose polarity is exactly 1.0.
positive_text = df.loc[df['Polarity'] == 1.0, 'Review Text'].head()

for review in positive_text:
    # Each review is followed by a separator line.
    print(review, '--------', sep='\n')

This is a perfect jacket over any shirt tee or dress jacket is well made and goes with anything
--------
If there ever was the perfect feminine dress this would be it 
--------
Perfect way to add warmth in transition weather
--------
This is awesome multiseason dress
--------
Perfect cant believe how inexpensive this is for the look it looks just like it does on the model i cant wait to wear it on my vacation
--------


# Five Negative Comments with Polarity of -0.6 or Lower

In [80]:
# Smallest polarity score found among the reviews.
print("Minimum Polarity : ",df['Polarity'].min())

Minimum Polarity :  -0.9750000000000001


In [74]:
# First five reviews (in DataFrame order, not sorted by score) whose polarity
# is at or below -0.6.
negative_text = df.loc[df['Polarity'] <= -0.6, 'Review Text'].head()

for review in negative_text:
    # Each review is followed by a separator line.
    print(review, '--------', sep='\n')

What a disappointment and for the price its outrageous
--------
Awful color horribly wrinkled and just a messso disappointed
--------
Cut out design no seems or hems very disappointed in retailer
--------
I would have bought this had it not been for the way the pockets are sewn in they are attached on the inside by a string if the string broke the pockets would dangle for the price you would think they would have sewn pockets in to the vest too bad
--------
Received this product with a gaping hole in it very disappointed in the quality and the quality control at the warehouse
--------


# Percentage of Positive, Negative and Neutral Reviews

In [102]:
# Percentage of reviews in each polarity band, rounded to two decimals.
total_reviews = df['Polarity'].shape[0]
for label, in_band in (
    ('Neutral', df['Polarity'] == 0.0),
    ('Negative', df['Polarity'] < 0.0),
    ('Positive', df['Polarity'] > 0.0),
):
    print(round(df[in_band].shape[0] / total_reviews * 100, 2), ' Percent Reviews are ' + label)


0.44  Percent Reviews are Neutral
5.92  Percent Reviews are Negative
93.64  Percent Reviews are Positive


# Polarity Distribution

In [7]:
# Histogram of the polarity scores of all reviews, in 50 bins.
df['Polarity'].iplot(
    kind='hist', bins=50,
    title='Sentiment Polarity Distribution',
    xTitle='polarity', yTitle='count',
    linecolor='black',
)

Clearly the data is very unbalanced in terms of polarity: about 94% of reviews are positive or neutral and only about 6% are negative, as the percentage calculation in the previous step shows.
The polarity distribution graph tells the same story: most reviews are neutral or positive, with the bulk of the distribution lying between 0 and 0.4, and there are very few negative reviews.

So there is a high possibility that the Ratings distribution will be skewed towards 4 and 5. To check this, let's plot the distribution of Ratings.

# Distribution of Ratings

In [18]:
# Histogram of the star ratings, in 5 bins.
df['Rating'].iplot(
    kind='hist', bins=5,
    title='Distribution of Ratings',
    xTitle='Ratings', yTitle='count',
    colors='#0a7f87', linecolor='black',
)

# Age Distribution of Reviewers

In [9]:
# Histogram of reviewer ages, in 50 bins.
df['Age'].iplot(
    kind='hist',
    bins=50,colors = 'blue',
    xTitle='Age',
    linecolor='black',
    yTitle='count',
    title='Reviewers Age Distribution')

Most reviewers are between 30 and 45 years old, with the peak at ages 38-39. There are also many reviewers older than 80, including 2 whose age is 99. These may be outliers, but we have no evidence either way, since people of every age group now have internet access and are free to leave feedback.

# Length of Reviews

In [13]:
# Histogram of review lengths (character counts), in 100 bins.
df['Review Length'].iplot(
    kind='hist', bins=100,
    title='Distribution of Review Length',
    xTitle='Review Length', yTitle='count',
    colors='#06bf38', linecolor='black',
)

Many people write long reviews: more than 5000 reviews are longer than 450 characters. There are also reviews shorter than 50 characters. (Review Length counts characters, not words.)

# Distribution of Word Counts

In [14]:
# Histogram of per-review word counts, in 100 bins.
df['Word Count'].iplot(
    kind='hist', bins=100,
    title='Distribution of Word Count',
    xTitle='Word Count', yTitle='count',
    colors='#bf0673', linecolor='black',
)

# Count of Each Division

In [16]:
# Bar chart of the number of rows (non-null 'Clothing ID' values) per division.
df.groupby('Division Name')['Clothing ID'].count().iplot(
    kind='bar',
    title='Bar Chart For Division Name',
    xTitle='Division Name', yTitle='count',
    colors='blue', linecolor='black',
)

There are 3 divisions of clothes. The count for the *'General'* division is more than 13000, for *'General Petite'* it is almost 8000, and the *'Initmates'* division (spelled this way in the dataset) has the lowest count, about 1400.

# Count for each Department

In [17]:
# Bar chart of the number of rows (non-null 'Clothing ID' values) per
# department, largest first.
df.groupby('Department Name')['Clothing ID'].count().sort_values(ascending=False).iplot(
    kind='bar',
    title='Bar Chart For Department Name',
    xTitle='Department Name', yTitle='count',
    colors='#82da72', linecolor='black',
)

Here the clothes are divided into 6 departments. The Tops department has the most reviews (about 10000) and the Trend department has the fewest (only 118). So most reviews are for top wear such as T-shirts, shirts, jackets, sweaters and girls' items. This suggests that most people buy top wear, followed by dresses and bottom wear.

# Count of each Class

In [19]:
# Bar chart of the number of rows (non-null 'Clothing ID' values) per class,
# largest first.
df.groupby('Class Name')['Clothing ID'].count().sort_values(ascending=False).iplot(
    kind='bar',
    title='Bar Chart For Classes',
    xTitle='Class Name', yTitle='count',
    linecolor='black',
)

# Unigrams Before Removing Stop Words

In [108]:
def top_words_no_stop(data, n):
    """Return the ``n`` most frequent terms in ``data`` with their total counts.

    No stop-word list is passed to the vectorizer, so common words are kept.

    Parameters
    ----------
    data : iterable of str
        Documents to count terms in.
    n : int
        Number of leading (term, count) pairs to return.

    Returns
    -------
    list of tuple
        ``(term, count)`` pairs sorted by descending count.
    """
    vectorizer = CountVectorizer()
    vectorizer.fit(data)
    # Column-wise sum of the document-term matrix gives corpus-wide counts.
    totals = vectorizer.transform(data).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Twenty most frequent unigrams (stop words kept), plotted largest first.
top = top_words_no_stop(df['Review Text'], 20)

df2 = pd.DataFrame(top, columns=['Review Text', 'count'])

df2.groupby('Review Text')['count'].sum().sort_values(ascending=False).iplot(
    kind='bar',
    title='Top Unigrams in review before removing stop words',
    xTitle='Unigrams', yTitle='Count',
    colors='#30870a', linecolor='black',
)

Unigrams are single words. This chart shows the counts of the 20 most frequent single words in the reviews. Most of them are stop words, which are not relevant for deciding the polarity of a text. We need to remove them so that we get the significant words that actually contribute to the polarity of the text.

# Unigrams After Removing Stop Words

In [107]:
def top_words_with_stop(data, n):
    """Return the ``n`` most frequent terms in ``data`` after stop-word removal.

    The vectorizer is built with ``stop_words='english'``, so words on that
    list are excluded from the counts.

    Parameters
    ----------
    data : iterable of str
        Documents to count terms in.
    n : int
        Number of leading (term, count) pairs to return.

    Returns
    -------
    list of tuple
        ``(term, count)`` pairs sorted by descending count.
    """
    vectorizer = CountVectorizer(stop_words='english')
    vectorizer.fit(data)
    # Column-wise sum of the document-term matrix gives corpus-wide counts.
    totals = vectorizer.transform(data).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Twenty most frequent unigrams after stop-word removal, plotted largest first.
top = top_words_with_stop(df['Review Text'], 20)

df2 = pd.DataFrame(top, columns=['Review Text', 'count'])

df2.groupby('Review Text')['count'].sum().sort_values(ascending=False).iplot(
    kind='bar',
    title='Top Unigrams in review after removing stop words',
    xTitle='Unigrams', yTitle='Count',
    colors='#30f40d', linecolor='black',
)

After Removing Stop Words we get the actual significant words from reviews. As evident from graph there are no stop words present in the list of words with maximum counts.

# Bigrams Before Removing Stop Words

In [106]:
def bigram_top_words_no_stop(data, n):
    """Return the ``n`` most frequent bigrams in ``data`` with their total counts.

    The vectorizer uses ``ngram_range=(2, 2)`` and no stop-word list, so
    bigrams containing common words are kept.

    Parameters
    ----------
    data : iterable of str
        Documents to count bigrams in.
    n : int
        Number of leading (bigram, count) pairs to return.

    Returns
    -------
    list of tuple
        ``(bigram, count)`` pairs sorted by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    vectorizer.fit(data)
    # Column-wise sum of the document-term matrix gives corpus-wide counts.
    totals = vectorizer.transform(data).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Twenty most frequent bigrams (stop words kept), plotted largest first.
top = bigram_top_words_no_stop(df['Review Text'], 20)

df3 = pd.DataFrame(top, columns=['Review Text', 'count'])

df3.groupby('Review Text')['count'].sum().sort_values(ascending=False).iplot(
    kind='bar',
    title='Top Bigrams in review before removing stop words',
    xTitle='Bigrams', yTitle='Count',
    linecolor='black',
)

Bigrams are pairs of words that appear next to each other. Here the stop words have not been removed, so most of the bigrams contain stop words. As we did for unigrams, we will remove the stop words to get the significant bigrams and plot them.

# Bigrams After Removing Stop Words

In [105]:
def bigram_top_words_with_stop(data, n):
    """Return the ``n`` most frequent bigrams in ``data`` after stop-word removal.

    The vectorizer uses ``ngram_range=(2, 2)`` together with
    ``stop_words='english'``.

    Parameters
    ----------
    data : iterable of str
        Documents to count bigrams in.
    n : int
        Number of leading (bigram, count) pairs to return.

    Returns
    -------
    list of tuple
        ``(bigram, count)`` pairs sorted by descending count.
    """
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(2, 2))
    vectorizer.fit(data)
    # Column-wise sum of the document-term matrix gives corpus-wide counts.
    totals = vectorizer.transform(data).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Twenty most frequent bigrams after stop-word removal, plotted largest first.
top = bigram_top_words_with_stop(df['Review Text'], 20)

df4 = pd.DataFrame(top, columns=['Review Text', 'count'])

df4.groupby('Review Text')['count'].sum().sort_values(ascending=False).iplot(
    kind='bar',
    title='Top Bigrams in review after removing stop words',
    xTitle='Bigrams', yTitle='Count',
    colors='blue', linecolor='black',
)

# Sentiment Polarity for Department Name

In [112]:
# One polarity box plot per department, drawn side by side for comparison.
# The department list fixes both which departments are shown and their order.
departments = ['Tops', 'Dresses', 'Bottoms', 'Intimate', 'Jackets', 'Trend']

# Build every trace from the same template instead of six copy-pasted blocks;
# .loc with a boolean mask selects the column without chained indexing.
data = [
    go.Box(
        y=df.loc[df['Department Name'] == department, 'Polarity'],
        name=department,
    )
    for department in departments
]
layout = go.Layout(
    title = "Sentiment Polarity for Department Name"
)

fig = go.Figure(data=data,layout=layout)
iplot(fig, filename = "Sentiment Polarity for Department Name")

These box plots show the polarity of the reviews for each department, i.e. how positive or negative the reviews for each department are.
1. For the Tops department most of the reviews are positive and only a few are negative. Compared with the others, however, Tops has the largest number of negative reviews, and the most negative comment is also for Tops.
2. The Dresses department has almost the same distribution as Tops.
3. The Bottoms department has fewer negative reviews than Tops and Dresses, but its positive range is almost the same.
4. The Intimate department has almost the same distribution as Bottoms.
5. Jackets reviews are even less negative than Bottoms, but their positive range is also narrower.
6. Trend, the least-reviewed department, has only a couple of negative reviews, and its positive range is the narrowest of all.

So from this distribution also we can conclude that most of the reviews are positive and only a few are negative (As concluded in earlier distributions).

# Rating for Department Name

In [30]:
# One rating box plot per department, drawn side by side for comparison.
# The department list fixes both which departments are shown and their order.
departments = ['Tops', 'Dresses', 'Bottoms', 'Intimate', 'Jackets', 'Trend']

# Build every trace from the same template instead of six copy-pasted blocks;
# .loc with a boolean mask selects the column without chained indexing.
data = [
    go.Box(
        y=df.loc[df['Department Name'] == department, 'Rating'],
        name=department,
    )
    for department in departments
]
layout = go.Layout(
    title = "Rating by Department Name"
)

fig = go.Figure(data=data,layout=layout)
iplot(fig, filename = "Rating by Department Name")

This is the distribution of Ratings by department. Five departments - Tops, Dresses, Bottoms, Intimate and Jackets - have exactly the same distribution, with most ratings between 3 and 5. That is, products in these departments satisfied people. Most of the ratings are either 4 or 5.

The Trend department, however, tells a different story. Many reviews in this department are rated 1 or 2, and 25 percent of reviews are rated 3. This makes it the worst-rated department.

# Polarity of Reviews based on Recommendations

In [32]:
# Polarity scores split by the recommendation flag:
# 1 is plotted as 'Recommended', 0 as 'Not recommended'.
x1 = df['Polarity'][df['Recommended IND'] == 1]
x0 = df['Polarity'][df['Recommended IND'] == 0]

trace1 = go.Histogram(name='Not recommended', x=x0, opacity=0.85)
trace2 = go.Histogram(name='Recommended', x=x1, opacity=0.85)

data = [trace1, trace2]
# Overlay the two histograms; the colorway entries are listed in trace order.
layout = go.Layout(
    title='Distribution of Sentiment polarity of reviews based on Recommendation',
    barmode='overlay',
    colorway=['#f44c0d', 'green'],
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='histogram')

This is the polarity distribution split by recommendation, i.e. by whether the reviewer recommended the product to others. Some products with positive reviews are not recommended, and many products with negative reviews are recommended. The approximate polarity ranges are:

Range of Recommended       :   -0.2 to 1.0

Range of Not Recommended   :   -0.6 to 0.6 

# Rating of Reviews based on Recommendations

In [35]:
# Ratings split by the recommendation flag:
# 1 is plotted as 'Recommended', 0 as 'Not recommended'.
x1 = df.loc[df['Recommended IND'] == 1, 'Rating']
x0 = df.loc[df['Recommended IND'] == 0, 'Rating']

trace1 = go.Histogram(
    x=x0, name='Not recommended',
    opacity=0.75,nbinsx=7
)
trace2 = go.Histogram(
    x=x1, name = 'Recommended',nbinsx=7,
    opacity=0.75
)

data = [trace1, trace2]
# The title names Ratings: this figure plots the 'Rating' column, not polarity.
layout = go.Layout(barmode='overlay', colorway=['coral','blue'],
                   title='Distribution of Ratings of reviews based on Recommendation')

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='overlaid histogram')

# Conclusion

1. 94% of Reviews are either positive or neutral
2. 6% of Reviews are negative
3. Supporting above points, most of the product ratings are either 4 or 5
4. Clothes are divided into 6 departments, with Tops having the maximum number of products and Trend the minimum.
5. Most of the people bought Top wear like T-shirts, shirts, jackets, sweaters etc
6. Products in Trend department are the worst rated.
7. Even products with positive reviews were not recommended by people and many products with negative reviews were recommended.
8. All the 5 rated products were recommended. Many 1 and 2 rated products were also recommended by people.