# Text Analysis Exercise 2 -- Larry Larkin

## Women's Clothing Review


In [1]:
#import libraries
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#this is sample data
from nltk.corpus import names  

from string import punctuation

#if the next cell does not work
#remove number symbol on following lines and re-run this cell
#RUN THESE COMMANDS JUST ONCE PER COMPUTER -- NO NEED TO REPEAT FOR EACH SESSION
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('names')
#nltk.download('stopwords')
#nltk.download('vader_lexicon')

%matplotlib inline

In [2]:
# Load csv file data with headers

location = "Datasets/women_clothing_review.csv"
df = pd.read_csv(location)

In [3]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [4]:
df.shape

(23486, 11)

In [5]:
# Check how many "Review Text" cells are missing values
df['Review Text'].isna().sum()

845

In [6]:
# Drop rows where "Review Text" is missing
df_reviews = df[pd.notna(df['Review Text'])]

In [7]:
df_reviews.shape

(22641, 11)

In [8]:
# Ensure all "Review Text" NAs have been removed
df_reviews['Review Text'].isna().sum()

0

In [9]:
df_reviews.head(5)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


### Sentiment Analysis


#### Make a sentiment value column in a dataframe

Add a new column to the dataset that will have a numerical value for the sentiment of each review.

In [12]:
# Initialize function to do sentiment analysis
sid = SentimentIntensityAnalyzer()

In [13]:
# Create a function to clean up each review
# then it will analyze and assign a sentiment polarity

eng_stopwords = stopwords.words('english')

def reviewSentiment(review):
    
    #make text lowercase
    review = review.lower()
    
    #tokenize the review
    tknz_review = word_tokenize(review)
    
    #remove puntuation
    for token in tknz_review:
        if token in punctuation:
            tknz_review.remove(token)
    
    clean_tokens = []
    #remove filler words
    for token in tknz_review:
        if token not in eng_stopwords:
            clean_tokens.append(token)
            
    #put sentence back together with remaining clean words
    clean_review = ' '.join(clean_tokens)
    #clean_review = ' '.join(tknz_review)
    
    #turn into textblob
    sid_rev = sid.polarity_scores(clean_review)
    
    #get sentiment polarity
    r_comp = sid_rev['compound']
    
    return r_comp

In [14]:
#create a new column to hold sentiment value from function
#df_reviews['ReviewSentiment'] = df_reviews['Review Text'].apply(reviewSentiment)
df_reviews['ReviewSentiment'] = df_reviews['Review Text'].apply(reviewSentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
# Verify sentiment values in new column
df_reviews.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,ReviewSentiment
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,0.8991
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,0.971
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,0.9062
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,0.9464
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,0.9117


THE END