### import data

#### save as dataframe called raw

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
pd.options.mode.chained_assignment = None
from IPython.display import HTML
import numpy as np
import seaborn as sns
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from nltk.stem.snowball import SnowballStemmer
plt.style.use('fivethirtyeight')
%matplotlib inline
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 14

In [2]:
raw = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
 
 
raw.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


### Data Cleaning

#### Drop and rename variables

In [3]:
raw = raw.drop([
'Unnamed: 0'], axis=1)

In [4]:
raw.rename(columns={
    'Clothing ID': 'ID',
    'Age': 'AGE',
    'Title': 'TITLE',
    'Review Text': 'REVIEW',
    'Rating': 'RATING',
    'Recommended IND': 'RECOMMENDED',
    'Positive Feedback Count': 'POS_FEEDBACK',
    'Division Name': 'DIVISION',
    'Department Name': 'DEPARTMENT',
    'Class Name': 'CLASS',}, inplace=True)

#### identify missing values - if REVIEW is blank then delete observation, otherwise fill/ignore

In [5]:
print(raw.isnull().sum())

ID                 0
AGE                0
TITLE           3810
REVIEW           845
RATING             0
RECOMMENDED        0
POS_FEEDBACK       0
DIVISION          14
DEPARTMENT        14
CLASS             14
dtype: int64


In [6]:
raw.CLASS.fillna(value='Unknown', inplace=True)
raw.DIVISION.fillna(value='Unknown', inplace=True)
raw.DEPARTMENT.fillna(value='Unknown', inplace=True)
raw.TITLE.fillna(value='Blank', inplace=True)

raw = raw.dropna()

In [7]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22641 entries, 0 to 23485
Data columns (total 10 columns):
ID              22641 non-null int64
AGE             22641 non-null int64
TITLE           22641 non-null object
REVIEW          22641 non-null object
RATING          22641 non-null int64
RECOMMENDED     22641 non-null int64
POS_FEEDBACK    22641 non-null int64
DIVISION        22641 non-null object
DEPARTMENT      22641 non-null object
CLASS           22641 non-null object
dtypes: int64(5), object(5)
memory usage: 1.9+ MB


#### creating Age Bucket

In [8]:
custom_bucket_array = np.linspace(10, 90, 10, endpoint=False)
custom_bucket_array

array([10., 18., 26., 34., 42., 50., 58., 66., 74., 82.])

In [9]:
raw['AGE_BUCKET'] = pd.cut(raw['AGE'], custom_bucket_array)
raw.head()


Unnamed: 0,ID,AGE,TITLE,REVIEW,RATING,RECOMMENDED,POS_FEEDBACK,DIVISION,DEPARTMENT,CLASS,AGE_BUCKET
0,767,33,Blank,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,"(26.0, 34.0]"
1,1080,34,Blank,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,"(26.0, 34.0]"
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,"(58.0, 66.0]"
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"(42.0, 50.0]"
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,"(42.0, 50.0]"


#### use code below to filter values as needed (class = dresses, age bracket, etc)

In [10]:
#raw = raw[(raw.RATING == 1)
#            |(raw.RATING == 1)
#            |(raw.POS_FEEDBACK == 0)
#           ]  

### create word frequencies table
#### exclude stop words, ngram opens up to multi-word phrases (min, max), max features tells how many top words/phrases we want to see, lowercase

In [10]:
vect = CountVectorizer(stop_words='english'
        ,max_features = 10
        ,ngram_range=(1, 5)
        , lowercase =False
                      )
 
raw_review = vect.fit_transform(raw['REVIEW'].values.astype('U'))
 
 
#print(vect.get_feature_names()[-50:])
#vect.vocabulary_

#stop_words=['dress', 'really'] replace once we have list

In [11]:
review_words = pd.DataFrame(raw_review.toarray(), columns=vect.get_feature_names())
 
review_words.head()

Unnamed: 0,dress,fabric,fit,great,just,like,love,size,small,wear
0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,1,0,0,0
2,1,0,1,0,1,0,0,1,3,0
3,0,0,0,1,0,0,3,0,0,1
4,0,0,0,0,0,0,1,0,0,1


In [12]:
review_words.sum()

dress     10533
fabric     4787
fit        7304
great      5791
just       5464
like       7077
love       7867
size       8763
small      4729
wear       6438
dtype: int64

### next steps...
#### install textblob for stemming, sentiment analysis, classification
#### TF-IDF (term frequency-inverse document frequency) to find meaningful words
#### install Textatistic for Flesch Kincaid readability
#### customize stopwords - really, dress, etc provide 0 value



### LANEY ADDING STUFF HERE
#### stemming

In [13]:

import nltk
import string
import re

porter = nltk.stem.porter.PorterStemmer()
#raw_wrds =  pd.Series(raw_wrds).values
#raw_wrds = np.array(raw_wrds).tolist()
words= raw['REVIEW']
wd = words.values
wd = str(words.values)
wds = wd.split()

stem_words = [[porter.stem(word) for word in sentence.split(" ")] for sentence in wds]

#print(stem_words)

#### Takes in a string of text, then performs the following: 1. Remove all punctuation, 2. Remove all stopwords, 3. Return the cleaned text as a list of words


In [15]:
words = raw['REVIEW']

import string
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

def text_process(text):
    nopunc = [char for char in text if char not in string.punctuation]

    nopunc = ''.join(nopunc)

    #return [word for word in nopunc.split()]

    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

sample_text = "Hey there! This is a sample review, which happens to contain punctuations."

new = text_process(raw['REVIEW'])

#new

In [None]:
new[0:10]

#### TF-IDF

In [16]:

 #  Convert term vectors into gensim dictionary

#pip install gensim

import gensim
dict = gensim.corpora.Dictionary( stem_words)

corp = [ ]

for i in range( 0, len( stem_words ) ):

    corp.append( dict.doc2bow( stem_words[ i ] ) )
#  Create TFIDF vectors based on term vectors bag-of-word corpora

tfidf_model = gensim.models.TfidfModel( corp )

tfidf = [ ]

for i in range( 0, len( corp ) ):

    tfidf.append( tfidf_model[ corp[ i ] ] )

#  Create pairwise document similarity index
n = len( dict )

index = gensim.similarities.SparseMatrixSimilarity( tfidf_model[ corp ], num_features = n )


In [17]:

#  Print TFIDF vectors and pairwise similarity per document

for i in range( 0, len( tfidf ) ):

    s = 'Doc ' + str( i + 1 ) + ' TFIDF:'
    
    for j in range( 0, len( tfidf[ i ] ) ):

        s = s + ' (' + dict.get( tfidf[ i ][ j ][ 0 ] ) + ','

        s = s + ( '%.3f' % tfidf[ i ][ j ][ 1 ] ) + ')'

    print(s)



for i in range( 0, len( corp ) ):

    print( 'Doc', ( i + 1 ), 'sim: [ '),


    sim = index[ tfidf_model[ corp[ i ] ] ]

    for j in range( 0, len( sim ) ):

        print( '%.3f ' % sim[ j ]),

    print (']')

### Molly Attemptiong Textatistic Readability

##### something we can do with this - are the people leaving good reviews "smart" and people leaving bad reviews "dumb" ? ie - are the bad reviews trustworthy?


In [20]:
#run pip install textatistic in anaconda prompt
from textatistic import Textatistic

In [35]:
sample_text = 'Molly is testing this method out. Is she smart or stupid? Only one way to find out. Throwing in some fancy words to sound like an intellectual: photosynthesis monotheism epiglottis deviation.'  

# Create a Textatistic object
s = Textatistic(sample_text)

s.counts

{'char_count': 161,
 'word_count': 31,
 'sent_count': 4,
 'sybl_count': 45,
 'notdalechall_count': 8,
 'polysyblword_count': 3}

### get my readability score

##### 100.00-90.00 	5th grade 	Very easy to read. Easily understood by an average 11-year-old student.
##### 90.0–80.0 	6th grade 	Easy to read. Conversational English for consumers.
##### 80.0–70.0 	7th grade 	Fairly easy to read.
##### 70.0–60.0 	8th & 9th grade 	Plain English. Easily understood by 13- to 15-year-old students.
##### 60.0–50.0 	10th to 12th grade 	Fairly difficult to read.
##### 50.0–30.0 	College 	Difficult to read.
###### 30.0–0.0 	College graduate 	Very difficult to read. Best understood by university graduates. 


In [36]:
s.flesch_score

76.16229838709678

#### making tiny dataset to practice looping reviews through

In [52]:
raw_small = raw.head(3)

In [55]:
#smart = Textatistic(raw_small['REVIEW'])


for i in raw_small['REVIEW']:
    s = Textatistic(i)
    print(s.flesch_score)


ZeroDivisionError: division by zero