### import data

#### save as dataframe called raw

In [27]:
import pandas as pd
from matplotlib import pyplot as plt
pd.options.mode.chained_assignment = None
from IPython.display import HTML
import numpy as np
import seaborn as sns
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from nltk.stem.snowball import SnowballStemmer
plt.style.use('fivethirtyeight')
%matplotlib inline
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 14

In [115]:
raw = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
 
 
raw.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


### Data Cleaning

#### Drop and rename variables

In [116]:
raw = raw.drop([
'Unnamed: 0'], axis=1)

In [117]:
raw.rename(columns={
    'Clothing ID': 'ID',
    'Age': 'AGE',
    'Title': 'TITLE',
    'Review Text': 'REVIEW',
    'Rating': 'RATING',
    'Recommended IND': 'RECOMMENDED',
    'Positive Feedback Count': 'POS_FEEDBACK',
    'Division Name': 'DIVISION',
    'Department Name': 'DEPARTMENT',
    'Class Name': 'CLASS',}, inplace=True)

#### identify missing values - if REVIEW is blank then delete observation, otherwise fill/ignore

In [118]:
print(raw.isnull().sum())

ID                 0
AGE                0
TITLE           3810
REVIEW           845
RATING             0
RECOMMENDED        0
POS_FEEDBACK       0
DIVISION          14
DEPARTMENT        14
CLASS             14
dtype: int64


In [119]:
raw.CLASS.fillna(value='Unknown', inplace=True)
raw.DIVISION.fillna(value='Unknown', inplace=True)
raw.DEPARTMENT.fillna(value='Unknown', inplace=True)
raw.TITLE.fillna(value='Blank', inplace=True)

raw = raw.dropna()

In [120]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22641 entries, 0 to 23485
Data columns (total 10 columns):
ID              22641 non-null int64
AGE             22641 non-null int64
TITLE           22641 non-null object
REVIEW          22641 non-null object
RATING          22641 non-null int64
RECOMMENDED     22641 non-null int64
POS_FEEDBACK    22641 non-null int64
DIVISION        22641 non-null object
DEPARTMENT      22641 non-null object
CLASS           22641 non-null object
dtypes: int64(5), object(5)
memory usage: 1.9+ MB


#### creating Age Bucket

In [121]:
custom_bucket_array = np.linspace(10, 90, 10, endpoint=False)
custom_bucket_array

array([10., 18., 26., 34., 42., 50., 58., 66., 74., 82.])

In [123]:
raw['AGE_BUCKET'] = pd.cut(raw['AGE'], custom_bucket_array)
raw.head()


Unnamed: 0,ID,AGE,TITLE,REVIEW,RATING,RECOMMENDED,POS_FEEDBACK,DIVISION,DEPARTMENT,CLASS,AGE_BUCKET
0,767,33,Blank,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,"(26.0, 34.0]"
1,1080,34,Blank,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,"(26.0, 34.0]"
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,"(58.0, 66.0]"
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"(42.0, 50.0]"
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,"(42.0, 50.0]"


#### use code below to filter values as needed (class = dresses, age bracket, etc)

In [33]:
#raw = raw[(raw.CLASS == 'Dresses')
#            |(raw.RATING == 1)
#            |(raw.POS_FEEDBACK == 0)
#           ]  

### create word frequencies table
#### exclude stop words, ngram opens up to multi-word phrases (min, max), max features tells how many top words/phrases we want to see, lowercase

In [127]:
vect = CountVectorizer(stop_words='english'               
        , max_features = 20
        ,ngram_range=(1, 5)
        , lowercase =False
                      )
 
raw_review = vect.fit_transform(raw['REVIEW'].values.astype('U'))
 
 
#print(vect.get_feature_names()[-50:])
#vect.vocabulary_

In [125]:
review_words = pd.DataFrame(raw_review.toarray(), columns=vect.get_feature_names())
 
review_words.head(20)

Unnamed: 0,color,comfortable,cute,dress,fabric,fit,flattering,great,just,like,little,look,love,ordered,perfect,really,size,small,soft,wear
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,1,0,1,0,2,1,0,0,0,0,0,0
2,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,1,1,3,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1
5,1,0,0,3,0,0,0,0,2,0,0,0,2,0,0,0,0,1,0,1
6,2,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,1,0,0,0
7,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0
8,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,0
9,0,0,0,1,1,1,0,0,0,0,0,0,2,1,0,0,0,0,0,1


In [126]:
review_words.sum()

color           4605
comfortable     3060
cute            3042
dress          10567
fabric          4798
fit             7325
flattering      3519
great           6117
just            5608
like            7149
little          3775
look            4039
love            8951
ordered         3850
perfect         3774
really          3925
size            8772
small           4729
soft            3343
wear            6439
dtype: int64

### next steps (ran out of time)
#### install textblob for stemming, sentiment analysis, classification
#### TF-IDF (term frequency-inverse document frequency) to find meaningful words
#### naive bayes?