# Fundamentals of Machine Learning
## Kavindu Jointe | IMDD-D02
Master Data-Driven Design

University of Applied Sciences Utrecht

Week 6: Text Mining 

In [2]:
# importing all the relevant libraries
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection as ms
import sklearn.feature_extraction.text as text
import sklearn.naive_bayes as nb
import matplotlib.pyplot as plt
%matplotlib inline

In [31]:
# Read the women's clothing e-commerce reviews CSV into a DataFrame and preview the first rows
reviews_csv = 'Womens_Clothing_E-Commerce_Reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [32]:
# Number of (rows, columns) in the raw data set
df.shape

(23486, 11)

In [33]:
# Only two columns are needed for this assignment: 'Review Text' (the input for
# the sentiment analysis) and 'Rating' (used to label a review as positive or negative).
# .copy() makes impDf an independent frame, so columns added to it later are
# written to impDf itself rather than to a selection taken from df.
impDf = df[['Review Text', 'Rating']].copy()

In [34]:
# Remove every row that has a missing value in any of its columns
impDf = impDf.dropna(axis=0, how='any')

In [36]:
# Checking how many rows are left after dropping all the null values
impDf.shape

(22641, 2)

In [37]:
# Add a 'sentiment' column, initialised to 0, that will hold the binary label used for learning
impDf = impDf.assign(sentiment=0)

In [38]:
# Label each review: 1 (good) when Rating is 4 or higher, 0 (bad or neutral) otherwise.
# A single vectorized comparison replaces the row-by-row iterrows()/.loc loop,
# which was slow on the ~22k remaining rows; the resulting labels are the same.
impDf['sentiment'] = (impDf['Rating'] >= 4).astype(int)

In [39]:
# Preview the first rows to check that the sentiment labels were assigned correctly
impDf.head()

Unnamed: 0,Review Text,Rating,sentiment
0,Absolutely wonderful - silky and sexy and comf...,4,1
1,Love this dress! it's sooo pretty. i happene...,5,1
2,I had such high hopes for this dress and reall...,3,0
3,"I love, love, love this jumpsuit. it's fun, fl...",5,1
4,This shirt is very flattering to all due to th...,5,1


In [40]:
# Target variable: the binary sentiment label (0 = bad/neutral, 1 = good)
y = impDf['sentiment']

In [41]:
# Build the feature matrix X: each review text becomes a row of TF-IDF weights,
# one column per vocabulary term.
# NOTE(review): the vectorizer is fitted on all reviews before the train/test
# split below, so the test reviews also contribute to the vocabulary and IDF weights.
tf = text.TfidfVectorizer()
review_texts = impDf['Review Text']
X = tf.fit_transform(review_texts)
print(X.shape)


(22641, 14145)


In [42]:
# Hold out 20% of the reviews for testing.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same scores); stratify=y keeps the proportion of
# each sentiment label equal in the training and test parts.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y, test_size=.2, random_state=42, stratify=y)

In [50]:
# Bernoulli naive Bayes from sklearn: grid-search the smoothing parameter alpha
# over 50 log-spaced values between 1e-2 and 1e2, then fit on the training data.
alpha_grid_bernoulli = {'alpha': np.logspace(-2., 2., 50)}
cloth1 = ms.GridSearchCV(nb.BernoulliNB(), param_grid=alpha_grid_bernoulli)
cloth1.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                                   fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([1.00000000e-02, 1.20679264e-02, 1.45634848e-02, 1.75751062e-02,
       2.12095089e-02, 2.55954792e-02, 3.08884360e-02, 3.72759372e-02,
       4.49843267e-02, 5.42867544e-02, 6.55128557e-02, 7.90604321e-02...
       4.09491506e+00, 4.94171336e+00, 5.96362332e+00, 7.19685673e+00,
       8.68511374e+00, 1.04811313e+01, 1.26485522e+01, 1.52641797e+01,
       1.84206997e+01, 2.22299648e+01, 2.68269580e+01, 3.23745754e+01,
       3.90693994e+01, 4.71486636e+01, 5.68986603e+01, 6.86648845e+01,
       8.28642773e+01, 1.00000000e+02])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [51]:
# Multinomial naive Bayes from sklearn: grid-search the smoothing parameter alpha
# over 50 log-spaced values between 1e-2 and 1e2, then fit on the training data.
alpha_grid_multinomial = {'alpha': np.logspace(-2., 2., 50)}
cloth = ms.GridSearchCV(nb.MultinomialNB(), param_grid=alpha_grid_multinomial)
cloth.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([1.00000000e-02, 1.20679264e-02, 1.45634848e-02, 1.75751062e-02,
       2.12095089e-02, 2.55954792e-02, 3.08884360e-02, 3.72759372e-02,
       4.49843267e-02, 5.42867544e-02, 6.55128557e-02, 7.90604321e-02,
       9.54095476...
       4.09491506e+00, 4.94171336e+00, 5.96362332e+00, 7.19685673e+00,
       8.68511374e+00, 1.04811313e+01, 1.26485522e+01, 1.52641797e+01,
       1.84206997e+01, 2.22299648e+01, 2.68269580e+01, 3.23745754e+01,
       3.90693994e+01, 4.71486636e+01, 5.68986603e+01, 6.86648845e+01,
       8.28642773e+01, 1.00000000e+02])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [52]:
# Score of the tuned multinomial naive Bayes model on the held-out test set
cloth.score(X_test, y_test)


0.8542724663281077

In [53]:
# Score of the tuned Bernoulli naive Bayes model on the held-out test set
cloth1.score(X_test, y_test)

0.8564804592625304

What we observe is that both models produced almost the same score: each gave the right prediction about 85% of the time.

In [54]:
# Vocabulary terms, in the same order as the columns of X.
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(), so use the new method when it exists and fall back
# to the old one otherwise.
get_names = getattr(tf, 'get_feature_names_out', None) or tf.get_feature_names
names = np.asarray(get_names())
# Next, we display the 50 words with the largest log-probability under the
# positive class (label 1). feature_log_prob_[1] holds the same values that
# coef_[0, :] exposed for a two-class MultinomialNB; coef_ itself is no longer
# available in newer scikit-learn releases.
positive_log_prob = cloth.best_estimator_.feature_log_prob_[1]
print(','.join(names[np.argsort(positive_log_prob)[::-1][:50]]))

the,it,and,is,this,to,in,with,for,dress,love,but,on,so,of,great,my,size,very,top,that,was,wear,are,fit,not,or,have,be,as,they,perfect,am,like,you,up,comfortable,color,too,can,me,just,small,soft,flattering,little,these,fits,well,will


In [55]:
# Classify a single hand-written sentence with the tuned multinomial model
sample_review = ["The dress look beautiful"]
sample_features = tf.transform(sample_review)
print(cloth.predict(sample_features))

[1]
