In [14]:
import itertools
## Regular Expression
import re

## Arrays
import numpy as np

## DataFrames
import pandas as pd

## Visualizations
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import matplotlib.colors as colors
%matplotlib inline

## Modeling
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from gensim.models import Word2Vec
from tqdm import tqdm

import contractions
import spacy
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

## Warnings
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

In [16]:
df = pd.DataFrame()
df = pd.read_csv('data_capstone_2/nlp_reviews_cleaned.csv', delimiter=',', encoding='utf-8')

df['rating'] = df['rating'].apply(lambda x: 0 if x <= 2 else 1)
print(df.rating.value_counts())

df = df.drop(['Unnamed: 0', 'customer', 'product', 'time', 'pos_feedback',
       'neg_feedback'], axis=1)

1    12080
0     1192
Name: rating, dtype: int64


In [17]:
df.review_text[150]

"Works, but I will never use it again I managed to get this stuff wrapped around all my fruit trees last year (quite a job - it snagged on all the branches and even my buttons, but with the help of another person and two long poles, we did it.)  I guess it worked well enough but I will never use it again since it doesn't just keep birds away, it traps them.  I managed to spot and free a bluebird and a cardinal that had become entangled, before they died of thirst or were found by cats.  But then one day I found a bluebird all dried up hanging from the netting - poor thing, what a horrible way to die.  :(  I want to protect my fruit, but I don't want to kill songbirds, certainly not slowly over a period of days.  So I am going to try some of those products that are supposed to merely scare them away, not kill them."

In [18]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

corpus = list()
lines = df['review_text'].values.tolist()

for line in lines:   
    tokens = word_tokenize(line)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word    
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words    
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    corpus.append(words)

In [19]:
lines

['Great Hoses Good USA company that stands behind their products. I have had to warranty two hoses and they send replacements right out to you. I had one burst after awhile, you could see it buldge for weeks before it went so no suprises. The other one was winter related as I am bad and leave them out most of the time. Highly reccomend. Note the hundred footer is heavy and like wresting an anaconda when its time to put away, but it does have a far reach.',
 'Gilmour 10-58050 8-ply Flexogen Hose 5/8-Inch by 50-Foot, Green This is a high quality 8 ply hose. I have had good luck with Gilmour hoses in the past. A good choice in hoses.',
 "Very satisfied! It's probably one of the best hoses I've ever had.Pro's:- It's good enough for most front yards, or small back yards.- It has enough flow (and with the right head) to water plants 25ft away from the nozzle, and water your garden.- It's light weight, and flexible (it does not pose as much of a problem unrolling this hose with kinks as with 

In [20]:
df_train = df[:10000]
df_test = df[10000:]

In [22]:
tmp_corpus = df_train['review_text'].map(lambda x: x.split('.'))

# corpus [[w1,w2,w3..],[..]]
corpus = []
for i in tqdm(range(len(tmp_corpus))):
    for line in tmp_corpus[i]:
        words = [x for x in line.split()]
        corpus.append(words)

100%|██████████| 10000/10000 [00:01<00:00, 6815.86it/s]


In [23]:
num_of_sentences = len(corpus)
num_of_words = 0
for line in corpus:
    num_of_words += len(line)

print('Num of sentences - %s'%(num_of_sentences))
print('Num of words - %s'%(num_of_words))

Num of sentences - 106846
Num of words - 1555881


In [8]:
# sg - skip gram |  window = size of the window | size = vector dimension
size = 300
window_size = 2 # sentences weren't too long, so
epochs = 100
min_count = 1
workers = 4

# train word2vec model using gensim
model = Word2Vec(corpus, sg=1,window=window_size,size=size,
                 min_count=min_count,workers=workers,iter=epochs,sample=0.01)

In [9]:
model.most_similar('hose')

[('shutofforgilmour', 0.48590296506881714),
 ('shutoffjust', 0.46847108006477356),
 ('shutoffor', 0.46435868740081787),
 ('rem', 0.460856556892395),
 ('puwebhamz', 0.45755651593208313),
 ('inchbyfoot', 0.4468499422073364),
 ('amelnor', 0.44322195649147034),
 ('apex', 0.4362832009792328),
 ('uncurl', 0.43391701579093933),
 ('handgilmour', 0.4314667880535126)]

In [10]:
model.wv.syn0.shape

(25974, 300)

In [11]:
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    
    return feature_vector


def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary,
                                     num_features) for tokenized_sentence in corpus]
    return np.array(features)

In [12]:
train_corpus = df_train['clean_text'].apply(word_tokenize)

In [17]:
train_array = averaged_word_vectorizer(corpus=train_corpus, model=model,
                                             num_features=300)

In [18]:
test_corpus = df_test['clean_text'].apply(word_tokenize)

test_array = averaged_word_vectorizer(corpus=test_corpus, model=model,
                                             num_features=300)

In [19]:
y_test = df_test['rating_class']
y_train = df_train['rating_class']
clf = LogisticRegression(random_state=1)
clf.fit(train_array, y_train)
y_pred_clf = clf.predict(test_array)
cm = confusion_matrix(y_test, y_pred_clf)

print('1. Accuarcy: {}\n'.format(metrics.accuracy_score(y_test, y_pred_clf)))
print('2. The F-1 score of the model {}\n'.format(f1_score(y_test, y_pred_clf, average='weighted')))
print('3. The recall score of the model {}\n'.format(recall_score(y_test, y_pred_clf, average='weighted')))
print('4. Classification Report:\n{}\n5. Confusion matrix:\n{}\n\n\n'.format(
        classification_report(y_test, y_pred_clf), cm))

1. Accuarcy: 0.9150366748166259

2. The F-1 score of the model 0.8818906323979591

3. The recall score of the model 0.9150366748166259

4. Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.06      0.11       286
           1       0.92      1.00      0.96      2986

   micro avg       0.92      0.92      0.92      3272
   macro avg       0.78      0.53      0.54      3272
weighted avg       0.89      0.92      0.88      3272

5. Confusion matrix:
[[  18  268]
 [  10 2976]]



