In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
import gensim
import matplotlib.pyplot as plt
import re
import pickle 
# Initializing the train model
from gensim.models import word2vec
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 500)
import warnings
warnings.filterwarnings('ignore')

In [59]:
with open('df_final.pkl','rb') as read_file:
    df_final = pickle.load(read_file)

In [60]:
len(df_final)

294894

In [61]:
# checked duplicates, no duplicated reviews identified
df_final.drop_duplicates(subset=['comments'], inplace = True)

## Train Word2Vec Model

In [62]:
reviews_comments = df_final.iloc[:,5]

In [63]:
def ProcessTextforSentiment(text):
    text_1 = [re.sub("[^a-zA-Z]"," ", review) for review in text]
    text_2 = [text.lower().split() for text in text_1]
    return text_2
#     text_2 = [text.split() for text in text_1]
#     return text_2

In [64]:
review_text_split_lower = ProcessTextforSentiment(reviews_comments)

In [66]:
bigram = gensim.models.Phrases(review_text_split_lower, min_count=10) # min_count ignore all words and bigrams with total collected count lower than this value. higher threshold fewer phrases.

trigram = gensim.models.Phrases(bigram[review_text_split_lower], min_count=5)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)

In [67]:
review_text_1 = [trigram_model[bigram_model[t]] for t in review_text_split_lower]

In [68]:
len(review_text_1)

294894

In [69]:
with open('review_text_1_adjusted.pkl', 'wb') as f:
    pickle.dump(review_text_1, f)

In [70]:
# after process the text, use it to create the word2vec model
# the idea behind word2vec is simple: it is analogous to "show me your friends, and i'll tell who you are" 
# if two words have very similar neighbors, then these words are probably quite similar in meaning
# assumption is that the meaning of a word can be inferred by the company it keeps
# Creating the model and setting values for the various parameters
num_features = 300  # Word vector dimensionality
min_word_count = 40 # Minimum word count
num_workers = 7     # Number of parallel threads
context = 10        # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words

#print("Training model....")
model_word2vec = word2vec.Word2Vec(review_text_1,
                          workers=num_workers,
                          size=num_features,
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)

# To make the model memory efficient
model_word2vec.init_sims(replace=True)

In [71]:
# Saving the model for later use. Can be loaded using Word2Vec.load()

model_name = "300features_40minwords_10context_no_stopword_adjusted"
model_word2vec.save(model_name)

In [72]:
model_word2vec.wv.vectors.shape

(10414, 300)

In [73]:
#transform each reveiw into a 300 D vector

# from gensim.models import Word2Vec
# model_word2vec = Word2Vec.load("300features_40minwords_10context_no_stopword")

In [74]:
#transform each reveiw into a 300 D vector
index2word_set =  set(model_word2vec.wv.index2word)

# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features, index2word_set):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0

    for word in  words: # loop over each word string in review
        if word in index2word_set: # determine if the word is in 300 word features
            nwords = nwords + 1 # if yes, then add 1 to the total number of words for each review
            featureVec = np.add(featureVec,model[word]) # add model results of this word to featureVec (this will form a sum of model results of each word for each review)
#             print(featureVec.shape) 
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)  
    return featureVec

In [75]:
# Function for calculating the average feature vector
num_features = 300
def getAvgFeatureVecs(reviews, model, num_features,index2word_set):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32") # create a (305913, 300) array
    for review in reviews: # loop over list of reviews:
        # Printing a status message every 1000th review
#         if counter%1000 == 0:
#             print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features, index2word_set) # call function featureVecMethod for each reivew  
        counter = counter+1 # count how many reviews get iterated through, used as index of review
        
    return reviewFeatureVecs

In [76]:
review_vector = getAvgFeatureVecs(review_text_1, model_word2vec, num_features, index2word_set) # not sure what does each score represent in word2Vec???? 


In [77]:
review_vector.shape

(294894, 300)

In [78]:
import pickle 

with open('review_vector_adjusted.pkl', 'wb') as f:
    pickle.dump(review_vector, f)

In [79]:
#have to fillin na for next step training
def ProcessWordVec(array):
    df = pd.DataFrame(array)
    column_mean= df.mean()
    df.fillna(column_mean, inplace = True)
    return df

In [80]:
X = ProcessWordVec(review_vector)

In [82]:
#our response variable is the star from our dataframe
y = df_final['review_scores_rating_adjusted']

In [83]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [124]:
from sklearn.linear_model import LogisticRegression

#from previous modeling use C =10

model_log = LogisticRegression(
  class_weight= 'balanced',
  solver='newton-cg',
  fit_intercept=True
  )

model_log.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

In [125]:
y_predict_log = model_log.predict(X_test)

In [128]:
from sklearn.metrics import f1_score
f1_score(y_test, y_predict_log, average='weighted')  

0.805935014859066

In [86]:
MSE = np.mean(np.square(y_predict_log - y_test))

In [87]:
MSE

0.26918492334251304

In [95]:
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score

confusion_matrix(y_predict_log,y_test)

array([[ 4499,  1176,   808],
       [ 2215, 23399, 11147],
       [ 1198,  3634, 49240]])

In [None]:
# accuracy: 80% (77138 / 97316)

### Try different classification algorithm

In [110]:
# Multi-nomial naive bayes, it requires all X inputs are non-negative
# Normalize X
# Tried different alpha: 0, 1 and 10, the accuracy results are indifferent
# Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)

X_norm = (X - X.min()) / (X.max() - X.min())
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.33, random_state=42)

import numpy as np
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train,y_train)
MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

MultinomialNB(alpha=10, class_prior=None, fit_prior=True)

In [111]:
y_predict_bayes = clf.predict(X_test)

In [112]:
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score

confusion_matrix(y_predict_bayes,y_test)

array([[ 3099,   459,    40],
       [    0,     0,     0],
       [ 4813, 27750, 61155]])

In [106]:
# Accuracy is 64%

### Clustering

In [113]:
with open('clean_sents_adjusted.pkl','rb') as read_file:
    clean_sents = pickle.load(read_file)

In [None]:
#remove hosts name (there are a lot commonly used names)!!!!

# stoplist = set('jenny tara jen sebastian micheal barbara scott lauren rick brett'.split())
# clean_sents = [[word for word in text if word not in stoplist]
#          for text in clean_sents]

In [114]:
# use word2Vec model to assign density score to review by sentences
sentence_vector_no_stopword = getAvgFeatureVecs(clean_sents, model_word2vec, num_features, index2word_set)

In [115]:
sentence_vector_no_stopword_df = ProcessWordVec(sentence_vector_no_stopword)

In [120]:
#The sentence_vector_no_stopword_df is used to cluster(??? i used the sentence with stop word to cluster)
from sklearn.cluster import KMeans
km_300_10 = KMeans(n_clusters=20)
km_300_10.fit(sentence_vector_no_stopword_df)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [121]:
#similar_by_vector(vector, topn=10, restrict_vocab=None)
def Get_centroid_word(centroids,model):
    word_list = []
    for vector in centroids:
        words = model.similar_by_vector(vector, topn=10)
        word_list.append(words)
        
    return word_list

In [122]:
centroids_300_10 = km_300_10.cluster_centers_
topics_300_10 = Get_centroid_word(centroids_300_10,model_word2vec)