In [2]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

import gensim

#Vectorizers
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

#Dimensionality Reduction
from sklearn.decomposition import TruncatedSVD #LSA
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation as LDA
from gensim import corpora, models, similarities, matutils #LDA

#Pipeline
from sklearn.pipeline import Pipeline

#Bayes Optimization Parameter Tuner
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

#Visualization
import matplotlib.pyplot as plt

#For Handling Imbalanced Data for Classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
import numpy as np

#For Classification
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize  #not needed b/c filtered out neutral ratings

Using TensorFlow backend.


In [3]:
full_df = pd.read_pickle('full_df_preprocessed.pkl')

In [3]:
full_df

Unnamed: 0,review_text,movie,review_site,rating,sentiment,review_tokens,review_processed
0,"Disney, WHAT. HAVE. YOU. DONE Just to be clea...",lionking,imdb,1,negative,"[disney, clear, time, favorite, movie, not, st...",disney clear time favorite movie not stress en...
1,No soul. The original Lion King is one of my ...,lionking,imdb,1,negative,"[no, soul, original, lion, king, favorite, mov...",no soul original lion king favorite movie time...
5,Seriously? So anyone else notice it has a hig...,lionking,imdb,1,negative,"[seriously, notice, high, score, 7.5, rating, ...",seriously notice high score 7.5 rating not str...
6,Overrated and way too much spotlight on beyon...,lionking,imdb,1,negative,"[overrated, way, spotlight, beyonce, lion, kin...",overrated way spotlight beyonce lion king only...
8,Terrible acting!! Doesn't compare to the orig...,lionking,imdb,1,negative,"[terrible, act, not, compare, original, love, ...",terrible act not compare original love origina...
...,...,...,...,...,...,...,...
3040,A magically wonderful film filled with adventu...,cinderella,rottentomatoes,5,positive,"[magically, wonderful, film, fill, adventure, ...",magically wonderful film fill adventure fantas...
3041,Disney has overdid the faithfulness of their o...,cinderella,rottentomatoes,4,positive,"[disney, overdo, faithfulness, animate, classi...",disney overdo faithfulness animate classic pro...
3042,Magic....that's about right. A re-tell of the ...,cinderella,rottentomatoes,4,positive,"[magic, ...., right, tell, original, disney, m...",magic .... right tell original disney movie li...
3043,A good movie that sets it apart from the origi...,cinderella,rottentomatoes,4,positive,"[good, movie, set, apart, original, story, cin...",good movie set apart original story cinderella...


In [4]:
#change sentiment to numbers as classifiers 
full_df['sentiment'].replace('positive',1,inplace=True)
full_df['sentiment'].replace('negative',0,inplace=True)
full_df = full_df[(full_df.sentiment != 'neutral')]

In [5]:
full_df['sentiment'].value_counts()

1    38656
0     5941
Name: sentiment, dtype: int64

# Train-Test Split for Supervised Learning

In [6]:
X_train, X_test, y_train, y_test = train_test_split(full_df.drop(['rating','review_text','movie','review_site','review_text','sentiment', 'review_tokens'], axis=1), full_df['sentiment'], test_size=0.2, random_state=41)

In [7]:
#very imbalanced--many positive reviews 
y_train.value_counts()

1    30928
0     4749
Name: sentiment, dtype: int64

In [16]:
corpus = X_train.iloc[:,0].tolist() #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_LSA = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('LSA', TruncatedSVD(n_components=5,random_state=10, n_iter=10))])
TFIDF_LSA = Pipeline_TFIDF_LSA.fit_transform(corpus)

#Need to normalize before classifying to predict positive vs. negative sentiment
X_train_TFIDF_LSA_normalized = normalize(TFIDF_LSA)
X_train_TFIDF_LSA_normalized.shape

#will need to normalize test set separately after doing same pipeline steps--> normalize --> stdscale TRANSFORM ONLY on test using fitted stdscale from train

(35677, 5)

In [18]:
corpus = X_train.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_NMF = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('NMF', NMF(n_components=5,random_state=10))])
TFIDF_NMF = Pipeline_TFIDF_NMF.fit_transform(corpus)

#Need to normalize before classifying to predict positive vs. negative sentiment
X_train_TFIDF_NMF_normalized = normalize(TFIDF_NMF)
X_train_TFIDF_NMF_normalized.shape

(35677, 5)

In [20]:
X_train_TFIDF_LSA = pd.DataFrame(X_train_TFIDF_LSA_normalized,columns=['topic1','topic2','topic3','topic4','topic5'])
X_train_TFIDF_LSA 

Unnamed: 0,topic0,topic1,topic2,topic3,topic4
0,0.618823,-0.164859,-0.299351,0.664971,-0.241003
1,0.822541,-0.060692,-0.362104,0.270914,-0.339454
2,0.611932,-0.658959,0.214088,0.254041,-0.284503
3,0.935221,-0.263925,-0.099187,-0.148634,-0.154195
4,0.574973,0.252747,-0.486537,0.601867,0.081013
...,...,...,...,...,...
35672,0.506348,0.094750,-0.185527,0.604422,0.578696
35673,0.810796,0.031377,-0.382283,0.282402,-0.340197
35674,0.630164,-0.597997,0.118572,0.354982,-0.324379
35675,0.840179,-0.361880,0.176981,-0.030525,-0.361784


In [22]:
X_train_TFIDF_NMF = pd.DataFrame(X_train_TFIDF_NMF_normalized,columns=['topic1','topic2','topic3','topic4','topic5'])
X_train_TFIDF_NMF


Unnamed: 0,topic1,topic2,topic3,topic4,topic5
0,0.469244,0.000000,0.000000,0.883068,0.000000
1,0.568487,0.364562,0.000000,0.737508,0.000000
2,0.999926,0.000000,0.010350,0.000000,0.006360
3,0.741145,0.602491,0.000000,0.156908,0.251174
4,0.083206,0.399400,0.000000,0.912993,0.000000
...,...,...,...,...,...
35672,0.078081,0.144820,0.125787,0.890371,0.405398
35673,0.502497,0.390324,0.000000,0.771456,0.000000
35674,0.997735,0.000000,0.000000,0.067262,0.000000
35675,0.915575,0.344054,0.208201,0.001029,0.000000


In [12]:
#for some reason, y_train has datatype "unknown" --> have to convert to integer
y_train = y_train.astype(int)

# Bayes Optimization for Hyperparameter Tuning Logistic Regression

**LSA**

In [25]:
scaled_X_train_TFIDF_LSA = StandardScaler().fit_transform(X_train_TFIDF_LSA)

space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}

def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, class_weight='balanced', random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_X_train_TFIDF_LSA, y_train, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }

MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [21:45<00:00,  2.61s/it, best loss: 0.2200161293137115]


{'loss': 0.2200161293137115,
 'params': {'C': 10, 'penalty': 'l1', 'solver': 'saga'},
 'status': 'ok'}

In [82]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

###SUBTRAIN
subtrain_corpus = X_subtrain.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_LSA_subtrain = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('LSA', TruncatedSVD(n_components=5,random_state=10,n_iter=10))])
TFIDF_LSA_subtrain = Pipeline_TFIDF_LSA_subtrain.fit_transform(subtrain_corpus)

X_subtrain_TFIDF_LSA_normalized = normalize(TFIDF_LSA_subtrain)
ss = StandardScaler()
scaled_subtrain_TFIDF_LSA = ss.fit_transform(X_subtrain_TFIDF_LSA_normalized)



###VALID
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

valid_corpus = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_LSA_valid = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('LSA', TruncatedSVD(n_components=5,random_state=10,n_iter=10))])
TFIDF_LSA_valid = Pipeline_TFIDF_LSA_valid.fit_transform(valid_corpus)

X_valid_TFIDF_LSA_normalized = normalize(TFIDF_LSA_valid)
scaled_valid_TFIDF_LSA = ss.transform(X_valid_TFIDF_LSA_normalized)

logreg_classwt_LSA = LogisticRegression(C=10, penalty='l1', solver='saga', class_weight='balanced', random_state=41, max_iter=500).fit(scaled_subtrain_TFIDF_NMF, y_subtrain)
y_pred_classwt_LSA_subtrain = logreg_classwt_LSA.predict(scaled_subtrain_TFIDF_LSA)
y_pred_classwt_LSA_valid = logreg_classwt_LSA.predict(scaled_valid_TFIDF_LSA)

print(roc_auc_score(y_subtrain, y_pred_classwt_LSA_subtrain))
print(roc_auc_score(y_valid, y_pred_classwt_LSA_valid))
logreg_classwt_LSA.coef_

0.5233835789469081
0.5563100961538461


array([[-0.61245935,  0.08845889,  0.64760042,  0.55574021,  0.09554249]])

**NMF**

In [24]:
scaled_X_train_TFIDF_NMF = StandardScaler().fit_transform(X_train_TFIDF_NMF)

space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}


def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, class_weight='balanced', random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_X_train_TFIDF_NMF, y_train, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }


MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [03:39<00:00,  2.28it/s, best loss: 0.2346180047390709]


{'loss': 0.2346180047390709,
 'params': {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'},
 'status': 'ok'}

In [83]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

###SUBTRAIN
subtrain_corpus = X_subtrain.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_NMF_subtrain = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('NMF', NMF(n_components=5,random_state=10))])
TFIDF_NMF_subtrain = Pipeline_TFIDF_NMF_subtrain.fit_transform(subtrain_corpus)

X_subtrain_TFIDF_NMF_normalized = normalize(TFIDF_NMF_subtrain)
ss = StandardScaler()
scaled_subtrain_TFIDF_NMF = ss.fit_transform(X_subtrain_TFIDF_NMF_normalized)



###VALID

valid_corpus = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_NMF_valid = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('NMF', NMF(n_components=5,random_state=10))])
TFIDF_NMF_valid = Pipeline_TFIDF_NMF_valid.fit_transform(valid_corpus)

X_valid_TFIDF_NMF_normalized = normalize(TFIDF_NMF_valid)
scaled_valid_TFIDF_NMF = ss.transform(X_valid_TFIDF_NMF_normalized)

logreg_classwt_NMF = LogisticRegression(C=0.01, penalty='l2', solver='liblinear', class_weight='balanced', random_state=41, max_iter=500).fit(scaled_subtrain_TFIDF_NMF, y_subtrain)
y_pred_classwt_NMF_subtrain = logreg_classwt_NMF.predict(scaled_subtrain_TFIDF_NMF)
y_pred_classwt_NMF_valid = logreg_classwt_NMF.predict(scaled_valid_TFIDF_NMF)

print(roc_auc_score(y_subtrain, y_pred_classwt_NMF_subtrain))
print(roc_auc_score(y_valid, y_pred_classwt_NMF_valid))
logreg_classwt_NMF.coef_

0.7086049627353526
0.7035141941391941


array([[-0.60523788,  0.08619265,  0.61950938,  0.53780974,  0.09247201]])

**LDA**

In [118]:
X_train, X_test, y_train, y_test = train_test_split(full_df.drop(['rating','review_text','movie','review_site','review_text','sentiment', 'review_tokens'], axis=1), full_df['sentiment'], test_size=0.2, random_state=41)
y_train = y_train.astype(int)

###SUBTRAIN
corpus = X_train.iloc[:,0].tolist()

tfidfvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film'])
doc_word_tfidfvec = tfidfvec.fit_transform(corpus)

# Convert sparse matrix of counts to a gensim corpus
gensim_corpus = matutils.Sparse2Corpus(doc_word_tfidfvec.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=gensim_corpus, num_topics=5, id2word=id2word, passes=5)

# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus = lda[gensim_corpus]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs = [doc for doc in lda_corpus]

tuples_doc_topic_df = pd.DataFrame(lda_docs,columns=['topic1','topic2','topic3','topic4','topic5'])
tuples_doc_topic_df['topic1'] = tuples_doc_topic_df['topic1'].str[1].astype(float)
tuples_doc_topic_df['topic2'] = tuples_doc_topic_df['topic2'].str[1].astype(float)
tuples_doc_topic_df['topic3'] = tuples_doc_topic_df['topic3'].str[1].astype(float)
tuples_doc_topic_df['topic4'] = tuples_doc_topic_df['topic4'].str[1].astype(float)
tuples_doc_topic_df['topic5'] = tuples_doc_topic_df['topic5'].str[1].astype(float)

X_train_TFIDF_LDA_normalized = normalize(tuples_doc_topic_df)
ss = StandardScaler()
scaled_train_TFIDF_LDA = ss.fit_transform(X_train_TFIDF_LDA_normalized)

lda.print_topics(num_words=30)

[(0,
  '0.095*"not" + 0.081*"like" + 0.061*"awesome" + 0.048*"feel" + 0.044*"animal" + 0.043*"original" + 0.040*"little" + 0.039*"real" + 0.039*"voice" + 0.038*"make" + 0.038*"character" + 0.036*"good" + 0.036*"look" + 0.032*"no" + 0.032*"cgi" + 0.028*"only" + 0.023*"way" + 0.022*"actor" + 0.021*"story" + 0.021*"scene" + 0.020*"act" + 0.020*"well" + 0.018*"song" + 0.016*"watch" + 0.015*"think" + 0.014*"great" + 0.014*"go" + 0.014*"time" + 0.013*"amaze" + 0.011*"cast"'),
 (1,
  '0.082*"well" + 0.069*"disney" + 0.055*"live" + 0.053*"action" + 0.052*"not" + 0.048*"time" + 0.042*"live action" + 0.042*"lion" + 0.040*"king" + 0.040*"original" + 0.038*"go" + 0.037*"remake" + 0.034*"think" + 0.033*"watch" + 0.025*"good" + 0.025*"version" + 0.024*"make" + 0.021*"animate" + 0.019*"new" + 0.018*"classic" + 0.017*"love" + 0.016*"story" + 0.016*"like" + 0.014*"way" + 0.012*"scene" + 0.012*"character" + 0.011*"cast" + 0.011*"song" + 0.010*"great" + 0.009*"feel"'),
 (2,
  '0.151*"great" + 0.094*"enjo

In [119]:
space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}

def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, class_weight='balanced', random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_train_TFIDF_LDA, y_train, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }

MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [02:15<00:00,  3.69it/s, best loss: 0.27464862634403475]


{'loss': 0.27464862634403475,
 'params': {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'},
 'status': 'ok'}

In [128]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

###SUBTRAIN
corpus_subtrain = X_subtrain.iloc[:,0].tolist()

tfidfvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film'])
doc_word_tfidfvec_subtrain = tfidfvec.fit_transform(corpus_subtrain)

# Convert sparse matrix of counts to a gensim corpus
gensim_corpus_subtrain = matutils.Sparse2Corpus(doc_word_tfidfvec_subtrain.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda_subtrain = models.LdaModel(corpus=gensim_corpus_subtrain, num_topics=5, id2word=id2word, passes=5)

# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus_subtrain = lda_subtrain[gensim_corpus_subtrain]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs_subtrain = [doc for doc in lda_corpus_subtrain]

tuples_doc_topic_df = pd.DataFrame(lda_docs_subtrain,columns=['topic1','topic2','topic3','topic4','topic5'])
tuples_doc_topic_df['topic1'] = tuples_doc_topic_df['topic1'].str[1].astype(float)
tuples_doc_topic_df['topic2'] = tuples_doc_topic_df['topic2'].str[1].astype(float)
tuples_doc_topic_df['topic3'] = tuples_doc_topic_df['topic3'].str[1].astype(float)
tuples_doc_topic_df['topic4'] = tuples_doc_topic_df['topic4'].str[1].astype(float)
tuples_doc_topic_df['topic5'] = tuples_doc_topic_df['topic5'].str[1].astype(float)

X_subtrain_TFIDF_LDA_normalized = normalize(tuples_doc_topic_df)
ss = StandardScaler()
scaled_subtrain_TFIDF_LDA = ss.fit_transform(X_subtrain_TFIDF_LDA_normalized)


###VALID

corpus_valid = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews

tfidfvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film'])
doc_word_tfidfvec_valid = tfidfvec.fit_transform(corpus_valid)

# Convert sparse matrix of counts to a gensim corpus
gensim_corpus_valid = matutils.Sparse2Corpus(doc_word_tfidfvec_valid.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda_valid = models.LdaModel(corpus=gensim_corpus_valid, num_topics=5, id2word=id2word, passes=5)

# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus_valid = lda_valid[gensim_corpus_valid]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs_valid = [doc for doc in lda_corpus_valid]

tuples_doc_topic_df = pd.DataFrame(lda_docs_valid,columns=['topic1','topic2','topic3','topic4','topic5'])
tuples_doc_topic_df['topic1'] = tuples_doc_topic_df['topic1'].str[1].astype(float)
tuples_doc_topic_df['topic2'] = tuples_doc_topic_df['topic2'].str[1].astype(float)
tuples_doc_topic_df['topic3'] = tuples_doc_topic_df['topic3'].str[1].astype(float)
tuples_doc_topic_df['topic4'] = tuples_doc_topic_df['topic4'].str[1].astype(float)
tuples_doc_topic_df['topic5'] = tuples_doc_topic_df['topic5'].str[1].astype(float)

X_valid_TFIDF_LDA_normalized = normalize(tuples_doc_topic_df)
scaled_valid_TFIDF_LDA = ss.transform(X_valid_TFIDF_LDA_normalized)


logreg_classwt_LDA = LogisticRegression(C=0.001, penalty='l2', solver='liblinear', class_weight='balanced', random_state=41, max_iter=500).fit(scaled_subtrain_TFIDF_LDA, y_subtrain)
y_pred_classwt_LDA_subtrain = logreg_classwt_LDA.predict(scaled_subtrain_TFIDF_LDA)
y_pred_classwt_LDA_valid = logreg_classwt_LDA.predict(scaled_valid_TFIDF_LDA)

print(roc_auc_score(y_subtrain, y_pred_classwt_LDA_subtrain))
print(roc_auc_score(y_valid, y_pred_classwt_LDA_valid))
logreg_classwt_LDA.coef_

0.6690377825461605
0.659005837912088


array([[-0.35001732, -0.14803274,  0.18460698,  0.18980734,  0.49158142]])

# TUNING

In [122]:
X_train, X_test, y_train, y_test = train_test_split(full_df.drop(['rating','review_text','movie','review_site','review_text','sentiment', 'review_tokens'], axis=1), full_df['sentiment'], test_size=0.2, random_state=41)
y_train = y_train.astype(int)
corpus_NMF_tuning = X_train.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_NMF_tuning = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('NMF', NMF(n_components=14,random_state=10))])
TFIDF_NMF_tuning = Pipeline_TFIDF_NMF_tuning.fit_transform(corpus_NMF_tuning)

#Need to normalize before classifying to predict positive vs. negative sentiment
X_train_TFIDF_NMF_normalized_tuning = normalize(TFIDF_NMF_tuning)


X_train_TFIDF_NMF_tuning = pd.DataFrame(X_train_TFIDF_NMF_normalized_tuning,columns=['topic1','topic2','topic3','topic4','topic5','topic6','topic7','topic8','topic9','topic10','topic11','topic12','topic13','topic14'])
#,'topic15','topic16','topic17','topic18','topic19','topic20'

In [123]:
scaled_X_train_TFIDF_NMF_tuning = StandardScaler().fit_transform(X_train_TFIDF_NMF_tuning)

space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}


def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, class_weight='balanced', random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_X_train_TFIDF_NMF_tuning, y_train, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }


MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [03:14<00:00,  2.57it/s, best loss: 0.1873046887026517]


{'loss': 0.1873046887026517,
 'params': {'C': 0.05, 'penalty': 'l1', 'solver': 'liblinear'},
 'status': 'ok'}

In [127]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

###SUBTRAIN
subtrain_corpus = X_subtrain.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_NMF_subtrain_tuning = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('NMF', NMF(n_components=14,random_state=10))])
TFIDF_NMF_subtrain_tuning = Pipeline_TFIDF_NMF_subtrain_tuning.fit_transform(subtrain_corpus)

X_subtrain_TFIDF_NMF_normalized_tuning= normalize(TFIDF_NMF_subtrain_tuning)
X_subtrain_TFIDF_NMF_tuning = pd.DataFrame(X_subtrain_TFIDF_NMF_normalized_tuning,columns=['topic1','topic2','topic3','topic4','topic5','topic6','topic7','topic8','topic9','topic10','topic11','topic12','topic13','topic14'])

ss = StandardScaler()
scaled_subtrain_TFIDF_NMF_tuning = ss.fit_transform(X_subtrain_TFIDF_NMF_tuning)



###VALID

valid_corpus = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_NMF_valid = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('NMF', NMF(n_components=14,random_state=10))])
TFIDF_NMF_valid = Pipeline_TFIDF_NMF_valid.fit_transform(valid_corpus)

X_valid_TFIDF_NMF_normalized = normalize(TFIDF_NMF_valid)
scaled_valid_TFIDF_NMF_tuning = ss.transform(X_valid_TFIDF_NMF_normalized)

logreg_classwt_NMF_tuning = LogisticRegression(C=0.05, penalty='l1', solver='liblinear', class_weight='balanced', random_state=41, max_iter=500).fit(scaled_subtrain_TFIDF_NMF_tuning, y_subtrain)
y_pred_classwt_NMF_subtrain_tuning = logreg_classwt_NMF_tuning.predict(scaled_subtrain_TFIDF_NMF_tuning)
y_pred_classwt_NMF_valid_tuning = logreg_classwt_NMF_tuning.predict(scaled_valid_TFIDF_NMF_tuning)

print(roc_auc_score(y_subtrain, y_pred_classwt_NMF_subtrain_tuning))
print(roc_auc_score(y_valid, y_pred_classwt_NMF_valid_tuning))
logreg_classwt_NMF_tuning.coef_

0.7335070251879716
0.7386733058608058


array([[-0.58955892,  0.02189319,  0.69993711,  0.48412397,  0.20003813,
         0.74054519, -0.26909712, -0.05055338,  0.08353921,  0.33712804,
        -0.16208316, -0.11364424, -0.19890322,  0.35904309]])

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [146]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

###SUBTRAIN
subtrain_corpus = X_subtrain.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews

TFIDF_vec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05)
TFIDF_subtrain_tuning = TFIDF_vec.fit_transform(subtrain_corpus)
NMF_model = NMF(n_components=14,random_state=10)
TFIDF_NMF_subtrain_tuning = NMF_model.fit_transform(TFIDF_subtrain_tuning)

X_subtrain_TFIDF_NMF_normalized_tuning= normalize(TFIDF_NMF_subtrain_tuning)
X_subtrain_TFIDF_NMF_tuning = pd.DataFrame(X_subtrain_TFIDF_NMF_normalized_tuning,columns=['topic1','topic2','topic3','topic4','topic5','topic6','topic7','topic8','topic9','topic10','topic11','topic12','topic13','topic14'])

ss = StandardScaler()
scaled_subtrain_TFIDF_NMF_tuning = ss.fit_transform(X_subtrain_TFIDF_NMF_tuning)



###VALID

valid_corpus = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
TFIDF_valid = TfidfVectorizer(ngram_range=(1,5), min_df=0.05).fit_transform(valid_corpus)
TFIDF_NMF_valid = NMF(n_components=14,random_state=10).fit_transform(TFIDF_valid)

X_valid_TFIDF_NMF_normalized = normalize(TFIDF_NMF_valid)
scaled_valid_TFIDF_NMF_tuning = ss.transform(X_valid_TFIDF_NMF_normalized)

logreg_classwt_NMF_tuning = LogisticRegression(C=0.05, penalty='l1', solver='liblinear', class_weight='balanced', random_state=41, max_iter=500).fit(scaled_subtrain_TFIDF_NMF_tuning, y_subtrain)
y_pred_classwt_NMF_subtrain_tuning = logreg_classwt_NMF_tuning.predict(scaled_subtrain_TFIDF_NMF_tuning)
y_pred_classwt_NMF_valid_tuning = logreg_classwt_NMF_tuning.predict(scaled_valid_TFIDF_NMF_tuning)

print(roc_auc_score(y_subtrain, y_pred_classwt_NMF_subtrain_tuning))
print(roc_auc_score(y_valid, y_pred_classwt_NMF_valid_tuning))
logreg_classwt_NMF_tuning.coef_

0.7335070251879716
0.7386733058608058


array([[-0.58955892,  0.02189319,  0.69993711,  0.48412397,  0.20003813,
         0.74054519, -0.26909712, -0.05055338,  0.08353921,  0.33712804,
        -0.16208316, -0.11364424, -0.19890322,  0.35904309]])

In [148]:
display_topics(display_topics(NMF_model, TFIDF_vec.get_feature_names, 10))


Topic  0


TypeError: 'method' object is not subscriptable

In [162]:
X_train, X_test, y_train, y_test = train_test_split(full_df.drop(['rating','review_text','movie','review_site','review_text','sentiment', 'review_tokens'], axis=1), full_df['sentiment'], test_size=0.2, random_state=41)
y_train = y_train.astype(int)
corpus_NMF_tuning = X_train.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_NMF_tuning = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('NMF', NMF(n_components=16,random_state=10))])
TFIDF_NMF_tuning = Pipeline_TFIDF_NMF_tuning.fit_transform(corpus_NMF_tuning)

#Need to normalize before classifying to predict positive vs. negative sentiment
X_train_TFIDF_NMF_normalized_tuning = normalize(TFIDF_NMF_tuning)


X_train_TFIDF_NMF_tuning = pd.DataFrame(X_train_TFIDF_NMF_normalized_tuning,columns=['topic1','topic2','topic3','topic4','topic5','topic6','topic7','topic8','topic9','topic10','topic11','topic12','topic13','topic14','topic15','topic16'])
#,'topic11','topic12','topic13','topic14','topic15','topic16','topic17','topic18','topic19','topic20'

In [163]:
scaled_X_train_TFIDF_NMF_tuning = StandardScaler().fit_transform(X_train_TFIDF_NMF_tuning)

space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}


def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, class_weight='balanced', random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_X_train_TFIDF_NMF_tuning, y_train, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }


MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [03:45<00:00,  2.22it/s, best loss: 0.1825314055824817] 


{'loss': 0.1825314055824817,
 'params': {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'},
 'status': 'ok'}

In [161]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

###SUBTRAIN
subtrain_corpus = X_subtrain.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews

TFIDF_vec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05)
TFIDF_subtrain_tuning = TFIDF_vec.fit_transform(subtrain_corpus)
NMF_model = NMF(n_components=16,random_state=10)
TFIDF_NMF_subtrain_tuning = NMF_model.fit_transform(TFIDF_subtrain_tuning)

X_subtrain_TFIDF_NMF_normalized_tuning= normalize(TFIDF_NMF_subtrain_tuning)
X_subtrain_TFIDF_NMF_tuning = pd.DataFrame(X_subtrain_TFIDF_NMF_normalized_tuning,columns=['topic1','topic2','topic3','topic4','topic5','topic6','topic7','topic8','topic9','topic10','topic11','topic12','topic13','topic14','topic15','topic16'])

ss = StandardScaler()
scaled_subtrain_TFIDF_NMF_tuning = ss.fit_transform(X_subtrain_TFIDF_NMF_tuning)



###VALID

valid_corpus = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
TFIDF_valid = TfidfVectorizer(ngram_range=(1,5), min_df=0.05).fit_transform(valid_corpus)
TFIDF_NMF_valid = NMF(n_components=16,random_state=10).fit_transform(TFIDF_valid)

X_valid_TFIDF_NMF_normalized = normalize(TFIDF_NMF_valid)
scaled_valid_TFIDF_NMF_tuning = ss.transform(X_valid_TFIDF_NMF_normalized)

logreg_classwt_NMF_tuning = LogisticRegression(C=0.01, penalty='l2', solver='liblinear', class_weight='balanced', random_state=41, max_iter=500).fit(scaled_subtrain_TFIDF_NMF_tuning, y_subtrain)
y_pred_classwt_NMF_subtrain_tuning = logreg_classwt_NMF_tuning.predict(scaled_subtrain_TFIDF_NMF_tuning)
y_pred_classwt_NMF_valid_tuning = logreg_classwt_NMF_tuning.predict(scaled_valid_TFIDF_NMF_tuning)

print(roc_auc_score(y_subtrain, y_pred_classwt_NMF_subtrain_tuning))
print(roc_auc_score(y_valid, y_pred_classwt_NMF_valid_tuning))
logreg_classwt_NMF_tuning.coef_

0.7318905158280349
0.6892427884615384


array([[-0.62583928,  0.04444993,  0.63111498,  0.43729134,  0.16498511,
         0.64590941, -0.27438959, -0.2349948 ,  0.15317076,  0.3100709 ]])

In [None]:
C_range=[0.001,0.05, 0.01,0.1,0.5,1,5, 10,100,500,1000,5000, 10000]
solvers=['newton-cg', 'lbfgs', 'sag', 'saga', 'liblinear']


In [None]:
sklearn.metrics.SCORERS.keys()

In [166]:
X_train, X_test, y_train, y_test = train_test_split(full_df.drop(['rating','review_text','movie','review_site','review_text','sentiment', 'review_tokens'], axis=1), full_df['sentiment'], test_size=0.2, random_state=41)
y_train = y_train.astype(int)

###SUBTRAIN
corpus = X_train.iloc[:,0].tolist()

tfidfvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film'])
doc_word_tfidfvec = tfidfvec.fit_transform(corpus)

# Convert sparse matrix of counts to a gensim corpus
gensim_corpus = matutils.Sparse2Corpus(doc_word_tfidfvec.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=gensim_corpus, num_topics=14, id2word=id2word, passes=5)

# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus = lda[gensim_corpus]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs = [doc for doc in lda_corpus]

tuples_doc_topic_df = pd.DataFrame(lda_docs,columns=['topic1','topic2','topic3','topic4','topic5','topic6','topic7','topic8','topic9','topic10','topic11','topic12','topic13','topic14'])
tuples_doc_topic_df['topic1'] = tuples_doc_topic_df['topic1'].str[1].astype(float)
tuples_doc_topic_df['topic2'] = tuples_doc_topic_df['topic2'].str[1].astype(float)
tuples_doc_topic_df['topic3'] = tuples_doc_topic_df['topic3'].str[1].astype(float)
tuples_doc_topic_df['topic4'] = tuples_doc_topic_df['topic4'].str[1].astype(float)
tuples_doc_topic_df['topic5'] = tuples_doc_topic_df['topic5'].str[1].astype(float)
tuples_doc_topic_df['topic6'] = tuples_doc_topic_df['topic6'].str[1].astype(float)
tuples_doc_topic_df['topic7'] = tuples_doc_topic_df['topic7'].str[1].astype(float)
tuples_doc_topic_df['topic8'] = tuples_doc_topic_df['topic8'].str[1].astype(float)
tuples_doc_topic_df['topic9'] = tuples_doc_topic_df['topic9'].str[1].astype(float)
tuples_doc_topic_df['topic10'] = tuples_doc_topic_df['topic10'].str[1].astype(float)
tuples_doc_topic_df['topic11'] = tuples_doc_topic_df['topic11'].str[1].astype(float)
tuples_doc_topic_df['topic12'] = tuples_doc_topic_df['topic12'].str[1].astype(float)
tuples_doc_topic_df['topic13'] = tuples_doc_topic_df['topic13'].str[1].astype(float)
tuples_doc_topic_df['topic14'] = tuples_doc_topic_df['topic14'].str[1].astype(float)


X_train_TFIDF_LDA_normalized = normalize(tuples_doc_topic_df)
ss = StandardScaler()
scaled_train_TFIDF_LDA = ss.fit_transform(X_train_TFIDF_LDA_normalized)

lda.print_topics(num_words=30)

[(0,
  '0.494*"awesome" + 0.391*"well" + 0.047*"original" + 0.043*"love" + 0.019*"disney" + 0.003*"make" + 0.001*"not" + 0.000*"song" + 0.000*"smith" + 0.000*"character" + 0.000*"music" + 0.000*"job" + 0.000*"cast" + 0.000*"think" + 0.000*"remake" + 0.000*"great" + 0.000*"good" + 0.000*"loved" + 0.000*"watch" + 0.000*"story" + 0.000*"like" + 0.000*"voice" + 0.000*"new" + 0.000*"cgi" + 0.000*"time" + 0.000*"way" + 0.000*"kid" + 0.000*"go" + 0.000*"amaze" + 0.000*"animate"'),
 (1,
  '0.102*"aladdin" + 0.086*"think" + 0.066*"actor" + 0.060*"not" + 0.045*"smith" + 0.045*"good" + 0.038*"only" + 0.035*"genie" + 0.034*"great" + 0.031*"love" + 0.024*"like" + 0.024*"original" + 0.024*"disney" + 0.024*"make" + 0.023*"well" + 0.023*"character" + 0.022*"song" + 0.022*"go" + 0.019*"job" + 0.018*"new" + 0.017*"watch" + 0.016*"story" + 0.015*"cast" + 0.015*"time" + 0.014*"enjoy" + 0.014*"feel" + 0.011*"remake" + 0.011*"no" + 0.011*"version" + 0.010*"music"'),
 (2,
  '0.210*"not" + 0.203*"like" + 0.15

In [167]:
space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}

def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, class_weight='balanced', random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_train_TFIDF_LDA, y_train, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }

MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [05:52<00:00,  1.42it/s, best loss: 0.2658697422484009]


{'loss': 0.2658697422484009,
 'params': {'C': 0.5, 'penalty': 'l2', 'solver': 'saga'},
 'status': 'ok'}

In [169]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

###SUBTRAIN
corpus_subtrain = X_subtrain.iloc[:,0].tolist()

tfidfvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film'])
doc_word_tfidfvec_subtrain = tfidfvec.fit_transform(corpus_subtrain)

# Convert sparse matrix of counts to a gensim corpus
gensim_corpus_subtrain = matutils.Sparse2Corpus(doc_word_tfidfvec_subtrain.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda_subtrain = models.LdaModel(corpus=gensim_corpus_subtrain, num_topics=14, id2word=id2word, passes=5)

# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus_subtrain = lda_subtrain[gensim_corpus_subtrain]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs_subtrain = [doc for doc in lda_corpus_subtrain]

tuples_doc_topic_df = pd.DataFrame(lda_docs_subtrain,columns=['topic1','topic2','topic3','topic4','topic5','topic6','topic7','topic8','topic9','topic10','topic11','topic12','topic13','topic14'])
tuples_doc_topic_df['topic1'] = tuples_doc_topic_df['topic1'].str[1].astype(float)
tuples_doc_topic_df['topic2'] = tuples_doc_topic_df['topic2'].str[1].astype(float)
tuples_doc_topic_df['topic3'] = tuples_doc_topic_df['topic3'].str[1].astype(float)
tuples_doc_topic_df['topic4'] = tuples_doc_topic_df['topic4'].str[1].astype(float)
tuples_doc_topic_df['topic5'] = tuples_doc_topic_df['topic5'].str[1].astype(float)
tuples_doc_topic_df['topic6'] = tuples_doc_topic_df['topic6'].str[1].astype(float)
tuples_doc_topic_df['topic7'] = tuples_doc_topic_df['topic7'].str[1].astype(float)
tuples_doc_topic_df['topic8'] = tuples_doc_topic_df['topic8'].str[1].astype(float)
tuples_doc_topic_df['topic9'] = tuples_doc_topic_df['topic9'].str[1].astype(float)
tuples_doc_topic_df['topic10'] = tuples_doc_topic_df['topic10'].str[1].astype(float)
tuples_doc_topic_df['topic11'] = tuples_doc_topic_df['topic11'].str[1].astype(float)
tuples_doc_topic_df['topic12'] = tuples_doc_topic_df['topic12'].str[1].astype(float)
tuples_doc_topic_df['topic13'] = tuples_doc_topic_df['topic13'].str[1].astype(float)
tuples_doc_topic_df['topic14'] = tuples_doc_topic_df['topic14'].str[1].astype(float)

X_subtrain_TFIDF_LDA_normalized = normalize(tuples_doc_topic_df)
ss = StandardScaler()
scaled_subtrain_TFIDF_LDA = ss.fit_transform(X_subtrain_TFIDF_LDA_normalized)


###VALID

corpus_valid = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews

tfidfvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film'])
doc_word_tfidfvec_valid = tfidfvec.fit_transform(corpus_valid)

# Convert sparse matrix of counts to a gensim corpus
gensim_corpus_valid = matutils.Sparse2Corpus(doc_word_tfidfvec_valid.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda_valid = models.LdaModel(corpus=gensim_corpus_valid, num_topics=14, id2word=id2word, passes=5)

# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus_valid = lda_valid[gensim_corpus_valid]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs_valid = [doc for doc in lda_corpus_valid]

tuples_doc_topic_df = pd.DataFrame(lda_docs_valid,columns=['topic1','topic2','topic3','topic4','topic5','topic6','topic7','topic8','topic9','topic10','topic11','topic12','topic13','topic14'])
tuples_doc_topic_df['topic1'] = tuples_doc_topic_df['topic1'].str[1].astype(float)
tuples_doc_topic_df['topic2'] = tuples_doc_topic_df['topic2'].str[1].astype(float)
tuples_doc_topic_df['topic3'] = tuples_doc_topic_df['topic3'].str[1].astype(float)
tuples_doc_topic_df['topic4'] = tuples_doc_topic_df['topic4'].str[1].astype(float)
tuples_doc_topic_df['topic5'] = tuples_doc_topic_df['topic5'].str[1].astype(float)
tuples_doc_topic_df['topic6'] = tuples_doc_topic_df['topic6'].str[1].astype(float)
tuples_doc_topic_df['topic7'] = tuples_doc_topic_df['topic7'].str[1].astype(float)
tuples_doc_topic_df['topic8'] = tuples_doc_topic_df['topic8'].str[1].astype(float)
tuples_doc_topic_df['topic9'] = tuples_doc_topic_df['topic9'].str[1].astype(float)
tuples_doc_topic_df['topic10'] = tuples_doc_topic_df['topic10'].str[1].astype(float)
tuples_doc_topic_df['topic11'] = tuples_doc_topic_df['topic11'].str[1].astype(float)
tuples_doc_topic_df['topic12'] = tuples_doc_topic_df['topic12'].str[1].astype(float)
tuples_doc_topic_df['topic13'] = tuples_doc_topic_df['topic13'].str[1].astype(float)
tuples_doc_topic_df['topic14'] = tuples_doc_topic_df['topic14'].str[1].astype(float)

X_valid_TFIDF_LDA_normalized = normalize(tuples_doc_topic_df)
scaled_valid_TFIDF_LDA = ss.transform(X_valid_TFIDF_LDA_normalized)


logreg_classwt_LDA = LogisticRegression(C=0.5, penalty='l2', solver='saga', class_weight='balanced', random_state=41, max_iter=500).fit(scaled_subtrain_TFIDF_LDA, y_subtrain)
y_pred_classwt_LDA_subtrain = logreg_classwt_LDA.predict(scaled_subtrain_TFIDF_LDA)
y_pred_classwt_LDA_valid = logreg_classwt_LDA.predict(scaled_valid_TFIDF_LDA)

print(roc_auc_score(y_subtrain, y_pred_classwt_LDA_subtrain))
print(roc_auc_score(y_valid, y_pred_classwt_LDA_valid))
logreg_classwt_LDA.coef_

0.6719267987978159
0.6613266941391941


array([[ 0.02953824, -0.18467647, -0.38152271, -0.15113533, -0.132297  ,
         0.22309397, -0.34444772, -0.14276446, -0.46266649,  0.2934862 ,
         0.34885669,  0.23763806, -0.2431188 ,  0.4149495 ]])

# Random Oversampling - LSA

In [50]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

# #RANDOM Oversampling
ros = RandomOverSampler(random_state=0)

X_ros_resampled_subtrain, y_ros_resampled_subtrain = ros.fit_sample(X_subtrain,y_subtrain)
ros_corpus = pd.DataFrame(X_ros_resampled_subtrain).iloc[:,0].tolist() #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_LSA_ros = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('LSA', TruncatedSVD(n_components=5,random_state=10, n_iter=10))])
TFIDF_LSA_ros = Pipeline_TFIDF_LSA_ros.fit_transform(ros_corpus)
#Need to normalize before classifying to predict positive vs. negative sentiment
X_subtrain_TFIDF_LSA_ros_normalized = normalize(TFIDF_LSA_ros)

X_TFIDF_LSA_ros_resampled_subtrain = pd.DataFrame(X_subtrain_TFIDF_LSA_ros_normalized, columns = ['topic1', 'topic2', 'topic3', 'topic4', 'topic5'])
y_TFIDF_LSA_ros_resampled_subtrain = pd.DataFrame(y_ros_resampled_subtrain, columns = ['sentiment'])

scaled_X_subtrain_TFIDF_LSA_ros = StandardScaler().fit_transform(X_TFIDF_LSA_ros_resampled_subtrain)


In [52]:
space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}


def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_X_subtrain_TFIDF_LSA_ros, y_ros_resampled_subtrain, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }


MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [02:21<00:00,  3.53it/s, best loss: 0.21908547822784719]


{'loss': 0.21908547822784719,
 'params': {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'},
 'status': 'ok'}

In [None]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)


# #SUBTRAIN
ros = RandomOverSampler(random_state=0)

X_ros_resampled_subtrain, y_ros_resampled_subtrain = ros.fit_sample(X_subtrain,y_subtrain)
ros_corpus = pd.DataFrame(X_ros_resampled_subtrain).iloc[:,0].tolist() #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_LSA_ros = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('LSA', TruncatedSVD(n_components=5,random_state=10, n_iter=10))])
TFIDF_LSA_ros = Pipeline_TFIDF_LSA_ros.fit_transform(ros_corpus)
#Need to normalize before classifying to predict positive vs. negative sentiment
X_subtrain_TFIDF_LSA_ros_normalized = normalize(TFIDF_LSA_ros)

X_TFIDF_LSA_ros_resampled_subtrain = pd.DataFrame(X_subtrain_TFIDF_LSA_ros_normalized, columns = ['topic1', 'topic2', 'topic3', 'topic4', 'topic5'])
y_TFIDF_LSA_ros_resampled_subtrain = pd.DataFrame(y_ros_resampled_subtrain, columns = ['sentiment'])

ss= StandardScaler()
scaled_X_subtrain_TFIDF_LSA_ros = ss.fit_transform(X_TFIDF_LSA_ros_resampled_subtrain)


###VALID

valid_corpus = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_LSA_valid = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('LSA', TruncatedSVD(n_components=5,random_state=10))])
TFIDF_LSA_valid = Pipeline_TFIDF_LSA_valid.fit_transform(valid_corpus)

X_valid_TFIDF_LSA_normalized = normalize(TFIDF_LSA_valid)
scaled_valid_TFIDF_LSA = ss.transform(X_valid_TFIDF_LSA_normalized)

logreg_classwt_LSA = LogisticRegression(C=0.01, penalty='l2', solver='liblinear', random_state=41, max_iter=500).fit(scaled_X_subtrain_TFIDF_LSA_ros, y_ros_resampled_subtrain)
y_pred_classwt_LSA_subtrain = logreg_classwt_LSA.predict(scaled_X_subtrain_TFIDF_LSA_ros)
y_pred_classwt_LSA_valid = logreg_classwt_LSA.predict(scaled_valid_TFIDF_LSA)

print(roc_auc_score(y_ros_resampled_subtrain, y_pred_classwt_LSA_subtrain))
print(roc_auc_score(y_valid, y_pred_classwt_LSA_valid))
logreg_classwt_LSA.coef_

# SMOTE Oversampling - LSA

In [53]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

tvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05)
testing_tfidf = tvec.fit_transform(X_subtrain.iloc[:,0].tolist())

#SMOTE Oversampling
X_smoted_subtrain, y_smoted_subtrain = SMOTE(random_state=0).fit_sample(testing_tfidf,y_subtrain)
TFIDF_LSA_smote = TruncatedSVD(n_components=5,random_state=10, n_iter=10).fit_transform(X_smoted_subtrain)
X_subtrain_TFIDF_LSA_smote_normalized = normalize(TFIDF_LSA_smote)

X_TFIDF_LSA_smoted_subtrain = pd.DataFrame(X_subtrain_TFIDF_LSA_smote_normalized, columns = ['topic1', 'topic2', 'topic3', 'topic4', 'topic5'])
y_TFIDF_LSA_smoted_subtrain = pd.DataFrame(y_smoted_subtrain, columns = ['sentiment'])

scaled_X_subtrain_TFIDF_LSA_smoted = StandardScaler().fit_transform(X_TFIDF_LSA_smoted_subtrain)

In [54]:
space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}


def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_X_subtrain_TFIDF_LSA_smoted, y_smoted_subtrain, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }


MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [04:21<00:00,  1.92it/s, best loss: 0.18894382469842186]


{'loss': 0.18894382469842186,
 'params': {'C': 10, 'penalty': 'l2', 'solver': 'saga'},
 'status': 'ok'}

In [91]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

# #SUBTRAIN
tvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05)
testing_tfidf = tvec.fit_transform(X_subtrain.iloc[:,0].tolist())

#SMOTE Oversampling
X_smoted_subtrain, y_smoted_subtrain = SMOTE(random_state=0).fit_sample(testing_tfidf,y_subtrain)
TFIDF_LSA_smote = TruncatedSVD(n_components=5,random_state=10, n_iter=10).fit_transform(X_smoted_subtrain)
X_subtrain_TFIDF_LSA_smote_normalized = normalize(TFIDF_LSA_smote)

X_TFIDF_LSA_smoted_subtrain = pd.DataFrame(X_subtrain_TFIDF_LSA_smote_normalized, columns = ['topic1', 'topic2', 'topic3', 'topic4', 'topic5'])
y_TFIDF_LSA_smoted_subtrain = pd.DataFrame(y_smoted_subtrain, columns = ['sentiment'])

ss= StandardScaler()
scaled_X_subtrain_TFIDF_LSA_smoted = ss.fit_transform(X_TFIDF_LSA_smoted_subtrain)


###VALID

valid_corpus = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_LSA_valid = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('LSA', TruncatedSVD(n_components=5,random_state=10))])
TFIDF_LSA_valid = Pipeline_TFIDF_LSA_valid.fit_transform(valid_corpus)

X_valid_TFIDF_LSA_normalized = normalize(TFIDF_LSA_valid)
scaled_valid_TFIDF_LSA = ss.transform(X_valid_TFIDF_LSA_normalized)

logreg_classwt_LSA = LogisticRegression(C=10, penalty='l2', solver='saga', random_state=41, max_iter=500).fit(scaled_X_subtrain_TFIDF_LSA_smoted, y_smoted_subtrain)
y_pred_classwt_LSA_subtrain = logreg_classwt_LSA.predict(scaled_X_subtrain_TFIDF_LSA_smoted)
y_pred_classwt_LSA_valid = logreg_classwt_LSA.predict(scaled_valid_TFIDF_LSA)

print(roc_auc_score(y_smoted_subtrain, y_pred_classwt_LSA_subtrain))
print(roc_auc_score(y_valid, y_pred_classwt_LSA_valid))
logreg_classwt_LSA.coef_

0.7394685677252105
0.6276127518315019


array([[-0.13040146,  1.02771128,  0.1743759 , -0.48544082,  0.6308184 ]])

# ADASYN Oversampling - LSA

In [60]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

tvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05)
testing_tfidf = tvec.fit_transform(X_subtrain.iloc[:,0].tolist())

# #ADASYN Oversampling
X_adasyn_subtrain, y_adasyn_subtrain = ADASYN(random_state=0).fit_sample(testing_tfidf,y_subtrain)
TFIDF_LSA_adasyn = TruncatedSVD(n_components=5,random_state=10, n_iter=10).fit_transform(X_adasyn_subtrain)
X_subtrain_TFIDF_LSA_adasyn_normalized = normalize(TFIDF_LSA_adasyn)

X_TFIDF_LSA_adasyn_subtrain = pd.DataFrame(X_subtrain_TFIDF_LSA_adasyn_normalized, columns = ['topic1', 'topic2', 'topic3', 'topic4', 'topic5'])
y_TFIDF_LSA_adasyn_subtrain = pd.DataFrame(y_adasyn_subtrain, columns = ['sentiment'])

scaled_X_subtrain_TFIDF_LSA_adasyn = StandardScaler().fit_transform(X_TFIDF_LSA_adasyn_subtrain)

In [62]:
space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}


def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_X_subtrain_TFIDF_LSA_adasyn, y_adasyn_subtrain, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }


MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [01:55<00:00,  4.32it/s, best loss: 0.2174162322839449]


{'loss': 0.2174162322839449,
 'params': {'C': 5000, 'penalty': 'l2', 'solver': 'liblinear'},
 'status': 'ok'}

In [90]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

# #SUBTRAIN
tvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05)
testing_tfidf = tvec.fit_transform(X_subtrain.iloc[:,0].tolist())

# #ADASYN Oversampling
X_adasyn_subtrain, y_adasyn_subtrain = ADASYN(random_state=0).fit_sample(testing_tfidf,y_subtrain)
TFIDF_LSA_adasyn = TruncatedSVD(n_components=5,random_state=10, n_iter=10).fit_transform(X_adasyn_subtrain)
X_subtrain_TFIDF_LSA_adasyn_normalized = normalize(TFIDF_LSA_adasyn)

X_TFIDF_LSA_adasyn_subtrain = pd.DataFrame(X_subtrain_TFIDF_LSA_adasyn_normalized, columns = ['topic1', 'topic2', 'topic3', 'topic4', 'topic5'])
y_TFIDF_LSA_adasyn_subtrain = pd.DataFrame(y_adasyn_subtrain, columns = ['sentiment'])

ss= StandardScaler()
scaled_X_subtrain_TFIDF_LSA_adasyn = ss.fit_transform(X_TFIDF_LSA_adasyn_subtrain)


###VALID

valid_corpus = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_LSA_valid = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('LSA', TruncatedSVD(n_components=5,random_state=10))])
TFIDF_LSA_valid = Pipeline_TFIDF_LSA_valid.fit_transform(valid_corpus)

X_valid_TFIDF_LSA_normalized = normalize(TFIDF_LSA_valid)
scaled_valid_TFIDF_LSA = ss.transform(X_valid_TFIDF_LSA_normalized)

logreg_classwt_LSA = LogisticRegression(C=5000, penalty='l2', solver='liblinear', random_state=41, max_iter=500).fit(scaled_X_subtrain_TFIDF_LSA_adasyn, y_adasyn_subtrain)
y_pred_classwt_LSA_subtrain = logreg_classwt_LSA.predict(scaled_X_subtrain_TFIDF_LSA_adasyn)
y_pred_classwt_LSA_valid = logreg_classwt_LSA.predict(scaled_valid_TFIDF_LSA)

print(roc_auc_score(y_adasyn_subtrain, y_pred_classwt_LSA_subtrain))
print(roc_auc_score(y_valid, y_pred_classwt_LSA_valid))
logreg_classwt_LSA.coef_

0.7235260791672052
0.6111921932234432


array([[-0.06847054,  0.95082449,  0.29788415, -0.56833238,  0.60619674]])

# HDP
Gensim also provides a Hierarchical Dirichlet Process (HDP) class [5]. HDP is similar to LDA, except it seeks to learn the correct number of topics from the data; that is, you don’t need to provide a fixed number of topics.

In [21]:
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.utils import tokenize

from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from pprint import pprint

In [None]:
corpus = X_train.iloc[:,0].tolist() 

In [44]:
flatlist = [word for word in corpus]
tokenized = tokenize(str(flatlist))
list(tokenized)

['great',
 'live',
 'remake',
 'cgi',
 'well',
 'cartoon',
 'effect',
 'stop',
 'messing',
 'classics',
 'no',
 'magic',
 'no',
 'humor',
 'remake',
 'never',
 'happen',
 'great',
 'movie',
 'time',
 'not',
 'remake',
 'compare',
 'terribly',
 'disappoint',
 'enough',
 'enough',
 'stop',
 'remakes',
 'my',
 'star',
 'cgi',
 'special',
 'effect',
 'team',
 'only',
 'despite',
 'fact',
 'animal',
 'talk',
 'lose',
 'film',
 'mirror',
 'animate',
 'version',
 'add',
 'realism',
 'enjoyable',
 'doe',
 'not',
 'deserve',
 'hate',
 'beautiful',
 'watch',
 'great',
 'cgi',
 'cute',
 'animal',
 'funny',
 'moment',
 'good',
 'voice',
 'act',
 'ask',
 'for',
 'i',
 'think',
 'write',
 'scene',
 'work',
 'well',
 'original',
 'version',
 'especially',
 'scar',
 'villain',
 'song',
 'hyena',
 'scary',
 'ted',
 'weirdo',
 'main',
 'hyena',
 'lot',
 'personality',
 'original',
 'movie',
 'wich',
 'nice',
 'favourite',
 'og',
 'movie',
 'simba',
 'timon',
 'pumbaa',
 'bachelor',
 'jungle',
 'amazing'

In [45]:
dictionary = Dictionary(list(tokenized))

In [53]:
gensim_corpus = list(tokenized)

[]

In [54]:
hdpmodel = HdpModel(corpus=gensim_corpus, id2word=dictionary)
hdpmodel.show_topics()

KeyboardInterrupt: 

# DO NOT USE

In [None]:
y_train.value_counts()
# y_train = label_binarize(y_train,classes=[0,1])
from sklearn.utils import multiclass
multiclass.type_of_target(y_train)

In [None]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
