In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

import gensim

#Vectorizers
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

#Dimensionality Reduction
from sklearn.decomposition import TruncatedSVD #LSA
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation as LDA
from gensim import corpora, models, similarities, matutils #LDA

#Pipeline
from sklearn.pipeline import Pipeline

#Bayes Optimization Parameter Tuner
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

#Visualization
import matplotlib.pyplot as plt

#For Handling Imbalanced Data for Classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
import numpy as np

#For Classification
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize  #not needed b/c filtered out neutral ratings

Using TensorFlow backend.


In [3]:
full_df = pd.read_pickle('full_df_preprocessed.pkl')
#change sentiment to numbers as classifiers 
full_df['sentiment'].replace('positive',1,inplace=True)
full_df['sentiment'].replace('negative',0,inplace=True)
full_df = full_df[(full_df.sentiment != 'neutral')]
X_train, X_test, y_train, y_test = train_test_split(full_df.drop(['rating','review_text','movie','review_site','review_text','sentiment', 'review_tokens'], axis=1), full_df['sentiment'], test_size=0.2, random_state=41)

y_train = y_train.astype(int)

# Tuning Number of Topics
***IMPORTANT To vectorize, reduce dimensions separately for train and test (or subtrain and valid), and normalize [for length] and standardscale [for ]. Standardscaler FITTING should ONLY BE DONE ON TRAIN, fitted Standardscaler should be used to TRANSFORM TRAIN and TEST. 

In [146]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

###SUBTRAIN
subtrain_corpus = X_subtrain.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews

TFIDF_vec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film','live','action','watch','think','go','job','disney','king'])
TFIDF_subtrain_tuning = TFIDF_vec.fit_transform(subtrain_corpus)
NMF_model = NMF(n_components=4,random_state=10)
TFIDF_NMF_subtrain_tuning = NMF_model.fit_transform(TFIDF_subtrain_tuning)

X_subtrain_TFIDF_NMF_normalized_tuning= normalize(TFIDF_NMF_subtrain_tuning)
X_subtrain_TFIDF_NMF_tuning = pd.DataFrame(X_subtrain_TFIDF_NMF_normalized_tuning,columns=['topic1','topic2','topic3','topic4'])
#,'topic5','topic6','topic7','topic8','topic9','topic10','topic11'
ss = StandardScaler()
scaled_subtrain_TFIDF_NMF_tuning = ss.fit_transform(X_subtrain_TFIDF_NMF_tuning)



###VALID

valid_corpus = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
TFIDF_valid = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film','live','action','watch','think','go','job','disney','king']).fit_transform(valid_corpus)
TFIDF_NMF_valid = NMF(n_components=4,random_state=10).fit_transform(TFIDF_valid)

X_valid_TFIDF_NMF_normalized = normalize(TFIDF_NMF_valid)
scaled_valid_TFIDF_NMF_tuning = ss.transform(X_valid_TFIDF_NMF_normalized)

logreg_classwt_NMF_tuning = LogisticRegression(C=0.001, penalty='l2', solver='saga', class_weight='balanced', random_state=41, max_iter=500).fit(scaled_subtrain_TFIDF_NMF_tuning, y_subtrain)
y_pred_classwt_NMF_subtrain_tuning = logreg_classwt_NMF_tuning.predict(scaled_subtrain_TFIDF_NMF_tuning)
y_pred_classwt_NMF_valid_tuning = logreg_classwt_NMF_tuning.predict(scaled_valid_TFIDF_NMF_tuning)

print(roc_auc_score(y_subtrain, y_pred_classwt_NMF_subtrain_tuning))
print(roc_auc_score(y_valid, y_pred_classwt_NMF_valid_tuning))
logreg_classwt_NMF_tuning.coef_

0.7123270016344335
0.7046903617216118


array([[-0.52955193,  0.52313825,  0.48850258,  0.09008075]])

In [115]:
space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}


def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, class_weight='balanced', random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_subtrain_TFIDF_NMF_tuning, y_subtrain, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }


MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [02:49<00:00,  2.96it/s, best loss: 0.22456912767289272]


{'loss': 0.22456912767289272,
 'params': {'C': 0.001, 'penalty': 'l2', 'solver': 'saga'},
 'status': 'ok'}

In [105]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix+1)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [150]:
display_topics(NMF_model, TFIDF_vec.get_feature_names(), 100)


Topic  1
not, original, like, well, make, story, remake, feel, character, new, song, version, time, animate, amaze, voice, scene, lion, look, classic, little, no, animal, only, enjoy, cgi, loved, way, beautiful, cast, aladdin, real, act, actor, old, music, genie, kid, fun, smith, awesome, good, great, love

Topic  2
love, amaze, kid, smith, awesome, beautiful, old, music, song, genie, story, aladdin, lion, new, time, version, real, make, little, character, animal, fun, way, actor, cast, enjoy, animate, look, well, loved, only, feel, classic, scene, good, great, like, cgi, no, not, original, remake, voice, act

Topic  3
great, smith, loved, genie, music, cast, story, fun, awesome, classic, amaze, kid, enjoy, remake, aladdin, act, actor, song, new, beautiful, make, old, little, cgi, time, voice, version, character, animate, way, real, original, well, scene, animal, like, only, not, feel, no, love, good, look, lion

Topic  4
good, smith, genie, enjoy, music, make, aladdin, act, fun, cast

In [None]:
array([[-0.60224648,  0.68573341,  0.45763012,  0.18590289,  0.68605954,
        -0.28091036, -0.16709458,  0.18078187, -0.19779586,  0.31671876,
        -0.12705248]])

In [50]:
print(TFIDF_subtrain_tuning.shape)

#how to interp, significance of results
#came up with other questions --> next steps to push forward

#doc = just 'cgi--> normlaize, standardize, test steps (as a doc)'

(28541, 56)


In [113]:
TFIDF_vec.get_feature_names()

['act',
 'actor',
 'aladdin',
 'amaze',
 'animal',
 'animate',
 'awesome',
 'beautiful',
 'cast',
 'cgi',
 'character',
 'classic',
 'enjoy',
 'feel',
 'fun',
 'genie',
 'good',
 'great',
 'kid',
 'king',
 'like',
 'lion',
 'little',
 'look',
 'love',
 'loved',
 'make',
 'music',
 'new',
 'no',
 'not',
 'old',
 'only',
 'original',
 'real',
 'remake',
 'scene',
 'smith',
 'song',
 'story',
 'time',
 'version',
 'voice',
 'way',
 'well']

In [121]:
X_subtrain['topic'] = X_subtrain_TFIDF_NMF_tuning.idxmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [92]:
y_subtrain

1639     1
3322     1
10389    0
6261     1
1766     1
        ..
5833     1
5494     1
518      1
2753     1
14786    1
Name: sentiment, Length: 28541, dtype: int64

In [122]:
X_subtrain

Unnamed: 0,review_processed,topic
1639,amazing movie no complaint round applause,topic4
3322,love took childhood,topic1
10389,adds little previous version cinematography bl...,topic1
6261,excellent love new song plus original song gen...,topic2
1766,not bad live action disney movie come life not...,topic4
...,...,...
5833,think movie great action digital effect real r...,topic1
5494,movie great,topic3
518,prince ali fabulous ali ababwa song stick head...,topic1
2753,loved movie,topic1


Unnamed: 0,review_processed,topic,sentiment
0,disney clear time favorite movie not stress en...,topic1,0
0,disney clear time favorite movie not stress en...,topic1,1
0,perfect blend spectacle music,topic1,0
0,perfect blend spectacle music,topic1,1
0,hate change line original feel wrong level,topic1,0
...,...,...,...
20145,life lesson not want,topic4,1
20146,grow watch lion king amaze amaze people movie ...,topic2,1
20147,far easy,topic1,1
20148,definitely not nearly superior original origin...,topic3,1


# Testing Best Model on Test Set

In [346]:
###TRAIN
train_corpus = X_train.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews

TFIDF_vec_final = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film','live','action','watch','think','go','job','disney','king'])
TFIDF_train = TFIDF_vec_final.fit_transform(train_corpus)
NMF_model_final = NMF(n_components=4,random_state=10)
TFIDF_NMF_train = NMF_model_final.fit_transform(TFIDF_train)

X_train_TFIDF_NMF_normalized = normalize(TFIDF_NMF_train)
X_train_TFIDF_NMF = pd.DataFrame(X_train_TFIDF_NMF_normalized,columns=['topic1','topic2','topic3','topic4'])
#,'topic5','topic6','topic7','topic8','topic9','topic10','topic11'
ss = StandardScaler()
scaled_train_TFIDF_NMF = ss.fit_transform(X_train_TFIDF_NMF)



###TEST

y_test = y_test.astype(int)

test_corpus = X_test.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
TFIDF_test = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film','live','action','watch','think','go','job','disney','king']).fit_transform(test_corpus)
TFIDF_NMF_test = NMF(n_components=4,random_state=10).fit_transform(TFIDF_test)

X_test_TFIDF_NMF_normalized = normalize(TFIDF_NMF_test)
scaled_test_TFIDF_NMF = ss.transform(X_test_TFIDF_NMF_normalized)

logreg_classwt_NMF_test = LogisticRegression(C=0.001, penalty='l2', solver='saga', class_weight='balanced', random_state=41, max_iter=500)
logreg_classwt_NMF_test.fit(scaled_train_TFIDF_NMF, y_train)
y_pred_classwt_NMF_train = logreg_classwt_NMF_test.predict(scaled_train_TFIDF_NMF)
y_pred_classwt_NMF_test = logreg_classwt_NMF_test.predict(scaled_test_TFIDF_NMF)

print(roc_auc_score(y_train, y_pred_classwt_NMF_train))
print(roc_auc_score(y_test, y_pred_classwt_NMF_test))
logreg_classwt_NMF_test.coef_

0.7136753413766308
0.7178528874345186


array([[-0.53965851,  0.54692323,  0.50071603,  0.09383977]])

In [181]:
display_topics(NMF_model_final, TFIDF_vec_final.get_feature_names(), 30)


Topic  1
not, original, like, well, make, remake, story, feel, character, new, version, song, time, animate, voice, amaze, scene, look, lion, classic, no, little, animal, only, enjoy, cgi, way, cast, beautiful, real

Topic  2
love, amaze, kid, smith, awesome, beautiful, old, music, song, genie, story, lion, aladdin, time, new, version, make, real, character, animal, little, fun, actor, loved, way, cast, enjoy, animate, look, well

Topic  3
great, smith, loved, genie, music, cast, story, fun, awesome, amaze, classic, kid, enjoy, remake, aladdin, act, actor, song, make, new, beautiful, little, old, character, voice, time, cgi, animate, version, well

Topic  4
good, smith, genie, enjoy, music, aladdin, make, act, story, fun, actor, cast, version, kid, song, little, animate, awesome, cgi, character, classic, old, amaze, scene, feel, time, only, voice, beautiful, look


In [151]:
y_test

8255     1
4326     1
12260    1
16543    1
896      1
        ..
1109     1
683      0
10723    1
17324    0
2488     1
Name: sentiment, Length: 8920, dtype: int64

In [169]:
X_test['topic'] = pd.DataFrame(scaled_test_TFIDF_NMF,columns=['topic1','topic2','topic3','topic4']).idxmax(axis=1)

In [170]:
X_test.isna().sum()

review_processed       0
topic               3661
dtype: int64

In [171]:
X_train['topic'] = X_train_TFIDF_NMF.idxmax(axis=1)

In [172]:
X_train.isna().sum()

review_processed    0
topic               0
dtype: int64

In [173]:
X_test.to_csv('X_test_final.csv')
y_test.to_csv('y_test_final.csv')
X_train.to_csv('X_train_final.csv')
y_train.to_csv('y_train_final.csv')

  
  after removing the cwd from sys.path.


In [174]:
full_df.to_csv('full_df.csv')

In [175]:
full_df

Unnamed: 0,review_text,movie,review_site,rating,sentiment,review_tokens,review_processed
0,"Disney, WHAT. HAVE. YOU. DONE Just to be clea...",lionking,imdb,1,0,"[disney, clear, time, favorite, movie, not, st...",disney clear time favorite movie not stress en...
1,No soul. The original Lion King is one of my ...,lionking,imdb,1,0,"[no, soul, original, lion, king, favorite, mov...",no soul original lion king favorite movie time...
5,Seriously? So anyone else notice it has a hig...,lionking,imdb,1,0,"[seriously, notice, high, score, 7.5, rating, ...",seriously notice high score 7.5 rating not str...
6,Overrated and way too much spotlight on beyon...,lionking,imdb,1,0,"[overrated, way, spotlight, beyonce, lion, kin...",overrated way spotlight beyonce lion king only...
8,Terrible acting!! Doesn't compare to the orig...,lionking,imdb,1,0,"[terrible, act, not, compare, original, love, ...",terrible act not compare original love origina...
...,...,...,...,...,...,...,...
3040,A magically wonderful film filled with adventu...,cinderella,rottentomatoes,5,1,"[magically, wonderful, film, fill, adventure, ...",magically wonderful film fill adventure fantas...
3041,Disney has overdid the faithfulness of their o...,cinderella,rottentomatoes,4,1,"[disney, overdo, faithfulness, animate, classi...",disney overdo faithfulness animate classic pro...
3042,Magic....that's about right. A re-tell of the ...,cinderella,rottentomatoes,4,1,"[magic, ...., right, tell, original, disney, m...",magic .... right tell original disney movie li...
3043,A good movie that sets it apart from the origi...,cinderella,rottentomatoes,4,1,"[good, movie, set, apart, original, story, cin...",good movie set apart original story cinderella...


In [179]:
y_train.value_counts()

1    30928
0     4749
Name: sentiment, dtype: int64

In [177]:
X_test

Unnamed: 0,review_processed,topic
8255,loved boy enjoy,topic3
4326,movie great voice great cgi good not perfect,topic2
12260,truly enjoy version lion king,
16543,movie especially og 90s enjoy not compare cont...,
896,impress movie not technical aspect good execut...,topic4
...,...,...
1109,special film level,topic2
683,pretty like wrong movie,topic2
10723,absolutely love lion king,
17324,smith great effect great act okay not apprecia...,


In [347]:
import matplotlib.pyplot as plt
import mglearn

You provided "cachedir='cache'", use "location='cache'" instead.
  memory = Memory(cachedir="cache")
You provided "cachedir='cache'", use "location='cache'" instead.
  memory = Memory(cachedir="cache")


In [356]:

mglearn.tools.visualize_coefficients(logreg_classwt_NMF_test.coef_, TFIDF_vec_final.get_feature_names(), n_top_features=4)
plt.show()

ValueError: Number of coefficients 4 doesn't match number offeature names 44.

# Testing Model with "New" (One-word) Reviews

In [314]:
word_test_review = ['regular']
word_test = pd.DataFrame(word_test_review)

In [315]:


###TEST


word_corpus = word_test.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
TFIDF_word = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film','live','action','watch','think','go','job','disney','king']).fit_transform(word_corpus)
TFIDF_NMF_word = NMF(n_components=4,random_state=10).fit_transform(TFIDF_word)

word_TFIDF_NMF_normalized = normalize(TFIDF_NMF_word)
scaled_word_TFIDF_NMF = ss.transform(word_TFIDF_NMF_normalized)

y_pred_classwt_NMF_word = logreg_classwt_NMF_test.predict(scaled_word_TFIDF_NMF)

print(y_pred_classwt_NMF_word)
print(roc_auc_score(y_test, y_pred_classwt_NMF_test))
logreg_classwt_NMF_test.coef_

[0]
0.7178528874345186


array([[-0.53965851,  0.54692323,  0.50071603,  0.09383977]])

In [None]:
###TRAIN
train_corpus = X_train.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews

TFIDF_vec_final = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film','live','action','watch','think','go','job','disney','king'])
TFIDF_train = TFIDF_vec_final.fit_transform(train_corpus)
NMF_model_final = NMF(n_components=4,random_state=10)
TFIDF_NMF_train = NMF_model_final.fit_transform(TFIDF_train)

X_train_TFIDF_NMF_normalized = normalize(TFIDF_NMF_train)
X_train_TFIDF_NMF = pd.DataFrame(X_train_TFIDF_NMF_normalized,columns=['topic1','topic2','topic3','topic4'])
#,'topic5','topic6','topic7','topic8','topic9','topic10','topic11'
ss = StandardScaler()
scaled_train_TFIDF_NMF = ss.fit_transform(X_train_TFIDF_NMF)




# Re-trying ROS (without Class Weight = Balanced)

In [336]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)


# #SUBTRAIN
ros = RandomOverSampler(random_state=0)

X_ros_resampled_subtrain, y_ros_resampled_subtrain = ros.fit_sample(X_subtrain,y_subtrain)
ros_corpus = pd.DataFrame(X_ros_resampled_subtrain).iloc[:,0].tolist() #convert from dataframe to series, to transform into list of reviews
tfidf_vec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05)
tfidf_doc_words = tfidf_vec.fit_transform(ros_corpus)
NMF_topics = NMF(n_components=5,random_state=10)
TFIDF_NMF_ros = NMF_topics.fit_transform(tfidf_doc_words)
#Need to normalize before classifying to predict positive vs. negative sentiment
X_subtrain_TFIDF_NMF_ros_normalized = normalize(TFIDF_NMF_ros)

X_TFIDF_NMF_ros_resampled_subtrain = pd.DataFrame(X_subtrain_TFIDF_NMF_ros_normalized, columns = ['topic1', 'topic2', 'topic3', 'topic4', 'topic5'])
y_TFIDF_NMF_ros_resampled_subtrain = pd.DataFrame(y_ros_resampled_subtrain, columns = ['sentiment'])

ss= StandardScaler()
scaled_X_subtrain_TFIDF_NMF_ros = ss.fit_transform(X_TFIDF_NMF_ros_resampled_subtrain)


###VALID

valid_corpus = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_NMF_valid = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('NMF', NMF(n_components=5,random_state=10))])
TFIDF_NMF_valid = Pipeline_TFIDF_NMF_valid.fit_transform(valid_corpus)

X_valid_TFIDF_NMF_normalized = normalize(TFIDF_NMF_valid)
scaled_valid_TFIDF_NMF = ss.transform(X_valid_TFIDF_NMF_normalized)

logreg_classwt_NMF = LogisticRegression(C=0.05, penalty='l2', solver='liblinear', random_state=41, max_iter=500).fit(scaled_X_subtrain_TFIDF_NMF_ros, y_ros_resampled_subtrain)
y_pred_classwt_NMF_subtrain = logreg_classwt_NMF.predict(scaled_X_subtrain_TFIDF_NMF_ros)
y_pred_classwt_NMF_valid = logreg_classwt_NMF.predict(scaled_valid_TFIDF_NMF)

print(roc_auc_score(y_ros_resampled_subtrain, y_pred_classwt_NMF_subtrain))
print(roc_auc_score(y_valid, y_pred_classwt_NMF_valid))
logreg_classwt_NMF.coef_

0.7009680816591057
0.6867416437728938


array([[-0.60199631, -0.0007947 ,  0.4024587 ,  0.53056368, -0.35693496]])

In [337]:
display_topics(NMF_topics, tfidf_vec.get_feature_names(), 30)


Topic  1
not, good, like, film, feel, make, disney, character, story, voice, song, no, look, time, version, live, go, remake, watch, animate, act, think, scene, action, animal, only, cgi, live action, bad, little

Topic  2
movie, good, enjoy, amaze, well, watch, disney, beautiful, make, time, kid, go, animate, like, old, give, want, bad, disappoint, thing, only, way, think, scene, no, come, little, nothing, real, act

Topic  3
great, smith, genie, job, music, cast, story, aladdin, amaze, remake, classic, good, actor, song, act, enjoy, singe, kid, new, think, animation, cgi, action, cartoon, make, little, live, disney, version, old

Topic  4
love, amaze, kid, story, song, smith, music, old, new, beautiful, go, version, genie, king, aladdin, think, lion king, lion, job, watch, disney, real, animal, singe, cartoon, time, add, want, cast, little

Topic  5
original, well, remake, watch, new, lack, lion, king, lion king, emotion, nothing, no, voice, song, version, add, like, animation, scen

In [324]:
space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}


def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_X_subtrain_TFIDF_NMF_ros, y_ros_resampled_subtrain, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }


MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [02:07<00:00,  3.91it/s, best loss: 0.24201968977730548]


{'loss': 0.24201968977730548,
 'params': {'C': 0.05, 'penalty': 'l2', 'solver': 'liblinear'},
 'status': 'ok'}

# Re-Trying SMOTE

In [344]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

# #SUBTRAIN
tvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05)
testing_tfidf = tvec.fit_transform(X_subtrain.iloc[:,0].tolist())

#SMOTE Oversampling
X_smoted_subtrain, y_smoted_subtrain = SMOTE(random_state=0).fit_sample(testing_tfidf,y_subtrain)
NMF_smote = NMF(n_components=5,random_state=10)
TFIDF_NMF_smote = NMF_smote.fit_transform(X_smoted_subtrain)
X_subtrain_TFIDF_NMF_smote_normalized = normalize(TFIDF_NMF_smote)

X_TFIDF_NMF_smoted_subtrain = pd.DataFrame(X_subtrain_TFIDF_NMF_smote_normalized, columns = ['topic1', 'topic2', 'topic3', 'topic4', 'topic5'])
y_TFIDF_NMF_smoted_subtrain = pd.DataFrame(y_smoted_subtrain, columns = ['sentiment'])

ss= StandardScaler()
scaled_X_subtrain_TFIDF_NMF_smoted = ss.fit_transform(X_TFIDF_NMF_smoted_subtrain)


###VALID

valid_corpus = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews
Pipeline_TFIDF_NMF_valid = Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,5), min_df=0.05)),
                ('NMF', NMF(n_components=5,random_state=10))])
TFIDF_NMF_valid = Pipeline_TFIDF_NMF_valid.fit_transform(valid_corpus)

X_valid_TFIDF_NMF_normalized = normalize(TFIDF_NMF_valid)
scaled_valid_TFIDF_NMF = ss.transform(X_valid_TFIDF_NMF_normalized)

logreg_classwt_NMF = LogisticRegression(C=0.001, penalty='l1', solver='liblinear', random_state=41, max_iter=500).fit(scaled_X_subtrain_TFIDF_NMF_smoted, y_smoted_subtrain)
y_pred_classwt_NMF_subtrain = logreg_classwt_NMF.predict(scaled_X_subtrain_TFIDF_NMF_smoted)
y_pred_classwt_NMF_valid = logreg_classwt_NMF.predict(scaled_valid_TFIDF_NMF)

print(roc_auc_score(y_smoted_subtrain, y_pred_classwt_NMF_subtrain))
print(roc_auc_score(y_valid, y_pred_classwt_NMF_valid))
logreg_classwt_NMF.coef_

0.7053426766040182
0.5869476877289378


array([[-0.57157538,  0.45251235, -0.11680827, -0.11345774,  0.27151169]])

In [341]:
space = {'C': hp.choice('C', [0.001,0.05, 0.01,0.1,0.5,1,5, 10,50,100,500,1000,5000, 10000]),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'penalty': hp.choice('penalty', ['l1','l2'])}


def objective(params):
    """Objective function for Logistic Regerssion Machine Hyperparameter Tuning"""
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    
    model = LogisticRegression(**params, random_state=41, verbose=0,max_iter=500)
    
    best_score = cross_val_score(model, scaled_X_subtrain_TFIDF_NMF_smoted, y_smoted_subtrain, cv=5, scoring='roc_auc').mean()
    
    loss = 1 - best_score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }


MAX_EVALS = 500
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

100%|██████████| 500/500 [02:11<00:00,  3.79it/s, best loss: 0.22690399557119623]


{'loss': 0.22690399557119623,
 'params': {'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'},
 'status': 'ok'}

In [345]:
display_topics(NMF_smote, tvec.get_feature_names(), 30)


Topic  1
not, like, think, go, feel, voice, song, time, look, movie, character, watch, only, kid, scene, animal, actor, little, make, story, cgi, aladdin, act, way, old, cast, real, music, enjoy, version

Topic  2
movie, love, great, great movie, amaze, loved, enjoy, smith, awesome, kid, watch, beautiful, go, music, well, old, time, disney, job, fun, make, genie, aladdin, think, only, animate, like, way, little, song

Topic  3
original, well, new, watch, remake, song, scene, loved, lion, version, king, way, story, music, time, voice, little, fun, amaze, like, go, cgi, cast, no, think, awesome, love, beautiful, feel, animate

Topic  4
film, like, disney, make, remake, feel, no, character, animate, live, story, voice, version, action, lion, king, live action, well, classic, watch, look, act, time, new, scene, song, animal, cgi, little, only

Topic  5
good, smith, genie, great, job, aladdin, music, story, cast, think, enjoy, song, actor, fun, loved, amaze, act, little, version, awesome, 

# -----DO NOT USE------

In [327]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

#There's a cool LDA visualization available, but I did not end up using LDA... also it needs a corpus and a "dictionary"--which I couldn't get to work

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.utils import tokenize

from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from pprint import pprint

flatlist = [word for word in corpus_subtrain]
tokenized = tokenize(str(flatlist))
list(tokenized)

gensim_corpus = list(tokenized)
dictionary = Dictionary(list(tokenized))

pyLDAvis.gensim.prepare(lda_subtrain,gensim_corpus_subtrain, ___DICTIONARY)

In [328]:
X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

###SUBTRAIN
corpus_subtrain = X_subtrain.iloc[:,0].tolist()

tfidfvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film'])
doc_word_tfidfvec_subtrain = tfidfvec.fit_transform(corpus_subtrain)

# Convert sparse matrix of counts to a gensim corpus
gensim_corpus_subtrain = matutils.Sparse2Corpus(doc_word_tfidfvec_subtrain.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda_subtrain = models.LdaModel(corpus=gensim_corpus_subtrain, num_topics=14, id2word=id2word, passes=5)

# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus_subtrain = lda_subtrain[gensim_corpus_subtrain]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs_subtrain = [doc for doc in lda_corpus_subtrain]

tuples_doc_topic_df = pd.DataFrame(lda_docs_subtrain,columns=['topic1','topic2','topic3','topic4','topic5','topic6','topic7','topic8','topic9','topic10','topic11','topic12','topic13','topic14'])
tuples_doc_topic_df['topic1'] = tuples_doc_topic_df['topic1'].str[1].astype(float)
tuples_doc_topic_df['topic2'] = tuples_doc_topic_df['topic2'].str[1].astype(float)
tuples_doc_topic_df['topic3'] = tuples_doc_topic_df['topic3'].str[1].astype(float)
tuples_doc_topic_df['topic4'] = tuples_doc_topic_df['topic4'].str[1].astype(float)
tuples_doc_topic_df['topic5'] = tuples_doc_topic_df['topic5'].str[1].astype(float)
tuples_doc_topic_df['topic6'] = tuples_doc_topic_df['topic6'].str[1].astype(float)
tuples_doc_topic_df['topic7'] = tuples_doc_topic_df['topic7'].str[1].astype(float)
tuples_doc_topic_df['topic8'] = tuples_doc_topic_df['topic8'].str[1].astype(float)
tuples_doc_topic_df['topic9'] = tuples_doc_topic_df['topic9'].str[1].astype(float)
tuples_doc_topic_df['topic10'] = tuples_doc_topic_df['topic10'].str[1].astype(float)
tuples_doc_topic_df['topic11'] = tuples_doc_topic_df['topic11'].str[1].astype(float)
tuples_doc_topic_df['topic12'] = tuples_doc_topic_df['topic12'].str[1].astype(float)
tuples_doc_topic_df['topic13'] = tuples_doc_topic_df['topic13'].str[1].astype(float)
tuples_doc_topic_df['topic14'] = tuples_doc_topic_df['topic14'].str[1].astype(float)

X_subtrain_TFIDF_LDA_normalized = normalize(tuples_doc_topic_df)
ss = StandardScaler()
scaled_subtrain_TFIDF_LDA = ss.fit_transform(X_subtrain_TFIDF_LDA_normalized)


###VALID

corpus_valid = X_valid.iloc[:,0].tolist()  #convert from dataframe to series, to transform into list of reviews

tfidfvec = TfidfVectorizer(ngram_range=(1,5), stop_words=['movie', 'film'])
doc_word_tfidfvec_valid = tfidfvec.fit_transform(corpus_valid)

# Convert sparse matrix of counts to a gensim corpus
gensim_corpus_valid = matutils.Sparse2Corpus(doc_word_tfidfvec_valid.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda_valid = models.LdaModel(corpus=gensim_corpus_valid, num_topics=14, id2word=id2word, passes=5)

# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus_valid = lda_valid[gensim_corpus_valid]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs_valid = [doc for doc in lda_corpus_valid]

tuples_doc_topic_df = pd.DataFrame(lda_docs_valid,columns=['topic1','topic2','topic3','topic4','topic5','topic6','topic7','topic8','topic9','topic10','topic11','topic12','topic13','topic14'])
tuples_doc_topic_df['topic1'] = tuples_doc_topic_df['topic1'].str[1].astype(float)
tuples_doc_topic_df['topic2'] = tuples_doc_topic_df['topic2'].str[1].astype(float)
tuples_doc_topic_df['topic3'] = tuples_doc_topic_df['topic3'].str[1].astype(float)
tuples_doc_topic_df['topic4'] = tuples_doc_topic_df['topic4'].str[1].astype(float)
tuples_doc_topic_df['topic5'] = tuples_doc_topic_df['topic5'].str[1].astype(float)
tuples_doc_topic_df['topic6'] = tuples_doc_topic_df['topic6'].str[1].astype(float)
tuples_doc_topic_df['topic7'] = tuples_doc_topic_df['topic7'].str[1].astype(float)
tuples_doc_topic_df['topic8'] = tuples_doc_topic_df['topic8'].str[1].astype(float)
tuples_doc_topic_df['topic9'] = tuples_doc_topic_df['topic9'].str[1].astype(float)
tuples_doc_topic_df['topic10'] = tuples_doc_topic_df['topic10'].str[1].astype(float)
tuples_doc_topic_df['topic11'] = tuples_doc_topic_df['topic11'].str[1].astype(float)
tuples_doc_topic_df['topic12'] = tuples_doc_topic_df['topic12'].str[1].astype(float)
tuples_doc_topic_df['topic13'] = tuples_doc_topic_df['topic13'].str[1].astype(float)
tuples_doc_topic_df['topic14'] = tuples_doc_topic_df['topic14'].str[1].astype(float)

X_valid_TFIDF_LDA_normalized = normalize(tuples_doc_topic_df)
scaled_valid_TFIDF_LDA = ss.transform(X_valid_TFIDF_LDA_normalized)


logreg_classwt_LDA = LogisticRegression(C=0.5, penalty='l2', solver='saga', class_weight='balanced', random_state=41, max_iter=500).fit(scaled_subtrain_TFIDF_LDA, y_subtrain)
y_pred_classwt_LDA_subtrain = logreg_classwt_LDA.predict(scaled_subtrain_TFIDF_LDA)
y_pred_classwt_LDA_valid = logreg_classwt_LDA.predict(scaled_valid_TFIDF_LDA)

print(roc_auc_score(y_subtrain, y_pred_classwt_LDA_subtrain))
print(roc_auc_score(y_valid, y_pred_classwt_LDA_valid))
logreg_classwt_LDA.coef_

0.6680918480335074
0.4983974358974359


array([[-0.17658163, -0.31929957,  0.41328625, -0.07867729,  0.33119805,
        -0.19865915,  0.08573398, -0.04827216, -0.59259475,  0.13543375,
        -0.25661444,  0.37652611, -0.15059132, -0.33822592]])

In [330]:
print(len(gensim_corpus))

0


In [329]:
id2word

{48023: 'best',
 2261: '2019',
 48396: 'better',
 124388: 'endgame',
 282811: 'no',
 114000: 'doubt',
 48024: 'best 2019',
 2306: '2019 better',
 48420: 'better endgame',
 124389: 'endgame no',
 283088: 'no doubt',
 48025: 'best 2019 better',
 2307: '2019 better endgame',
 48421: 'better endgame no',
 124390: 'endgame no doubt',
 48026: 'best 2019 better endgame',
 2308: '2019 better endgame no',
 48422: 'better endgame no doubt',
 48027: 'best 2019 better endgame no',
 2309: '2019 better endgame no doubt',
 4616: 'absolutely',
 244064: 'love',
 158204: 'fresh',
 279213: 'new',
 380967: 'song',
 170585: 'good',
 78749: 'classic',
 377432: 'smith',
 192048: 'hold',
 163107: 'genie',
 424580: 'twist',
 47467: 'beloved',
 69524: 'character',
 325629: 'pleasantly',
 401507: 'surprise',
 4988: 'absolutely love',
 245388: 'love fresh',
 158354: 'fresh new',
 280899: 'new song',
 381942: 'song good',
 171477: 'good classic',
 79759: 'classic love',
 247402: 'love smith',
 378709: 'smith hold'