In [34]:
import pandas as pd
import numpy as np
import sklearn
import gensim

#Vectorizers
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

#Dimensionality Reduction
from sklearn.decomposition import TruncatedSVD #LSA
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation as LDA
from gensim import corpora, models, similarities, matutils #LDA

#Clustering
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
# !conda install -c districtdatalabs yellowbrick
from yellowbrick.cluster import KElbowVisualizer

#Word Embeddings
from gensim.models import Word2Vec

#Pipeline
from sklearn.pipeline import Pipeline

#Bayes Optimization Parameter Tuner
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

#Visualization
import matplotlib.pyplot as plt
# import umap
# import hdbscan
# import sklearn.cluster as cluster


# adjectives = []
# for sent in hp.sents:
#     for word in sent:
#         if 'Harry' in word.string:
#             for child in word.children:
#                 if child.pos_ == 'ADJ': adjectives.append(child.string.strip()) #part of speech         
# Counter(adjectives).most_common(10)


# Import Data

In [35]:
full_df = pd.read_pickle('full_df_preprocessed.pkl')

In [36]:
positive_df = full_df[full_df['sentiment']=='positive']
negative_df = full_df[full_df['sentiment']=='negative']
neutral_df = full_df[full_df['sentiment']=='neutral']

# Pipeline of Vectorization, Dimensionality Reduction, and Clustering Methods

In [37]:
full_df['movie'].value_counts()

lionking          20791
aladdin           19293
beautyandbeast     4571
junglebook         3982
cinderella         2875
Name: movie, dtype: int64

In [38]:
corpus = [items[1] for items in full_df['review_processed'].iteritems()]


In [39]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Function to display topics from Vectorizer after performing Vectorization and Dimensionality Reduction"""
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [40]:
tfidfvec = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film'])
doc_word_tfidfvec = tfidfvec.fit_transform(corpus)
print(doc_word_tfidfvec.shape)
# Convert sparse matrix of counts to a gensim corpus
gensim_corpus = matutils.Sparse2Corpus(doc_word_tfidfvec.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=gensim_corpus, num_topics=5, id2word=id2word, passes=5)

lda.print_topics()

# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus = lda[gensim_corpus]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs = [doc for doc in lda_corpus]

# Check out the document vectors in the topic space for the first 5 documents
lda_docs[0:5]


(51512, 58)


[[(0, 0.03974516),
  (1, 0.6719355),
  (2, 0.04192431),
  (3, 0.20762521),
  (4, 0.038769882)],
 [(0, 0.033494614),
  (1, 0.6856083),
  (2, 0.03349127),
  (3, 0.2115014),
  (4, 0.03590439)],
 [(0, 0.067977145),
  (1, 0.7286146),
  (2, 0.066814244),
  (3, 0.06694729),
  (4, 0.069646716)],
 [(0, 0.20920748),
  (1, 0.4636509),
  (2, 0.043288764),
  (3, 0.24291101),
  (4, 0.040941782)],
 [(0, 0.044964857),
  (1, 0.7844885),
  (2, 0.046877317),
  (3, 0.04423541),
  (4, 0.07943385)]]

In [117]:
full_topic_doc_matrix = pd.DataFrame(lda_docs,columns=['topic0','topic1','topic2','topic3','topic4'])


full_topic_doc_matrix['topic0'] = full_topic_doc_matrix['topic0'].str[1].astype(float)
full_topic_doc_matrix['topic1'] = full_topic_doc_matrix['topic1'].str[1].astype(float)
full_topic_doc_matrix['topic2'] = full_topic_doc_matrix['topic2'].str[1].astype(float)
full_topic_doc_matrix['topic3'] = full_topic_doc_matrix['topic3'].str[1].astype(float)
full_topic_doc_matrix['topic4'] = full_topic_doc_matrix['topic4'].str[1].astype(float)

full_df['topic'] = full_topic_doc_matrix.idxmax(axis=1)
full_df


Unnamed: 0,review_text,movie,review_site,rating,sentiment,review_tokens,review_processed,topic
0,"Disney, WHAT. HAVE. YOU. DONE Just to be clea...",lionking,imdb,1,negative,"[disney, clear, time, favorite, movie, not, st...",disney clear time favorite movie not stress en...,topic1
1,No soul. The original Lion King is one of my ...,lionking,imdb,1,negative,"[no, soul, original, lion, king, favorite, mov...",no soul original lion king favorite movie time...,topic1
5,Seriously? So anyone else notice it has a hig...,lionking,imdb,1,negative,"[seriously, notice, high, score, 7.5, rating, ...",seriously notice high score 7.5 rating not str...,topic3
6,Overrated and way too much spotlight on beyon...,lionking,imdb,1,negative,"[overrated, way, spotlight, beyonce, lion, kin...",overrated way spotlight beyonce lion king only...,topic1
8,Terrible acting!! Doesn't compare to the orig...,lionking,imdb,1,negative,"[terrible, act, not, compare, original, love, ...",terrible act not compare original love origina...,topic0
...,...,...,...,...,...,...,...,...
3040,A magically wonderful film filled with adventu...,cinderella,rottentomatoes,5,positive,"[magically, wonderful, film, fill, with, adven...",magically wonderful film fill with adventure f...,topic1
3041,Disney has overdid the faithfulness of their o...,cinderella,rottentomatoes,4,positive,"[disney, overdo, faithfulness, animate, classi...",disney overdo faithfulness animate classic pro...,topic1
3042,Magic....that's about right. A re-tell of the ...,cinderella,rottentomatoes,4,positive,"[magic, ...., right, tell, original, disney, m...",magic .... right tell original disney movie wi...,topic4
3043,A good movie that sets it apart from the origi...,cinderella,rottentomatoes,4,positive,"[good, movie, set, apart, original, story, cin...",good movie set apart original story cinderella...,topic2


# Positive Reviews

In [None]:
positive_df = full_df[full_df['sentiment']=='positive']


positive_corpus = [items[1] for items in positive_df['review_processed'].iteritems()]

tfidfvec_pos = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film'])
doc_word_tfidfvec_pos = tfidfvec_pos.fit_transform(positive_corpus)
print(doc_word_tfidfvec_pos.shape)
# Convert sparse matrix of counts to a gensim corpus
gensim_corpus_pos = matutils.Sparse2Corpus(doc_word_tfidfvec_pos.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec_pos.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=gensim_corpus_pos, num_topics=5, id2word=id2word, passes=5)

lda.print_topics()

# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus_pos = lda[gensim_corpus_pos]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs_pos = [doc for doc in lda_corpus_pos]

# Check out the document vectors in the topic space for the first 5 documents
lda_docs_pos[0:5]



In [105]:
pos_topic_doc_matrix = pd.DataFrame(lda_docs_pos,columns=['topic0','topic1','topic2','topic3','topic4'])


pos_topic_doc_matrix['topic0'] = pos_topic_doc_matrix['topic0'].str[1].astype(float)
pos_topic_doc_matrix['topic1'] = pos_topic_doc_matrix['topic1'].str[1].astype(float)
pos_topic_doc_matrix['topic2'] = pos_topic_doc_matrix['topic2'].str[1].astype(float)
pos_topic_doc_matrix['topic3'] = pos_topic_doc_matrix['topic3'].str[1].astype(float)
pos_topic_doc_matrix['topic4'] = pos_topic_doc_matrix['topic4'].str[1].astype(float)

positive_df['topic'] = pos_topic_doc_matrix.idxmax(axis=1)

In [114]:
positive_df['topic'] = pos_topic_doc_matrix.idxmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [115]:
positive_df

Unnamed: 0,review_text,movie,review_site,rating,sentiment,review_tokens,review_processed,topic
260,Wonderful Animal Movie 2019 Rating 9.1/10This...,lionking,imdb,9,positive,"[wonderful, animal, movie, 2019, rating, 9.1/1...",wonderful animal movie 2019 rating 9.1/10this ...,topic4
381,Don't listen to the Critics!!! This movie is ...,lionking,imdb,10,positive,"[not, listen, critics, movie, amazing, imagery...",not listen critics movie amazing imagery color...,topic4
469,Great movie The lion king is propably the bes...,lionking,imdb,7,positive,"[great, movie, lion, king, propably, well, liv...",great movie lion king propably well live actio...,topic0
470,Best remake for sure! Don't listen to any cri...,lionking,imdb,10,positive,"[best, remake, sure, not, listen, critic, peop...",best remake sure not listen critic people like...,topic0
486,Timon and Pumba save the world If I'm being h...,lionking,imdb,7,positive,"[timon, pumba, save, world, honest, well, timo...",timon pumba save world honest well timon pumba...,topic0
...,...,...,...,...,...,...,...,...
3040,A magically wonderful film filled with adventu...,cinderella,rottentomatoes,5,positive,"[magically, wonderful, film, fill, with, adven...",magically wonderful film fill with adventure f...,topic0
3041,Disney has overdid the faithfulness of their o...,cinderella,rottentomatoes,4,positive,"[disney, overdo, faithfulness, animate, classi...",disney overdo faithfulness animate classic pro...,topic0
3042,Magic....that's about right. A re-tell of the ...,cinderella,rottentomatoes,4,positive,"[magic, ...., right, tell, original, disney, m...",magic .... right tell original disney movie wi...,topic1
3043,A good movie that sets it apart from the origi...,cinderella,rottentomatoes,4,positive,"[good, movie, set, apart, original, story, cin...",good movie set apart original story cinderella...,topic4


# Negative Reviews

In [118]:
negative_df = full_df[full_df['sentiment']=='negative']

negative_corpus = [items[1] for items in negative_df['review_processed'].iteritems()]


In [119]:
tfidfvec_neg = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film'])
doc_word_tfidfvec_neg = tfidfvec_neg.fit_transform(negative_corpus)
print(doc_word_tfidfvec_neg.shape)
# Convert sparse matrix of counts to a gensim corpus
gensim_corpus_neg = matutils.Sparse2Corpus(doc_word_tfidfvec_neg.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec_neg.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=gensim_corpus_neg, num_topics=5, id2word=id2word, passes=5)

lda.print_topics()

# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus_neg = lda[gensim_corpus_neg]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs_neg = [doc for doc in lda_corpus_neg]

# Check out the document vectors in the topic space for the first 5 documents
lda_docs_neg[0:5]



(5941, 86)


[[(0, 0.03887234),
  (1, 0.03829777),
  (2, 0.4030292),
  (3, 0.03804812),
  (4, 0.4817526)],
 [(0, 0.030844288),
  (1, 0.03179345),
  (2, 0.7182067),
  (3, 0.030969629),
  (4, 0.18818597)],
 [(0, 0.067958884),
  (1, 0.35206753),
  (2, 0.06778649),
  (3, 0.06850943),
  (4, 0.4436777)],
 [(0, 0.18893284),
  (1, 0.03973202),
  (2, 0.5104033),
  (3, 0.039378364),
  (4, 0.22155349)],
 [(0, 0.81381106),
  (1, 0.064637534),
  (2, 0.03939757),
  (3, 0.041824307),
  (4, 0.04032958)]]

In [148]:
tuples_df_neg = pd.DataFrame(lda_docs_neg,columns=['topic1','topic2','topic3','topic4','topic5'])

In [149]:
tuples_df_neg['topic1'] = tuples_df_neg['topic1'].str[1].astype(float)
tuples_df_neg['topic2'] = tuples_df_neg['topic2'].str[1].astype(float)
tuples_df_neg['topic3'] = tuples_df_neg['topic3'].str[1].astype(float)
tuples_df_neg['topic4'] = tuples_df_neg['topic4'].str[1].astype(float)
tuples_df_neg['topic5'] = tuples_df_neg['topic5'].str[1].astype(float)

In [150]:
tuples_df_neg

Unnamed: 0,topic1,topic2,topic3,topic4,topic5
0,0.038872,0.038298,0.403029,0.038048,0.481753
1,0.030844,0.031793,0.718207,0.030970,0.188186
2,0.067959,0.352068,0.067786,0.068509,0.443678
3,0.188933,0.039732,0.510403,0.039378,0.221553
4,0.813811,0.064638,0.039398,0.041824,0.040330
...,...,...,...,...,...
5936,0.746404,0.063028,0.063126,0.063224,0.064218
5937,0.064560,0.064955,0.741499,0.064811,0.064175
5938,0.565213,0.053171,0.056413,0.273427,0.051777
5939,0.101585,0.592207,0.101725,0.103402,0.101081


In [121]:
neg_topic_doc_matrix = pd.DataFrame(lda_docs,columns=['topic1','topic2','topic3','topic4','topic5'])


neg_topic_doc_matrix['topic1'] = neg_topic_doc_matrix['topic1'].str[1].astype(float)
neg_topic_doc_matrix['topic2'] = neg_topic_doc_matrix['topic2'].str[1].astype(float)
neg_topic_doc_matrix['topic3'] = neg_topic_doc_matrix['topic3'].str[1].astype(float)
neg_topic_doc_matrix['topic4'] = neg_topic_doc_matrix['topic4'].str[1].astype(float)
neg_topic_doc_matrix['topic5'] = neg_topic_doc_matrix['topic5'].str[1].astype(float)

negative_df['topic'] = neg_topic_doc_matrix.idxmax(axis=1)
negative_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,review_text,movie,review_site,rating,sentiment,review_tokens,review_processed,topic
0,"Disney, WHAT. HAVE. YOU. DONE Just to be clea...",lionking,imdb,1,negative,"[disney, clear, time, favorite, movie, not, st...",disney clear time favorite movie not stress en...,topic1
1,No soul. The original Lion King is one of my ...,lionking,imdb,1,negative,"[no, soul, original, lion, king, favorite, mov...",no soul original lion king favorite movie time...,topic1
5,Seriously? So anyone else notice it has a hig...,lionking,imdb,1,negative,"[seriously, notice, high, score, 7.5, rating, ...",seriously notice high score 7.5 rating not str...,topic3
6,Overrated and way too much spotlight on beyon...,lionking,imdb,1,negative,"[overrated, way, spotlight, beyonce, lion, kin...",overrated way spotlight beyonce lion king only...,topic1
8,Terrible acting!! Doesn't compare to the orig...,lionking,imdb,1,negative,"[terrible, act, not, compare, original, love, ...",terrible act not compare original love origina...,topic0
...,...,...,...,...,...,...,...,...
2954,nothing new to add. still think ever after was...,cinderella,rottentomatoes,2,negative,"[nothing, new, add, think, well, adaptation, c...",nothing new add think well adaptation cinderella,topic2
2972,Waste of time.not for kids. Only for old foks,cinderella,rottentomatoes,1,negative,"[waste, time.not, kid, only, old, foks]",waste time.not kid only old foks,topic0
2987,I feel like the real-life incarnation really b...,cinderella,rottentomatoes,2,negative,"[feel, like, real, life, incarnation, bring, h...",feel like real life incarnation bring home rid...,topic1
3023,The only performance should be mentioned and m...,cinderella,rottentomatoes,2,negative,"[only, performance, mention, mark, cate, perfo...",only performance mention mark cate performance...,topic1


# Neutral Reviews

In [122]:
neutral_df = full_df[full_df['sentiment']=='neutral']

neutral_corpus = [items[1] for items in neutral_df['review_processed'].iteritems()]



In [128]:
tfidfvec_neut = TfidfVectorizer(ngram_range=(1,5), min_df=0.05, stop_words=['movie', 'film'])
doc_word_tfidfvec_neut = tfidfvec_neut.fit_transform(negative_corpus)
print(doc_word_tfidfvec_neut.shape)
# Convert sparse matrix of counts to a gensim corpus
gensim_corpus_neut = matutils.Sparse2Corpus(doc_word_tfidfvec_neut.transpose())

#Map matrix rows to words (tokens)
#We need to save a mapping (dict) of row id to word (token) for later use by gensim:
id2word = dict((v, k) for k, v in tfidfvec_neut.vocabulary_.items())

# Create lda model (equivalent to "fit" in sklearn)
lda_neut = models.LdaModel(corpus=gensim_corpus_neut, num_topics=5, id2word=id2word, passes=5)


# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus_neut = lda[gensim_corpus_neut]


# Store the documents' topic vectors in a list so we can take a peak
lda_docs_neut = [doc for doc in lda_corpus_neut]

# Check out the document vectors in the topic space for the first 5 documents
lda_docs_neut[0:5]


(5941, 86)


[[(0, 0.33028632),
  (1, 0.038276147),
  (2, 0.038460325),
  (3, 0.5549804),
  (4, 0.037996832)],
 [(0, 0.3130868),
  (1, 0.030381668),
  (2, 0.031434707),
  (3, 0.594429),
  (4, 0.030667776)],
 [(0, 0.5396535),
  (1, 0.06744873),
  (2, 0.2554234),
  (3, 0.069422394),
  (4, 0.06805197)],
 [(0, 0.039305132),
  (1, 0.20891151),
  (2, 0.43430197),
  (3, 0.039110247),
  (4, 0.2783711)],
 [(0, 0.3521825),
  (1, 0.039799344),
  (2, 0.05374109),
  (3, 0.5144035),
  (4, 0.039873563)]]

In [124]:
neut_topic_doc_matrix = pd.DataFrame(lda_docs,columns=['topic0','topic1','topic2','topic3','topic4'])


neut_topic_doc_matrix['topic0'] = neut_topic_doc_matrix['topic0'].str[1].astype(float)
neut_topic_doc_matrix['topic1'] = neut_topic_doc_matrix['topic1'].str[1].astype(float)
neut_topic_doc_matrix['topic2'] = neut_topic_doc_matrix['topic2'].str[1].astype(float)
neut_topic_doc_matrix['topic3'] = neut_topic_doc_matrix['topic3'].str[1].astype(float)
neut_topic_doc_matrix['topic4'] = neut_topic_doc_matrix['topic4'].str[1].astype(float)

neutral_df['topic'] = neut_topic_doc_matrix.idxmax(axis=1)
neutral_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,review_text,movie,review_site,rating,sentiment,review_tokens,review_processed,topic
245,Not bad but not great. I thought the CGI was ...,lionking,imdb,5,neutral,"[not, bad, not, great, think, cgi, great, time...",not bad not great think cgi great time emotion...,topic1
282,Copyright Why they didn't yet give credit to ...,lionking,imdb,6,neutral,"[copyright, not, give, credit, kimba, white, l...",copyright not give credit kimba white lion???t...,topic1
422,The same story + CG + blandness = pointless Y...,lionking,imdb,5,neutral,"[story, cg, blandness, pointless, yes, cg, fil...",story cg blandness pointless yes cg film pheno...,topic1
423,Disappointed It was disappointed to watch it ...,lionking,imdb,5,neutral,"[disappointed, disappoint, watch, with, nothin...",disappointed disappoint watch with nothing new...,topic1
427,"no point Visually it's amazing, for the first...",lionking,imdb,5,neutral,"[no, point, visually, amaze, ﻿1, 10, minute, s...",no point visually amaze ﻿1 10 minute story son...,topic1
...,...,...,...,...,...,...,...,...
3018,With unlimited card - Sai and Akeela,cinderella,rottentomatoes,3,neutral,"[with, unlimited, card, sai, akeela]",with unlimited card sai akeela,topic1
3019,An alright movie. Though if you're looking for...,cinderella,rottentomatoes,3,neutral,"[alright, movie, look, fresh, not]",alright movie look fresh not,topic1
3022,good but not as good as malificent.,cinderella,rottentomatoes,3,neutral,"[good, not, good, malificent]",good not good malificent,topic1
3027,"A delightful, beautiful take on the classic fa...",cinderella,rottentomatoes,3,neutral,"[delightful, beautiful, classic, fairytale, ge...",delightful beautiful classic fairytale genuine...,topic1


In [130]:
lda_neut.print_topics(num_words=20)

[(0,
  '0.076*"not" + 0.039*"animate" + 0.037*"live" + 0.036*"version" + 0.035*"disney" + 0.033*"action" + 0.032*"with" + 0.031*"original" + 0.030*"live action" + 0.030*"story" + 0.022*"new" + 0.022*"like" + 0.022*"nothing" + 0.022*"old" + 0.021*"make" + 0.020*"money" + 0.019*"remake" + 0.019*"time" + 0.017*"classic" + 0.017*"good"'),
 (1,
  '0.076*"original" + 0.053*"not" + 0.052*"cartoon" + 0.044*"remake" + 0.042*"watch" + 0.032*"miss" + 0.031*"well" + 0.028*"change" + 0.028*"no" + 0.027*"make" + 0.025*"with" + 0.022*"visual" + 0.022*"voice" + 0.021*"go" + 0.021*"disney" + 0.020*"line" + 0.020*"little" + 0.019*"king" + 0.019*"like" + 0.019*"classic"'),
 (2,
  '0.075*"good" + 0.061*"not good" + 0.058*"not" + 0.052*"lion" + 0.049*"king" + 0.047*"original" + 0.045*"lion king" + 0.043*"lack" + 0.034*"emotion" + 0.033*"great" + 0.033*"animation" + 0.033*"flat" + 0.032*"fall" + 0.019*"voice" + 0.018*"feel" + 0.018*"cgi" + 0.017*"with" + 0.014*"no" + 0.014*"story" + 0.012*"like"'),
 (3,
  '

In [132]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def show_wordcloud(data, title = None):
    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 200,
        max_font_size = 40, 
        scale = 3,
        random_state = 42
    ).generate(str(data))

    fig = plt.figure(1, figsize = (20, 20))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud)
    plt.show()
    
# print wordcloud
show_wordcloud(lda_neut["0"])

ValueError: not enough values to unpack (expected 2, got 1)

# Sentiment (Rating) Prediction

In [None]:
max_depth_range = list(range(1, 10))
n_estimators_range = list(range(50, 300))
gamma_range = list(range(0, 5))
param_dist = dict(max_depth = max_depth_range, n_estimators = n_estimators_range, gamma = gamma_range)
print(param_dist)

rand = RandomizedSearchCV(XGBClassifier(random_state=41), param_dist, cv=5, scoring='roc_auc')
rand.fit(X_ros_resampled_TRAIN, y_ros_resampled_TRAIN.values.ravel())
rand.cv_results_

print(rand.best_score_)
print(rand.best_params_)

In [None]:
#RANDOM Oversampling
#print(X_train.columns)
ros = RandomOverSampler(random_state=0)
X_ros_resampled_subtrain, y_ros_resampled_subtrain = ros.fit_sample(X_subtrain,y_subtrain)
X_ros_resampled_subtrain = pd.DataFrame(X_ros_resampled_subtrain, columns = ['Administrative',  'Administrative_Duration',
 'Informational',  'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',  
 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Weekend',
 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_Jun', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
 'VisitorType_New_Visitor',  'VisitorType_Returning_Visitor', 'OperatingSystem_1', 'OperatingSystem_2',
 'OperatingSystem_3', 'OperatingSystem_4', 'OperatingSystem_6', 'OperatingSystem_7', 'OperatingSystem_8',
 'Browser_1', 'Browser_2', 'Browser_3', 'Browser_4', 'Browser_5', 'Browser_6', 'Browser_7', 'Browser_8', 'Browser_10',
 'Browser_11', 'Browser_12', 'Browser_13', 'Region_1', 'Region_2', 'Region_3', 'Region_4', 'Region_6', 'Region_7', 'Region_8', 'Region_9', 
 'TrafficType_1', 'TrafficType_2', 'TrafficType_3', 'TrafficType_4', 'TrafficType_5', 'TrafficType_6', 'TrafficType_7',
 'TrafficType_8', 'TrafficType_9', 'TrafficType_10', 'TrafficType_11', 'TrafficType_12', 'TrafficType_13', 'TrafficType_14',
 'TrafficType_15', 'TrafficType_16', 'TrafficType_18', 'TrafficType_19', 'TrafficType_20'])
y_ros_resampled_subtrain = pd.DataFrame(y_ros_resampled_subtrain, columns = ['Purchase'])
y_ros_resampled_subtrain

#SMOTE Oversampling
X_smoted_subtrain, y_smoted_subtrain = SMOTE(random_state=0).fit_sample(X_subtrain,y_subtrain)
X_smoted_subtrain = pd.DataFrame(X_smoted_subtrain, columns = ['Administrative',  'Administrative_Duration',
 'Informational',  'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',  
 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Weekend',
 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_Jun', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
 'VisitorType_New_Visitor',  'VisitorType_Returning_Visitor', 'OperatingSystem_1', 'OperatingSystem_2',
 'OperatingSystem_3', 'OperatingSystem_4', 'OperatingSystem_6', 'OperatingSystem_7', 'OperatingSystem_8',
 'Browser_1', 'Browser_2', 'Browser_3', 'Browser_4', 'Browser_5', 'Browser_6', 'Browser_7', 'Browser_8', 'Browser_10',
 'Browser_11', 'Browser_12', 'Browser_13', 'Region_1', 'Region_2', 'Region_3', 'Region_4', 'Region_6', 'Region_7', 'Region_8', 'Region_9', 
 'TrafficType_1', 'TrafficType_2', 'TrafficType_3', 'TrafficType_4', 'TrafficType_5', 'TrafficType_6', 'TrafficType_7',
 'TrafficType_8', 'TrafficType_9', 'TrafficType_10', 'TrafficType_11', 'TrafficType_12', 'TrafficType_13', 'TrafficType_14',
 'TrafficType_15', 'TrafficType_16', 'TrafficType_18', 'TrafficType_19', 'TrafficType_20'])
y_smoted_subtrain = pd.DataFrame(y_smoted_subtrain, columns = ['Purchase'])
y_smoted_subtrain

#ADASYN Oversampling
X_adasyn_subtrain, y_adasyn_subtrain = ADASYN(random_state=0).fit_sample(X_subtrain,y_subtrain)
X_adasyn_subtrain = pd.DataFrame(X_adasyn_subtrain, columns = ['Administrative',  'Administrative_Duration',
 'Informational',  'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',  
 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Weekend',
 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_Jun', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
 'VisitorType_New_Visitor',  'VisitorType_Returning_Visitor', 'OperatingSystem_1', 'OperatingSystem_2',
 'OperatingSystem_3', 'OperatingSystem_4', 'OperatingSystem_6', 'OperatingSystem_7', 'OperatingSystem_8',
 'Browser_1', 'Browser_2', 'Browser_3', 'Browser_4', 'Browser_5', 'Browser_6', 'Browser_7', 'Browser_8', 'Browser_10',
 'Browser_11', 'Browser_12', 'Browser_13', 'Region_1', 'Region_2', 'Region_3', 'Region_4', 'Region_6', 'Region_7', 'Region_8', 'Region_9', 
 'TrafficType_1', 'TrafficType_2', 'TrafficType_3', 'TrafficType_4', 'TrafficType_5', 'TrafficType_6', 'TrafficType_7',
 'TrafficType_8', 'TrafficType_9', 'TrafficType_10', 'TrafficType_11', 'TrafficType_12', 'TrafficType_13', 'TrafficType_14',
 'TrafficType_15', 'TrafficType_16', 'TrafficType_18', 'TrafficType_19', 'TrafficType_20'])
y_adasyn_subtrain = pd.DataFrame(y_adasyn_subtrain, columns = ['Purchase'])
y_adasyn_subtrain

In [None]:
#All models - Imbalanced
Model_Results = pd.DataFrame(columns = ['Model','Fbeta1.5','Fbeta2', 'F1','Precision','Recall', 'P-R_AUC', 'ROC_AUC_train','ROC_AUC_valid','FN','FP','TN','TP','Fit','Predict','Feature_Importance'])
Model_Results['Model'] = ['Dummy_Clas','KNN_Clas_3n','KNN_Clas_5n','KNN_Clas_7n', 'KNN_Clas_3n_weightdist', 'KNN_Clas_3n_Manhattandist','GaussianNB','LogisticRegression','LogisticRegressionCV','SVC',\
                          'DecisionTree_Clas','BalRF_Clas_50est_depth2','BalRF_Clas_100est_depth2','BalRF_Clas_50est_depth3','RF_Clas_50est_depth2','RF_Clas_100est_depth2','RF_Clas_200est_depth2','RF_Clas_100est_depth3','RF_Clas_100est_depth4',\
                          'XGB_Clas_95est_depth2','XGB_Clas_Scale_95est_depth2_PosWt', 'XGB_Clas_100est_depth2','XGB_Clas_50est_depth2','XGB_Clas_95est_depth3','XGB_Clas_95est_depth4','XGB_Clas_95est_depth3_PosWt','XGB_Clas_Scale_100est_depth2_PosWt']
models = DummyClassifier(random_state=41), KNeighborsClassifier(n_neighbors=3), KNeighborsClassifier(n_neighbors=5), KNeighborsClassifier(n_neighbors=7), KNeighborsClassifier(n_neighbors=3, weights='distance'),KNeighborsClassifier(n_neighbors=3, p=1), \
        GaussianNB(), LogisticRegression(C=1.0,random_state=41, max_iter=10000), LogisticRegressionCV(Cs=[100000,10000,1000,100,10,1,0.1,0.01,0.001],random_state=41,cv=5,max_iter=10000), SVC(random_state=41,probability=True), DecisionTreeClassifier(random_state=41), \
        BalancedRandomForestClassifier(n_estimators=50, max_depth=2,random_state=41), BalancedRandomForestClassifier(n_estimators=100, max_depth=2,random_state=41), BalancedRandomForestClassifier(n_estimators=50, max_depth=3,random_state=41), RandomForestClassifier(n_estimators=50, max_depth=2,random_state=41), RandomForestClassifier(n_estimators=100, max_depth=2,random_state=41), RandomForestClassifier(n_estimators=200, max_depth=2,random_state=41), RandomForestClassifier(n_estimators=100, max_depth=3,random_state=41),RandomForestClassifier(n_estimators=100, max_depth=4,random_state=41), \
        XGBClassifier(max_depth=2,n_estimators=95,random_state=41), XGBClassifier(max_depth=2,n_estimators=95,random_state=41,scale_pos_weight=(y_subtrain.values==0).sum()/(y_subtrain.values==1).sum()), XGBClassifier(max_depth=2,n_estimators=100,random_state=41), XGBClassifier(max_depth=2,n_estimators=50,random_state=41), XGBClassifier(max_depth=3,n_estimators=95,random_state=41),XGBClassifier(max_depth=4,n_estimators=95,random_state=41), XGBClassifier(max_depth=3,n_estimators=95,random_state=41,scale_pos_weight=(y_subtrain.values==0).sum()/(y_subtrain.values==1).sum()), XGBClassifier(max_depth=2,n_estimators=100,random_state=41,scale_pos_weight=(y_subtrain.values==0).sum()/(y_subtrain.values==1).sum())

fbeta1_5_list = []
fbeta2_list = []
f1_list = []
precision_list = []
recall_list = []
PR_AUC_list = []
ROC_AUC_train_list = []
ROC_AUC_valid_list = []
FN_list = []
FP_list = []
TN_list = []
TP_list = []
Fit_list = []
Predict_list = []
Feature_Importance_list = []


#std scale train and test: recommended workflow 
#scaler.fit x-->model.fit --> scaler transform x
#model.fit, scaler transform xtest 
#model learning from data training 
#scaled test --> leaks data
#train on trainset --if input data 

#pipeline --always gets this right
#scaler.fitx
#model.fit(scaler.transform(x))
#model.predict(scaler.transform(xtest)))
#stdscale target can do, but affects betas: less interpretable
#similar impact has similar beta 

for model in models:
    
    if 'CV' in str(model): 
        scalerlogregCV = StandardScaler().fit(X_train)
        X_train_transformed = scalerlogregCV.transform(X_train.values)
        X_train_transformed = pd.DataFrame(X_train_transformed, columns = ['Administrative',  'Administrative_Duration',
 'Informational',  'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',  
 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Weekend',
 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_Jun', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
 'VisitorType_New_Visitor',  'VisitorType_Returning_Visitor', 'OperatingSystem_1', 'OperatingSystem_2',
 'OperatingSystem_3', 'OperatingSystem_4', 'OperatingSystem_6', 'OperatingSystem_7', 'OperatingSystem_8',
 'Browser_1', 'Browser_2', 'Browser_3', 'Browser_4', 'Browser_5', 'Browser_6', 'Browser_7', 'Browser_8', 'Browser_10',
 'Browser_11', 'Browser_12', 'Browser_13', 'Region_1', 'Region_2', 'Region_3', 'Region_4', 'Region_6', 'Region_7', 'Region_8', 'Region_9', 
 'TrafficType_1', 'TrafficType_2', 'TrafficType_3', 'TrafficType_4', 'TrafficType_5', 'TrafficType_6', 'TrafficType_7',
 'TrafficType_8', 'TrafficType_9', 'TrafficType_10', 'TrafficType_11', 'TrafficType_12', 'TrafficType_13', 'TrafficType_14',
 'TrafficType_15', 'TrafficType_16', 'TrafficType_18', 'TrafficType_19', 'TrafficType_20'])
        model.fit(X_train_transformed,y_train) #turn back into df
        
        fbeta1_5_list.append(fbeta_score(y_train, model.predict(X_train_transformed),1.5))
        fbeta2_list.append(fbeta_score(y_train, model.predict(X_train_transformed),2))
        f1_list.append(f1_score(y_train, model.predict(X_train_transformed)))
        precision_list.append(precision_score(y_train, model.predict(X_train_transformed)))
        recall_list.append(recall_score(y_train, model.predict(X_train_transformed)))
        precision, recall, thresholds = precision_recall_curve(y_train, model.predict(X_train_transformed)) 
        PR_AUC_list.append(auc(recall, precision)) 
        ROC_AUC_train_list.append(roc_auc_score(y_train, model.predict(X_train_transformed)))
        ROC_AUC_valid_list.append(roc_auc_score(y_train, model.predict(X_train_transformed)))
        tn, fp, fn, tp = confusion_matrix(y_train, model.predict(X_train_transformed)).ravel()
        FN_list.append(fn)
        FP_list.append(fp)
        TN_list.append(tn)
        TP_list.append(tp)
        Fit_list.append('X_train_transformed, y_train')
        Predict_list.append('X_train_transformed')
        Feature_Importance_list.append(model.coef_)
            
    elif 'LogisticRegression' in str(model):
        scalerlogreg = StandardScaler().fit(X_subtrain)
        X_subtrain_transformed = scalerlogreg.transform(X_subtrain.values)
        X_subtrain_transformed = pd.DataFrame(X_subtrain_transformed, columns = ['Administrative',  'Administrative_Duration',
 'Informational',  'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',  
 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Weekend',
 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_Jun', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
 'VisitorType_New_Visitor',  'VisitorType_Returning_Visitor', 'OperatingSystem_1', 'OperatingSystem_2',
 'OperatingSystem_3', 'OperatingSystem_4', 'OperatingSystem_6', 'OperatingSystem_7', 'OperatingSystem_8',
 'Browser_1', 'Browser_2', 'Browser_3', 'Browser_4', 'Browser_5', 'Browser_6', 'Browser_7', 'Browser_8', 'Browser_10',
 'Browser_11', 'Browser_12', 'Browser_13', 'Region_1', 'Region_2', 'Region_3', 'Region_4', 'Region_6', 'Region_7', 'Region_8', 'Region_9', 
 'TrafficType_1', 'TrafficType_2', 'TrafficType_3', 'TrafficType_4', 'TrafficType_5', 'TrafficType_6', 'TrafficType_7',
 'TrafficType_8', 'TrafficType_9', 'TrafficType_10', 'TrafficType_11', 'TrafficType_12', 'TrafficType_13', 'TrafficType_14',
 'TrafficType_15', 'TrafficType_16', 'TrafficType_18', 'TrafficType_19', 'TrafficType_20'])
        model.fit(X_subtrain_transformed,y_subtrain) #need to turn back into df
        X_valid_transformed = scalerlogreg.transform(X_valid)
        X_valid_transformed = pd.DataFrame(X_valid_transformed, columns = ['Administrative',  'Administrative_Duration',
 'Informational',  'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',  
 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Weekend',
 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_Jun', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
 'VisitorType_New_Visitor',  'VisitorType_Returning_Visitor', 'OperatingSystem_1', 'OperatingSystem_2',
 'OperatingSystem_3', 'OperatingSystem_4', 'OperatingSystem_6', 'OperatingSystem_7', 'OperatingSystem_8',
 'Browser_1', 'Browser_2', 'Browser_3', 'Browser_4', 'Browser_5', 'Browser_6', 'Browser_7', 'Browser_8', 'Browser_10',
 'Browser_11', 'Browser_12', 'Browser_13', 'Region_1', 'Region_2', 'Region_3', 'Region_4', 'Region_6', 'Region_7', 'Region_8', 'Region_9', 
 'TrafficType_1', 'TrafficType_2', 'TrafficType_3', 'TrafficType_4', 'TrafficType_5', 'TrafficType_6', 'TrafficType_7',
 'TrafficType_8', 'TrafficType_9', 'TrafficType_10', 'TrafficType_11', 'TrafficType_12', 'TrafficType_13', 'TrafficType_14',
 'TrafficType_15', 'TrafficType_16', 'TrafficType_18', 'TrafficType_19', 'TrafficType_20'])
        fbeta1_5_list.append(fbeta_score(y_valid, model.predict(X_valid_transformed),1.5))
        fbeta2_list.append(fbeta_score(y_valid, model.predict(X_valid_transformed),2))
        f1_list.append(f1_score(y_valid, model.predict(X_valid_transformed)))
        precision_list.append(precision_score(y_valid, model.predict(X_valid_transformed)))
        recall_list.append(recall_score(y_valid, model.predict(X_valid_transformed)))
        precision, recall, thresholds = precision_recall_curve(y_valid, model.predict(X_valid_transformed))
        PR_AUC_list.append(auc(recall, precision))
        ROC_AUC_train_list.append(roc_auc_score(y_subtrain, model.predict(X_subtrain_transformed)))
        ROC_AUC_valid_list.append(roc_auc_score(y_valid, model.predict(X_valid_transformed)))
        tn, fp, fn, tp = confusion_matrix(y_valid, model.predict(X_valid_transformed)).ravel()
        FN_list.append(fn)
        FP_list.append(fp)
        TN_list.append(tn)
        TP_list.append(tp)
        Fit_list.append('X_subtrain_transformed, y_subtrain')
        Predict_list.append('X_valid_transformed')
        Feature_Importance_list.append(model.coef_)
    
    elif 'DummyClassifier' in str(model) or 'KNeighborsClassifier' in str(model) or 'GaussianNB' in str(model) or 'SVC' in str(model):
        model.fit(X_subtrain,y_subtrain)
        fbeta1_5_list.append(fbeta_score(y_valid, model.predict(X_valid),1.5))
        fbeta2_list.append(fbeta_score(y_valid, model.predict(X_valid),2))
        f1_list.append(f1_score(y_valid, model.predict(X_valid)))
        precision_list.append(precision_score(y_valid, model.predict(X_valid)))
        recall_list.append(recall_score(y_valid, model.predict(X_valid)))
        precision, recall, thresholds = precision_recall_curve(y_valid, model.predict(X_valid))
        PR_AUC_list.append(auc(recall, precision))
        ROC_AUC_train_list.append(roc_auc_score(y_subtrain, model.predict(X_subtrain)))
        ROC_AUC_valid_list.append(roc_auc_score(y_valid, model.predict(X_valid)))
        tn, fp, fn, tp = confusion_matrix(y_valid, model.predict(X_valid)).ravel()
        FN_list.append(fn)
        FP_list.append(fp)
        TN_list.append(tn)
        TP_list.append(tp)
        Fit_list.append('X_subtrain, y_subtrain')
        Predict_list.append('X_valid')
        Feature_Importance_list.append('N/A')
        
    else:
        model.fit(X_subtrain,y_subtrain)
        fbeta1_5_list.append(fbeta_score(y_valid, model.predict(X_valid),1.5))
        fbeta2_list.append(fbeta_score(y_valid, model.predict(X_valid),2))
        f1_list.append(f1_score(y_valid, model.predict(X_valid)))
        precision_list.append(precision_score(y_valid, model.predict(X_valid)))
        recall_list.append(recall_score(y_valid, model.predict(X_valid)))
        precision, recall, thresholds = precision_recall_curve(y_valid, model.predict(X_valid))
        PR_AUC_list.append(auc(recall, precision))
        ROC_AUC_train_list.append(roc_auc_score(y_subtrain, model.predict(X_subtrain)))
        ROC_AUC_valid_list.append(roc_auc_score(y_valid, model.predict(X_valid))) #technically, decision tree models should use OOB error to identify overfitting, but can't compare to other models
        tn, fp, fn, tp = confusion_matrix(y_valid, model.predict(X_valid)).ravel()
        FN_list.append(fn)
        FP_list.append(fp)
        TN_list.append(tn)
        TP_list.append(tp)
        Fit_list.append('X_subtrain, y_subtrain')
        Predict_list.append('X_valid')
        Feature_Importance_list.append(model.feature_importances_)
        
Model_Results['Fbeta1.5'] = fbeta1_5_list
Model_Results['Fbeta2'] = fbeta2_list
Model_Results['F1'] = f1_list
Model_Results['Precision'] = precision_list
Model_Results['Recall'] = recall_list
Model_Results['P-R_AUC'] = PR_AUC_list
Model_Results['ROC_AUC_train'] = ROC_AUC_train_list
Model_Results['ROC_AUC_valid'] = ROC_AUC_valid_list   
Model_Results['FN'] = FN_list
Model_Results['FP'] = FP_list
Model_Results['TN'] = TN_list
Model_Results['TP'] = TP_list
Model_Results['Fit'] = Fit_list
Model_Results['Predict'] = Predict_list
Model_Results['Feature_Importance'] = Feature_Importance_list
Model_Results.sort_values(['FN', 'ROC_AUC_valid'], ascending=[1, 0])

In [None]:
Merge_Models = pd.concat([Model_Results, Model_Results_ROS, Model_Results_SMOTE, Model_Results_ADASYN],sort=False)
Merge_Models.sort_values(['ROC_AUC_valid','FN'], ascending=[0,1])