In [1]:
%config IPCompleter.greedy=True

In [1]:
##Importing all the necessary directories

import pandas as pd
import numpy as np
import re

import gensim
from gensim import models
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_short
from gensim.utils import lemmatize
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import regexp_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()

In [193]:
# open pickle df
with open('df2.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file) 
df.drop(['index'], axis =1, inplace = True)
df.head()# Pickle file for later use

Unnamed: 0,game,steam_purchase,received_for_free,written_during_early_access,voted_up,review,clean_words,clean_text
0,80360,True,False,False,True,Things are really heating up :),"[thing, heating]",thing heating
1,80360,True,False,False,True,Pure awesomeness! The soundtrack is so good in...,"[pure, awesomeness, soundtrack, good, support,...",pure awesomeness soundtrack good support perfe...
2,80360,True,False,False,True,As the other parts of the Blackwell series: I ...,"[blackwell, series, love, game, investigation,...",blackwell series love game investigation funny...
3,80360,True,False,False,True,The art style has completely changed from the ...,"[art, style, completely, change, game, bit, pi...",art style completely change game bit pixel art...
4,80360,True,False,False,True,If you haven't played the other three adventur...,"[haven, played, adventure, blackwell, series, ...",haven played adventure blackwell series play g...


In [92]:
my_dict = Dictionary(df.clean_words)
#my_dict.filter_extremes(no_below=5, no_above=0.90)

# Count Vectorization
dtm = [my_dict.doc2bow(doc) for doc in df.clean_words]

# TF-IDF Vectorization
tfidf_vectorizer = TfidfModel(dtm)
tfidf = tfidf_vectorizer[dtm]


In [93]:
from gensim.models import LsiModel, CoherenceModel, LdaModel


lsi_tfidf = LsiModel(corpus=tfidf, id2word=my_dict, num_topics=5)
lda_tfidf = LdaModel(corpus=tfidf, id2word=my_dict, num_topics=5)

In [94]:
# Get dominant topic and corresponding keywords for each article

def getKeywordsFromDominantTopic(model, corpus, texts): 
    # Init output
    topickeyword_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                topickeyword_df = topickeyword_df.append(pd.Series([topic_keywords]), ignore_index=True)
            else:
                break
    return(topickeyword_df)

In [95]:
df['LSI TF-IDF Keywords'] = getKeywordsFromDominantTopic(model=lsi_tfidf, corpus=tfidf, texts=df.review)
df['LDA TF-IDF Keywords'] = getKeywordsFromDominantTopic(model=lda_tfidf, corpus=tfidf, texts=df.review)

In [96]:
# Combine all the keywords
df['merged-keywords'] = df['LSI TF-IDF Keywords'] + ', ' + df['LDA TF-IDF Keywords'] 

In [124]:
# Get 5 most common keywords across the four groups of keywords
from collections import Counter 
for i in range(len(df)):
    words = df.loc[i, 'merged-keywords']
    most_occur = Counter(str(words).split()).most_common(5) 
    df.loc[i, 'Top 5 Freq Words'] = ' '.join([word[0] for word in most_occur])

df[['review', 'Top 5 Freq Words']].head()

Unnamed: 0,review,Top 5 Freq Words
0,Things are really heating up :),"trine, great, fun, server, bad,"
1,Pure awesomeness! The soundtrack is so good in...,"good, time, great, best, love,"
2,As the other parts of the Blackwell series: I ...,"bad, great, best, fun, adventure,"
3,The art style has completely changed from the ...,"good, time, great, best, love,"
4,If you haven't played the other three adventur...,"good, time, great, best, love,"


In [132]:
lda_tfidf.print_topics()

[(0,
  '0.003*"fukin" + 0.003*"wtf" + 0.003*"funnest" + 0.003*"bully" + 0.003*"nahuya" + 0.003*"gamesense" + 0.003*"install" + 0.003*"realy" + 0.003*"classic" + 0.002*"instruction"'),
 (1,
  '0.023*"good" + 0.012*"suck" + 0.012*"trash" + 0.007*"best" + 0.007*"game" + 0.006*"love" + 0.005*"garbage" + 0.005*"lag" + 0.003*"great" + 0.003*"adventure"'),
 (2,
  '0.014*"tutorial" + 0.008*"bad" + 0.007*"game" + 0.006*"boring" + 0.006*"stay" + 0.005*"good" + 0.005*"play" + 0.005*"like" + 0.005*"time" + 0.004*"need"'),
 (3,
  '0.018*"shit" + 0.009*"fuck" + 0.007*"nice" + 0.007*"pile" + 0.005*"fun" + 0.005*"dog" + 0.004*"bruh" + 0.004*"bcs" + 0.003*"wheres" + 0.003*"eeeeeeeeeeeeeeehhhh"'),
 (4,
  '0.010*"cent" + 0.006*"great" + 0.005*"horror" + 0.004*"didnt" + 0.004*"meme" + 0.004*"simulator" + 0.004*"disaster" + 0.003*"hey" + 0.003*"crap" + 0.003*"female"')]

In [81]:
from gensim.models import Word2Vec
num_features = 100
model = Word2Vec(df['clean_words'], size=num_features, window=8, min_count=2, 
    sample=1e-3, sg=1, iter=5, workers=8)
vocab = set(model.wv.index2word)
len(vocab)

9474

In [235]:
def get_sim(word1, sent):
    sent_sim = 0
    nwords = 0
    for word2 in sent:
        if(word2 in vocab) and (model.wv.similarity(w1 = word1, w2 = word2) >0.5 or model.wv.similarity(w1 = word1, w2 = word2) < -0.5):
            sent_sim = sent_sim + model.wv.similarity(w1 = word1, w2 = word2)
            nwords = nwords + 1
        if nwords == 0:
            return 0
        else:
            return round(sent_sim/nwords,3)

# Function to extract features from text data using the custom similarity function (get_sim)
# The features range between (-1 to 1)
def feature_extraction(df):
    df = df.reset_index()
    for i in range(len(df)):
        df.loc[i,"gameplay_sim"] = get_sim("gameplay",df.loc[i,"clean_words"])
        df.loc[i,"sound_design_sim"] = get_sim("music",df.loc[i,"clean_words"])
        df.loc[i,"ambience_sim"] = get_sim("ambience",df.loc[i,"clean_words"])
        df.loc[i,"story_sim"] = get_sim("story",df.loc[i,"clean_words"])
    return df

In [329]:
x2 = feature_extraction(df['clean_words']) 
for i in range(len(df)):
    x2.loc[i,"sentiment_polarity"] = vader.polarity_scores(df.loc[i,"review"])['compound']
x2["voted_up"] = df["voted_up"]
x2.dropna(how = 'any',inplace = True)
x2.shape
x2.drop(["index","clean_words"], axis = 1,inplace = True)
x2.head()

Unnamed: 0,gameplay_sim,sound_design_sim,ambience_sim,story_sim,sentiment_polarity,voted_up
0,0.525,0.0,0.504,0.0,0.5046,True
1,0.0,0.0,0.744,0.505,0.9029,True
2,0.0,0.0,0.0,0.51,0.8689,True
3,0.0,0.752,0.721,0.0,0.904,True
4,0.0,0.0,0.541,0.0,0.6872,True


“Valence Aware Dictionary and sEntiment Reasoner” is another popular rule-based library for sentiment analysis. Like TextBlob, it uses a sentiment lexicon that contains intensity measures for each word based on human-annotated labels. A key difference however, is that VADER was designed with a focus on social media texts. This means that it puts a lot of emphasis on rules that capture the essence of text typically seen on social media — for example, short sentences with emojis, repetitive vocabulary and copious use of punctuation (such as exclamation marks). Below are some examples of the sentiment intensity scores output by VADER.

In [337]:
# Classification model function 
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
def classification_model(classifier, avg_train_features, train_y):
    classifier.fit(avg_train_features, train_y) 
    classifier.fit(avg_train_features, train_y)
    train_predict_y = classifier.predict(avg_train_features)
    print(" Train data stats: ")
    print(confusion_matrix(train_y, train_predict_y))
    print(classification_report(train_y, train_predict_y))
    classifier_cv_scores = cross_val_score(classifier, avg_train_features, train_y, cv=5)
    print('CV Accuracy (5-fold):', classifier_cv_scores)
    classifier_cv_mean_score = np.mean(classifier_cv_scores)
    print('Mean CV Accuracy:', classifier_cv_mean_score)
    
    #print(" Test data stats: ")
    #predict_test_score = classifier.score(avg_test_features, test_y)
    #test_predict_y = classifier.predict(avg_test_features)
    #print('Test Accuracy:', predict_test_score)
    #print(test_predict_y.shape)
    #print(confusion_matrix(test_y, test_predict_y))
    #print(classification_report(test_y, test_predict_y))
    return

In [341]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42)

classification_model(lr, x2.drop(["voted_up"], axis = 1), x2.voted_up)
import statsmodels.api as sm
logit_model=sm.Logit(x2.voted_up, x2.drop(["voted_up"], axis = 1))
result=logit_model.fit()
print(result.summary2())



 Train data stats: 
[[2344 1724]
 [ 978 7003]]
              precision    recall  f1-score   support

       False       0.71      0.58      0.63      4068
        True       0.80      0.88      0.84      7981

    accuracy                           0.78     12049
   macro avg       0.75      0.73      0.74     12049
weighted avg       0.77      0.78      0.77     12049

CV Accuracy (5-fold): [0.74491912 0.76016598 0.78049793 0.81361561 0.76712329]
Mean CV Accuracy: 0.7732643833838265
Optimization terminated successfully.
         Current function value: 0.486915
         Iterations 6
                          Results: Logit
Model:                Logit            Pseudo R-squared: 0.239     
Dependent Variable:   voted_up         AIC:              11743.6842
Date:                 2020-04-22 16:19 BIC:              11780.6679
No. Observations:     12049            Log-Likelihood:   -5866.8   
Df Model:             4                LL-Null:          -7704.7   
Df Residuals:         12044



In [380]:
x3 = feature_extraction_2(df[["clean_words","review"]])
x3["voted_up"] = df["voted_up"]
x3["clean_words"] = df["clean_words"]
x3["review"] = df["voted_up"]
x3.dropna(how = 'any',inplace = True)
x3.shape
x3.drop(["index","clean_words"], axis = 1,inplace = True)
x3.head()

TypeError: float() argument must be a string or a number, not 'NoneType'

In [383]:
x3 = x2.copy()
x3.head()

Unnamed: 0,gameplay_sim,sound_design_sim,ambience_sim,story_sim,sentiment_polarity,voted_up
0,0.525,0.0,0.504,0.0,0.5046,True
1,0.0,0.0,0.744,0.505,0.9029,True
2,0.0,0.0,0.0,0.51,0.8689,True
3,0.0,0.752,0.721,0.0,0.904,True
4,0.0,0.0,0.541,0.0,0.6872,True


In [None]:
#x3.head()
for i in range(len(x3)):
    x3.loc[i,"gameplay_sim"] = get_sim("gameplay",x3.loc[i,"clean_words"])*(vader.polarity_scores(x3.loc[i,"review"])['compound'])
    x3.loc[i,"sound_design_sim"] = (get_sim("music",x3.loc[i,"clean_words"]))*(vader.polarity_scores(x3.loc[i,"review"])['compound'])
    x3.loc[i,"ambience_sim"] = (get_sim("ambience",x3.loc[i,"clean_words"]))*(vader.polarity_scores(x3.loc[i,"review"])['compound'])
    x3.loc[i,"story_sim"] = (get_sim("story",x3.loc[i,"clean_words"]))*(vader.polarity_scores(x3.loc[i,"review"])['compound'])

In [385]:
x3["gameplay_sim"] = x3["gameplay_sim"]*x3["sentiment_polarity"]
x3["sound_design_sim"] = x3["sound_design_sim"]*x3["sentiment_polarity"]
x3["ambience_sim"] = x3["ambience_sim"]*x3["sentiment_polarity"]
x3["story_sim"] = x3["story_sim"]*x3["sentiment_polarity"]
x3.head()

Unnamed: 0,gameplay_sim,sound_design_sim,ambience_sim,story_sim,sentiment_polarity,voted_up
0,0.133676,0.0,0.254318,0.0,0.5046,True
1,0.0,0.0,0.671758,0.455964,0.9029,True
2,0.0,0.0,0.0,0.443139,0.8689,True
3,0.0,0.679808,0.651784,0.0,0.904,True
4,0.0,0.0,0.371775,0.0,0.6872,True


In [387]:
# Encoding y variable (topic)
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
x3["voted_up"] = encoder.fit_transform(x3["voted_up"])
encoder_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(encoder_mapping)

{False: 0, True: 1}


In [413]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42)

classification_model(lr, x2.drop(["voted_up","sentiment_polarity"], axis = 1), x2.voted_up)
import statsmodels.api as sm
logit_model=sm.Logit(x2.voted_up, x2.drop(["voted_up","sentiment_polarity"], axis = 1))
result=logit_model.fit()
print(result.summary2())



 Train data stats: 
[[ 437 3631]
 [ 358 7623]]
              precision    recall  f1-score   support

       False       0.55      0.11      0.18      4068
        True       0.68      0.96      0.79      7981

    accuracy                           0.67     12049
   macro avg       0.61      0.53      0.49     12049
weighted avg       0.63      0.67      0.59     12049





CV Accuracy (5-fold): [0.64122771 0.66556017 0.57593361 0.67123288 0.68119552]
Mean CV Accuracy: 0.6470299751607617
Optimization terminated successfully.
         Current function value: 0.604015
         Iterations 6
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.055      
Dependent Variable: voted_up         AIC:              14563.5597 
Date:               2020-04-23 16:51 BIC:              14593.1466 
No. Observations:   12049            Log-Likelihood:   -7277.8    
Df Model:           3                LL-Null:          -7704.7    
Df Residuals:       12045            LLR p-value:      9.3363e-185
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     6.0000                                        
------------------------------------------------------------------
                    Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
------------------------------------------------------------------
game

In [414]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42)

classification_model(lr, x2.drop(["voted_up"], axis = 1), x2.voted_up)
import statsmodels.api as sm
logit_model=sm.Logit(x2.voted_up, x2.drop(["voted_up"], axis = 1))
result=logit_model.fit()
print(result.summary2())



 Train data stats: 
[[2344 1724]
 [ 978 7003]]
              precision    recall  f1-score   support

       False       0.71      0.58      0.63      4068
        True       0.80      0.88      0.84      7981

    accuracy                           0.78     12049
   macro avg       0.75      0.73      0.74     12049
weighted avg       0.77      0.78      0.77     12049





CV Accuracy (5-fold): [0.74491912 0.76016598 0.78049793 0.81361561 0.76712329]
Mean CV Accuracy: 0.7732643833838265
Optimization terminated successfully.
         Current function value: 0.486915
         Iterations 6
                          Results: Logit
Model:                Logit            Pseudo R-squared: 0.239     
Dependent Variable:   voted_up         AIC:              11743.6842
Date:                 2020-04-23 16:53 BIC:              11780.6679
No. Observations:     12049            Log-Likelihood:   -5866.8   
Df Model:             4                LL-Null:          -7704.7   
Df Residuals:         12044            LLR p-value:      0.0000    
Converged:            1.0000           Scale:            1.0000    
No. Iterations:       6.0000                                       
-------------------------------------------------------------------
                    Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-------------------------------------------------------------

In [415]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42)

classification_model(lr, x3.drop(["voted_up","sentiment_polarity"], axis = 1), x3.voted_up)
import statsmodels.api as sm
logit_model=sm.Logit(x3.voted_up, x3.drop(["voted_up","sentiment_polarity"], axis = 1))
result=logit_model.fit()
print(result.summary2())



 Train data stats: 
[[1111 2957]
 [ 456 7525]]
              precision    recall  f1-score   support

           0       0.71      0.27      0.39      4068
           1       0.72      0.94      0.82      7981

    accuracy                           0.72     12049
   macro avg       0.71      0.61      0.60     12049
weighted avg       0.71      0.72      0.67     12049

CV Accuracy (5-fold): [0.70136873 0.7219917  0.70829876 0.71191366 0.71149855]
Mean CV Accuracy: 0.7110142774670178
Optimization terminated successfully.
         Current function value: 0.536653
         Iterations 6
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.161     
Dependent Variable: voted_up         AIC:              12940.2698
Date:               2020-04-23 16:53 BIC:              12969.8568
No. Observations:   12049            Log-Likelihood:   -6466.1   
Df Model:           3                LL-Null:          -7704.7   
Df Residuals:       12045            L



In [417]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42)

classification_model(lr, x3.drop(["voted_up"], axis = 1)[["sentiment_polarity"]], x3.voted_up)
import statsmodels.api as sm
logit_model=sm.Logit(x3.voted_up, x3.drop(["voted_up"], axis = 1)[["sentiment_polarity"]])
result=logit_model.fit()
print(result.summary2())



 Train data stats: 
[[1992 2076]
 [ 670 7311]]
              precision    recall  f1-score   support

           0       0.75      0.49      0.59      4068
           1       0.78      0.92      0.84      7981

    accuracy                           0.77     12049
   macro avg       0.76      0.70      0.72     12049
weighted avg       0.77      0.77      0.76     12049

CV Accuracy (5-fold): [0.75528826 0.7593361  0.78962656 0.78953923 0.74719801]
Mean CV Accuracy: 0.7681976306201854
Optimization terminated successfully.
         Current function value: 0.505289
         Iterations 6
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.210     
Dependent Variable: voted_up         AIC:              12178.4485
Date:               2020-04-23 16:59 BIC:              12185.8452
No. Observations:   12049            Log-Likelihood:   -6088.2   
Df Model:           0                LL-Null:          -7704.7   
Df Residuals:       12048            L

In [410]:
from sklearn.ensemble import RandomForestClassifier
rfc2 = RandomForestClassifier(n_estimators= 100)
classification_model(rfc2, x2.drop(["voted_up","sentiment_polarity"], axis = 1), x2.voted_up)
#rfc2.feature_importances_
feature_importances = pd.DataFrame(rfc2.feature_importances_,
                                   index = x2.drop(["voted_up","sentiment_polarity"], axis = 1).columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

 Train data stats: 
[[1055 3013]
 [ 264 7717]]
              precision    recall  f1-score   support

       False       0.80      0.26      0.39      4068
        True       0.72      0.97      0.82      7981

    accuracy                           0.73     12049
   macro avg       0.76      0.61      0.61     12049
weighted avg       0.75      0.73      0.68     12049

CV Accuracy (5-fold): [0.64330153 0.6813278  0.60373444 0.62930677 0.68119552]
Mean CV Accuracy: 0.6477732116803712


Unnamed: 0,importance
ambience_sim,0.424958
gameplay_sim,0.234414
sound_design_sim,0.171756
story_sim,0.168872
