In [1]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_colwidth', 100)

from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')

import re
import string
import nltk
import xgboost

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV

import sys
if not 'Notebooks/Individual/Jake' in sys.path:
    sys.path.append('Notebooks/Individual/jake')
from functions import ScoreModel, CleanText

import IPython
sound_file = '../../../data/sounds/puzzle_solved_jingle.wav'

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, plot_confusion_matrix

from textblob import TextBlob, Word
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ultim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ultim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv("../../../data/fps_main.csv")
df.head(3)

Unnamed: 0,steamid,appid,app_title,app_tags,review,fps,voted_up
0,76561198865552498,730,Counter-Strike: Global Offensive,"['FPS', 'Shooter', 'Multiplayer', 'Competitive', 'Action', 'Team-Based', 'eSports', 'Tactical', ...",very good game,True,1
1,76561197964533061,730,Counter-Strike: Global Offensive,"['FPS', 'Shooter', 'Multiplayer', 'Competitive', 'Action', 'Team-Based', 'eSports', 'Tactical', ...",beause S,True,1
2,76561198290998839,730,Counter-Strike: Global Offensive,"['FPS', 'Shooter', 'Multiplayer', 'Competitive', 'Action', 'Team-Based', 'eSports', 'Tactical', ...","Russians everywhere, do not recommend",True,1


In [3]:
df["voted_up"].value_counts(normalize=True)

0    0.515091
1    0.484909
Name: voted_up, dtype: float64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19680 entries, 0 to 19679
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   steamid    19680 non-null  int64 
 1   appid      19680 non-null  int64 
 2   app_title  19680 non-null  object
 3   app_tags   19680 non-null  object
 4   review     19680 non-null  object
 5   fps        19680 non-null  bool  
 6   voted_up   19680 non-null  int64 
dtypes: bool(1), int64(3), object(3)
memory usage: 941.8+ KB


In [5]:
df['clean_review'] = df['review'].apply(lambda x: CleanText(x.lower()))

df.head()

Unnamed: 0,steamid,appid,app_title,app_tags,review,fps,voted_up,clean_review
0,76561198865552498,730,Counter-Strike: Global Offensive,"['FPS', 'Shooter', 'Multiplayer', 'Competitive', 'Action', 'Team-Based', 'eSports', 'Tactical', ...",very good game,True,1,"[good, game]"
1,76561197964533061,730,Counter-Strike: Global Offensive,"['FPS', 'Shooter', 'Multiplayer', 'Competitive', 'Action', 'Team-Based', 'eSports', 'Tactical', ...",beause S,True,1,"[, beause]"
2,76561198290998839,730,Counter-Strike: Global Offensive,"['FPS', 'Shooter', 'Multiplayer', 'Competitive', 'Action', 'Team-Based', 'eSports', 'Tactical', ...","Russians everywhere, do not recommend",True,1,"[russian, everywhere, recommend]"
3,76561198073021168,60,Ricochet,"['Action', 'FPS', 'Multiplayer', 'Classic', 'First-Person', 'Sci-fi', 'Shooter', 'Space', 'Cyber...","best game, best game, 10/10 i r8 8/8",True,1,"[best, game, best, game, 1010, r8, 88]"
4,76561198061142423,550,Left 4 Dead 2,"['Zombies', 'Co-op', 'FPS', 'Multiplayer', 'Shooter', 'Action', 'Online Co-Op', 'Survival', 'Hor...",køb hvis du kan lide zombie spil ;D,True,1,"[køb, hvis, du, kan, lide, zombie, spil]"


In [6]:
vector = TfidfVectorizer(analyzer=CleanText, ngram_range=(2, 2))
X = vector.fit_transform(df["review"])

X_df = pd.DataFrame(X.toarray())
X_df.columns = vector.get_feature_names()
X_df.head(3)

Unnamed: 0,Unnamed: 1,0,00,000,00000,00001,001,00110,001500,001604,...,𝓲𝓼,𝓸𝓯,𝓻𝓾𝓼𝓼𝓲𝓪𝓷𝓼,𝓽𝓸𝔁𝓲𝓬,𝘎𝘳𝘦𝘢𝘵,𝘣𝘺,𝘤𝘩𝘦𝘢𝘵𝘦𝘳𝘴,𝘨𝘢𝘮𝘦,𝘳𝘶𝘪𝘯𝘦𝘥,𝙴ѕєαAccσυηт
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.248761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
y = df["voted_up"]

X_train, X_test, y_train, y_test = train_test_split(X_df, y, random_state=57)

first_half_model = RandomForestClassifier(n_estimators=100, random_state=57)
first_half_model.fit(X_train, y_train)

RandomForestClassifier(random_state=57)

In [8]:
ScoreModel(first_half_model, X_train, y_train)

Accuracy:   0.9787262872628726
F1 Score:   0.9782879269810537
Recall:     0.9857859531772575
Precision:  0.9709031018391435
ROC_AUC:    0.9789161833396837


In [9]:
ScoreModel(first_half_model, X_test, y_test)

Accuracy:   0.8063008130081301
F1 Score:   0.7877032746714191
Recall:     0.7469370511195607
Precision:  0.8331762488218661
ROC_AUC:    0.8041383257947978


In [12]:
sums = X.sum(axis = 0) 
data1 = [] 
for col, term in enumerate(X_df.columns): 
    data1.append( (term, sums[0, col] )) 
ranking = pd.DataFrame(data1, columns = ['term', 'rank']) 
words = (ranking.sort_values('rank', ascending = False)) 
print ("\n\nWords : \n", words.head(10))



Words : 
          term         rank
15128    game  1385.235976
0               700.146971
15569    good   677.309826
4435        I   525.829665
15012     fun   363.984484
20653    play   349.164162
22986    shit   280.036333
15949  hacker   255.485229
18037    like   248.389289
15397     get   243.612005


In [None]:
boost_model = XGBClassifier(random_state=57, objective="reg:logistic", use_label_encoder=False)

param_grid = {
    'learning_rate': [0.1, 0.2],
    'max_depth': [2, 3, 4],
    'min_child_weight': [2, 3, 4],
    'subsample': [0.5, 0.6, 0.7],
    'n_estimators': [50, 100]
}

gridsearch = GridSearchCV(boost_model, param_grid, cv=3, scoring="accuracy", n_jobs=1)
gridsearch.fit(X_train, y_train)

In [None]:
best_parameters = gridsearch.best_params_
print("Best Parameters: ")
print(best_parameters)

In [None]:
print("Training Scores")
print(ScoreModel(gridsearch, X_train, y_train))

print("Test Scores")
print(ScoreModel(gridsearch, X_test, y_test))


In [None]:
IPython.display.Audio(sound_file, autoplay=True, rate=1000)