In [460]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import chi2_contingency
import math

import sklearn
from sklearn import metrics, preprocessing, model_selection, preprocessing, linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, f1_score, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

## Reading the new 'df_for_ml.csv' file

In [461]:
df = pd.read_csv("games_data/df_for_ml.csv")

# Deletion of the index column - "Unnamed: 0"
df.drop("Unnamed: 0", axis=1, inplace=True)

--------------
Checking the data

In [462]:
df.head()

Unnamed: 0,game_name,platform,genres,develeoper,publisher,max_players,online_game,release_year,release_month,rating,meta_score,user_score,exclusive_game,publisher_labeled,develeoper_labeled,genres_labeled,platform_labeled,successful
0,The Legend of Zelda: Ocarina of Time,Nintendo,"Action Adventure, Fantasy",Nintendo,Nintendo,1,0,1998,11,1,1.0,0.93617,1,1140,2247,383,0,1
1,Tony Hawk's Pro Skater 2,"Nintendo, Old Platform, PC, PlayStation","Alternative, Skateboarding, Sports",Neversoft Entertainment,Activision,2,0,2000,9,3,0.931818,0.765957,0,62,2213,1096,3,1
2,Grand Theft Auto IV,"PC, PlayStation, Xbox","Action Adventure, Modern, Modern, Open-World",Rockstar North,Rockstar Games,1,0,2008,4,4,0.958333,0.780142,0,1417,2787,463,43,1
3,SoulCalibur,"Old Platform, Xbox","3D, Action, Fighting",Namco,Namco,2,0,1999,9,3,0.880682,0.829787,0,1109,2176,231,38,1
4,Super Mario Galaxy,Wii,"3D, 3D, Action, Platformer, Platformer",Nintendo,Nintendo,0,0,2007,11,1,0.977273,0.93617,1,1140,2247,144,51,1


----------

A helper function:

In [463]:
def print_results( y_test, y_pred, round_number, only=""):
    
    if(only == "accuracy"):
        print("accuracy is:", round(metrics.accuracy_score(y_test, y_pred), round_number))
    elif(only == "precision"):
        print("precision is:", round(metrics.precision_score(y_test, y_pred), round_number))
    elif(only == "recall"):
        print("recall is:", round(metrics.recall_score(y_test, y_pred), round_number))
    elif(only == "recall"):
        print("f1 is:", round(f1_score(y_test, y_pred), round_number))
    else:
        print("accuracy is:", round(metrics.accuracy_score(y_test, y_pred), round_number))
        print("precision is:", round(metrics.precision_score(y_test, y_pred), round_number))
        print("recall is:", round(metrics.recall_score(y_test, y_pred), round_number))
        print("f1 is:", round(f1_score(y_test, y_pred), round_number))
    
    print("\nconfusion matrix:")
    mat = metrics.confusion_matrix(y_test, y_pred)
    print(f"TP={mat[1][1]} | FN={mat[1][0]}")
    print(f"FP={mat[0][1]} | TN={mat[0][0]}")
    print("-------------")

In [464]:
def train_and_print(model, X_train, X_test, y_train, y_test, round_number, title, only="", check_train=False):
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    print(f"'{title.title()}': \n")
    print("-------------")
    print("Test:")
    print_results( y_test, y_pred, round_number, only=only)
       
    if(check_train):
        print("-------------")
        print("Train:")
        y_pred_train = model.predict(X_train)
        print_results( y_train, y_pred_train, round_number, only=only)
    # print("model: ", log_user.coef_[0])
    return model, y_pred

In [465]:
df_for_ml["successful"].value_counts()

1    6184
0    5511
Name: successful, dtype: int64

## Logistic regression:

In [466]:
df_for_ml = df.iloc[:, 5:].copy()
df_for_ml["successful"] = (df_for_ml["user_score"] >= 0.73).astype("int64")

y = df_for_ml["successful"]
X = df_for_ml.copy()
to_del = ["user_score", "meta_score", "successful"]
X.drop(to_del, axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=45)

In [467]:
log = LogisticRegression(max_iter=500)

log, y_pred = train_and_print(log, X_train, X_test, y_train, y_test, 3, "successful", check_train=True)

'Successful': 

-------------
Test:
accuracy is: 0.526
precision is: 0.53
recall is: 0.4
f1 is: 0.456

confusion matrix:
TP=748 | FN=1122
FP=663 | TN=1236
-------------
-------------
Train:
accuracy is: 0.531
precision is: 0.523
recall is: 0.416
f1 is: 0.463

confusion matrix:
TP=1780 | FN=2503
FP=1621 | TN=2890
-------------


---------

<a id='Improving_Performance'></a>
## <font color='red'>Problem2:</font> Improving Performance

##  We will try to improve the score by returning to the data handling stage
(We scraped all the possible data from the website already)

## So we came back to the Data Handling stage and made some changes:
### We have also found that we have some duplicates in the 'genres' culomn 
(see: [data_handling](./3_data_handling.ipynb) => <font color='red'>'Problem2: Improving Performance'</font>)

We did the following:

- Dropping the nan values from 'user_score' and Not filling the nan values with the mean values
- Handling Duplicates in 'genres'

### All changes were saved in a new file: 'handled_games2.csv'
### and then in the EDA stage we saved them in file: 'df_for_ml2.csv'

Now we will read it and perform the machine learning  again

In [468]:
df = pd.read_csv("games_data/df_for_ml2.csv")

# Deletion of the index column - "Unnamed: 0"
df.drop("Unnamed: 0", axis=1, inplace=True)

--------------
Checking the data

In [469]:
df.head()

Unnamed: 0,game_name,platform,genres,develeoper,publisher,max_players,online_game,release_year,release_month,rating,meta_score,user_score,exclusive_game,publisher_labeled,develeoper_labeled,genres_labeled,platform_labeled,successful
0,The Legend of Zelda: Ocarina of Time,Nintendo,"action adventure, fantasy",Nintendo,Nintendo,1,0,1998,11,1,1.0,0.93617,1,1070,2078,262,0,1
1,Tony Hawk's Pro Skater 2,"Nintendo, Old Platform, PC, PlayStation","alternative, skateboarding, sports",Neversoft Entertainment,Activision,2,0,2000,9,3,0.931818,0.765957,0,59,2045,805,3,1
2,Grand Theft Auto IV,"PC, PlayStation, Xbox","action adventure, modern, open-world",Rockstar North,Rockstar Games,1,0,2008,4,4,0.958333,0.780142,0,1323,2572,323,43,1
3,SoulCalibur,"Old Platform, Xbox","3d, action, fighting",Namco,Namco,2,0,1999,9,3,0.880682,0.829787,0,1042,2013,143,38,1
4,Super Mario Galaxy,Wii,"3d, action, platformer",Nintendo,Nintendo,0,0,2007,11,1,0.977273,0.93617,1,1070,2078,154,51,1


# Our main model:

In [470]:
df_for_ml = df.iloc[:, 5:].copy()
X = df_for_ml.copy()
y = df_for_ml["successful"]
to_del = ["user_score", "meta_score", "successful"]
X.drop(to_del, axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=45)

## Logistic regression:

In [471]:
log = LogisticRegression(max_iter=500)

log, y_pred = train_and_print(log, X_train, X_test, y_train, y_test, 3, "successful", check_train=True)

'Successful': 

-------------
Test:
accuracy is: 0.535
precision is: 0.54
recall is: 0.75
f1 is: 0.628

confusion matrix:
TP=1379 | FN=459
FP=1173 | TN=498
-------------
-------------
Train:
accuracy is: 0.535
precision is: 0.545
recall is: 0.741
f1 is: 0.628

confusion matrix:
TP=3220 | FN=1126
FP=2684 | TN=1156
-------------


### The results has improved

In [472]:
print("Single test1:\n")
print("real value: ",y_test.iloc[0])

tst = X_test.iloc[0:1]
print("predict value: ", log.predict(tst)[0])
print("--------------\n")

print("Single test2:\n")
print("real value: ", y_test.iloc[10])

tst2 = X_test.iloc[10:11]
print("predict value: ", log.predict(tst2)[0])

Single test1:

real value:  0
predict value:  0
--------------

Single test2:

real value:  1
predict value:  1


-------------------

# We will try other models that seem suitable for solving the problem

## K Nearest Neighbors (k-NN):

In [399]:
to_num = int(math.sqrt(len(y_train)))
to_num

90

In [400]:
parameters = {'n_neighbors':range(3,to_num,2) }
knn = KNeighborsClassifier()
clf = GridSearchCV(knn, parameters, scoring=make_scorer(metrics.f1_score, greater_is_better=True))
clf.fit(X_train, y_train)

print("best parameter set is:",clf.best_params_," and its score was",clf.best_score_)

best parameter set is: {'n_neighbors': 73}  and its score was 0.6146720619480276


In [401]:
y_pred = clf.predict(X_test)
print("Test KNN:")
print_results( y_test, y_pred, 3)
y_pred_train = clf.predict(X_train)
print("Train KNN:")
print_results( y_train, y_pred_train, 3)

Test KNN:
accuracy is: 0.537
precision is: 0.546
recall is: 0.685
f1 is: 0.608

confusion matrix:
TP=1259 | FN=579
FP=1046 | TN=625
-------------
Train KNN:
accuracy is: 0.578
precision is: 0.585
recall is: 0.707
f1 is: 0.64

confusion matrix:
TP=3073 | FN=1273
FP=2181 | TN=1659
-------------


### The results has improved in accuracy and  precision

### But the recall and f1 dropped

--------------

## Decision Trees

In [411]:
parameters = {'max_depth':range(2, 100, 2), "min_samples_split":range(2, 50, 2) }
dt = tree.DecisionTreeClassifier()

clf = GridSearchCV(dt, parameters,scoring=make_scorer(metrics.accuracy_score, greater_is_better=True))
clf.fit(X_train, y_train)
print("best parameter set is:",clf.best_params_," and its score was",clf.best_score_)

best parameter set is: {'max_depth': 2, 'min_samples_split': 2}  and its score was 0.6086003387774921


In [412]:
y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)
print("Test DecisionTree:\n")
print_results( y_test, y_pred, 3)
print("Train DecisionTree:\n")
print_results( y_train, y_pred_train, 3)

Test DecisionTree:

accuracy is: 0.625
precision is: 0.632
recall is: 0.68
f1 is: 0.655

confusion matrix:
TP=1249 | FN=589
FP=726 | TN=945
-------------
Train DecisionTree:

accuracy is: 0.615
precision is: 0.627
recall is: 0.678
f1 is: 0.651

confusion matrix:
TP=2947 | FN=1399
FP=1756 | TN=2084
-------------


### The results has improved in f1, accuracy and  precision
### But the recall dropped

## conclusion:

After many more attempts, we were unable to significantly improve the prediction results

From this we concluded that with the material we learned in the course, we have a limited ability to predict whether a specific game will be successful or not