# This notebook is set to visualize and model data gathered on game systems that are considered hot these days

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score

%matplotlib inline

# Working with xbox one data to predict what the Global_Sales will be

In [2]:
xone = pd.read_csv('xboxone.csv')
xone.head()

Unnamed: 0,Name,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,Call of Duty: Black Ops 3,2015.0,Shooter,Activision,4.59,2.11,0.01,0.68,7.39
1,Grand Theft Auto V,2014.0,Action,Take-Two Interactive,2.81,2.19,0.0,0.47,5.48
2,Call of Duty: Advanced Warfare,2014.0,Shooter,Activision,3.22,1.55,0.01,0.48,5.27
3,Halo 5: Guardians,2015.0,Shooter,Microsoft Game Studios,2.78,1.27,0.03,0.41,4.48
4,Fallout 4,2015.0,Role-Playing,Bethesda Softworks,2.51,1.32,0.01,0.38,4.22


In [3]:
# Rounding the data so it cuts of those continuos variables
xone = xone.round()

In [4]:
# Dropping the column that contained less information
xone.drop(['Other_Sales'], axis=1, inplace=True)

In [5]:
# Initating the label encoder
le = LabelEncoder()

# Columns i need to labelencode before fitting to a model
s = (xone.dtypes == 'object')
object_cols = list(s[s].index)

# Label encoding the columns and replacing the old columns with the new encoded data
for i in object_cols:
    xone[i] = le.fit_transform(xone[i])
    

In [6]:
# Breaking up my data to then model
used_col = [c for c in xone.columns.tolist() if c not in ['Global_Sales']]
X, y = xone[used_col], xone.Global_Sales

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
# Initiating my classification model to fit into cross validation
rfc = RandomForestRegressor(random_state=42, n_estimators=250, max_depth=4, min_samples_split=6)


rfc.fit(X_train, y_train)

train_pred = rfc.predict(X_train)
test_pred = rfc.predict(X_test)

predictions = rfc.predict(X)

In [8]:
# Getting my scores for my predictions
print("Train MAE:", mean_absolute_error(train_pred, y_train))
print("Train R2:", r2_score(train_pred, y_train))

print("Test MAE:", mean_absolute_error(test_pred, y_test))
print("Test R2:", r2_score(test_pred, y_test))

Train MAE: 0.23827113951734608
Train R2: 0.8667668458386355
Test MAE: 0.28879757568012593
Test R2: 0.7129644585008492


In [9]:
xone['Predictions'] = predictions

In [10]:
# Viewing how my model did
xone

Unnamed: 0,Name,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Global_Sales,Predictions
0,30,2015.0,7,2,5.0,2.0,0.0,7.0,5.511135
1,91,2014.0,0,39,3.0,2.0,0.0,5.0,4.626261
2,29,2014.0,7,2,3.0,2.0,0.0,5.0,5.043435
3,93,2015.0,7,25,3.0,1.0,0.0,4.0,3.440034
4,74,2015.0,6,5,3.0,1.0,0.0,4.0,3.559623
...,...,...,...,...,...,...,...,...,...
242,202,2016.0,5,26,0.0,0.0,0.0,0.0,0.123564
243,184,2016.0,9,3,0.0,0.0,0.0,0.0,0.116987
244,243,2016.0,0,43,0.0,0.0,0.0,0.0,0.136326
245,156,2016.0,0,18,0.0,0.0,0.0,0.0,0.118093


In [13]:
print("Global sales for xbox one", xone.Global_Sales.sum().round(2))
print("Global Sales Predictions:", xone.Predictions.sum().round(2))
print(xone.Year_of_Release.min(), "-", xone.Year_of_Release.max())

Global sales for xbox one 138.0
Global Sales Predictions: 131.67
2013.0 - 2016.0


# Ps4 game system model predictions

In [14]:
ps4 = pd.read_csv('ps4.csv')
ps4.drop('level_0', axis=1, inplace=True)
ps4.head()

Unnamed: 0,Name,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,Call of Duty: Black Ops 3,2015.0,Shooter,Activision,6.03,5.86,0.36,2.38,14.63
1,Grand Theft Auto V,2014.0,Action,Take-Two Interactive,3.96,6.31,0.38,1.97,12.61
2,FIFA 16,2015.0,Sports,Electronic Arts,1.12,6.12,0.06,1.28,8.57
3,Star Wars Battlefront (2015),2015.0,Shooter,Electronic Arts,2.99,3.49,0.22,1.28,7.98
4,Call of Duty: Advanced Warfare,2014.0,Shooter,Activision,2.81,3.48,0.14,1.23,7.66


In [15]:
ps4 = ps4.round(2)

In [16]:
p = (ps4.dtypes == 'object')
label_cols = list(p[p].index)

# Label encoding the columns and replacing the old columns with the new encoded data
for e in label_cols:
    ps4[e] = le.fit_transform(ps4[e])

In [17]:
p_col = [c for c in ps4.columns.tolist() if c not in ['Global_Sales']]
X_p, y_p = ps4[used_col], ps4.Global_Sales

X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_p, y_p)

In [18]:
rfc_ps4 = RandomForestRegressor(random_state=42, n_estimators=250, max_depth=4, min_samples_split=6)


rfc_ps4.fit(X_train_p, y_train_p)

train_pred_p = rfc_ps4.predict(X_train_p)
test_pred_p = rfc_ps4.predict(X_test_p)

predictions_p = rfc_ps4.predict(X_p)

In [19]:
print("Train MAE:", mean_absolute_error(train_pred_p, y_train_p))
print("Train R2:", r2_score(train_pred_p, y_train_p))

print("Test MAE:", mean_absolute_error(test_pred_p, y_test_p))
print("Test R2:", r2_score(test_pred_p, y_test_p))

Train MAE: 0.09010560612713912
Train R2: 0.971876735038574
Test MAE: 0.07671772028891258
Test R2: 0.9857341449962435


In [20]:
ps4['Predictions'] = predictions_p

In [22]:
print("Global sales for ps4", ps4.Global_Sales.sum().round(2))
print("Global Sales Predictions:", ps4.Predictions.sum().round(2))
print(ps4.Year_of_Release.min(), "-", ps4.Year_of_Release.max())

Global sales for ps4 314.23
Global Sales Predictions: 312.3
2013.0 - 2017.0


# Importing N64 game system and feeding it to a model

In [None]:
n64 = pd.read_csv('n64.csv')

n64.head(5)

In [None]:
# Checking for the type categorical type columns
n = (n64.dtypes == 'object')
n_cols = list(n[n].index)

# Label encoding the columns and replacing the old columns with the new encoded data
for h in n_cols:
    n64[h] = le.fit_transform(n64[h])

In [None]:
# Splitting my data to fit into a regressin model
n_col = [c for c in n64.columns.tolist() if c not in ['Global_Sales']]
X_n, y_n = n64[n_col], n64.Global_Sales

X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_n, y_n)

In [None]:
rfc_n64 = RandomForestRegressor(random_state=42, n_estimators=250, max_depth=4, min_samples_split=6)


rfc_n64.fit(X_train_n, y_train_n)

train_pred_n = rfc_n64.predict(X_train_n)
test_pred_n = rfc_n64.predict(X_test_n)

predictions_n = rfc_n64.predict(X_n)

In [None]:
# Viewing my scores for my train and test data
print("Train MAE:", mean_absolute_error(train_pred_n, y_train_n))
print("Train R2:", r2_score(train_pred_n, y_train_n))

print("Test MAE:", mean_absolute_error(test_pred_n, y_test_n))
print("Test R2:", r2_score(test_pred_n, y_test_n))

In [None]:
# Adding my predictions back to my dataframe
n64['Predictions'] = predictions_n

In [None]:
n64

In [None]:
# Getting the date range from when the games where released and how much it sold in total between those years
print(n64.Year_of_Release.min(), "-", n64.Year_of_Release.max())

print("Total Sales in millions per unit", n64.Global_Sales.sum().round(2))

In [None]:
# Getting the predicion sum to compare to the whole data

print("What my model predictions is:", n64.Predictions.sum().round(2))

# Viewing the sales for Game Cube

In [None]:
GameCube = pd.read_csv('gc.csv')

GameCube.head()

In [None]:
g = (GameCube.dtypes == 'object')
g_cols = list(g[g].index)

# Label encoding the columns and replacing the old columns with the new encoded data
for c in g_cols:
    GameCube[c] = le.fit_transform(GameCube[c])

In [None]:
# Breaking up my data
g_col = [c for c in GameCube.columns.tolist() if c not in ['Global_Sales']]
X_g, y_g = GameCube[g_col], GameCube.Global_Sales

X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(X_g, y_g)

In [None]:
rfc_Gc = RandomForestRegressor(random_state=42, n_estimators=250, max_depth=4, min_samples_split=6)


rfc_Gc.fit(X_train_g, y_train_g)

train_pred_g = rfc_Gc.predict(X_train_g)
test_pred_g = rfc_Gc.predict(X_test_g)

predictions_g = rfc_Gc.predict(X_g)

In [None]:
print("Train MAE:", mean_absolute_error(train_pred_g, y_train_g))
print("Train R2:", r2_score(train_pred_g, y_train_g))

print("Test MAE:", mean_absolute_error(test_pred_g, y_test_g))
print("Test R2:", r2_score(test_pred_g, y_test_g))

In [None]:
GameCube['Predictions'] = predictions_g

In [None]:
GameCube.Global_Sales.plot()
GameCube.Predictions.plot(figsize=(16,10));

In [None]:
# Grabbing the range the video games released and how much sales where made during that time frame
print(GameCube.Year_of_Release.min(), "-", GameCube.Year_of_Release.max())

print("Total Sales in millions per unit", GameCube.Global_Sales.sum().round(2))
print("Total Sales in millions per unit", GameCube.Predictions.sum().round(2))