# Predicting High Global Sales for Video Games

In [2]:
import pandas as pd

%matplotlib inline

In [5]:
# Load the video-game sales table from the project-relative data directory.
df = pd.read_csv(filepath_or_buffer='data/video.csv')

In [4]:
# Persist the frame back to the CSV it is loaded from.
# index=False keeps the row index out of the file; with the default
# (index=True) every read/write round-trip through pd.read_csv would add
# another unnamed index column to the data.
df.to_csv("data/video.csv", index=False)

In [6]:
# Summary statistics of the sales column that the "hit" label is derived from.
df.loc[:, "Global_Sales"].describe()

count    16717.000000
mean         0.533462
std          1.547956
min          0.010000
25%          0.060000
50%          0.170000
75%          0.470000
max         82.530000
Name: Global_Sales, dtype: float64

In [7]:
# Model inputs: the three review/score columns followed by the genre indicator
# columns. The genre names are used as column labels of df when the feature
# matrix is built, so they must match the labels present in the frame.
features = [
    'Critic_Score',
    'User_Score',
    'User_Count',
] + [
    'Action',
    'Adventure',
    'Fighting',
    'Puzzle',
    'Racing',
    'Role-Playing',
    'Shooter',
    'Simulation',
    'Sports',
    'Strategy',
]

In [8]:
# One-hot encode the genre: one indicator column per distinct Genre value.
dummies = pd.get_dummies(df['Genre'])

# Append the indicator columns to the right of the existing columns.
# NOTE(review): if df already contains a column whose label equals a genre
# value, or this cell is run twice, concat yields duplicate column labels —
# verify against the actual CSV schema.
df = pd.concat(objs=[df, dummies], axis=1)

# Binary target: 1 when Global_Sales is strictly greater than 1, otherwise 0
# (a missing value compares False and therefore maps to 0).
df["hit"] = (df["Global_Sales"] > 1).astype("int64")

In [None]:
# Compare Algorithms
import pandas
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, f1_score


# Keep only rows with no missing value in any column, then build the feature
# matrix and the binary target used by every model below.
df = df.dropna()
X = df[features]
Y = df['hit']

# One seed shared by the cross-validation splitter and by every estimator that
# has a random component, so the printed scores are reproducible between runs.
seed = 4373

# Candidate classifiers as (label, estimator) pairs; the label is used in the
# printed summary and as the tick label in the box plot.
models = [
    ('LR', LogisticRegression(solver='lbfgs', max_iter=10000)),
    ('DT', DecisionTreeClassifier(max_depth=6, random_state=seed)),
    ('RF', RandomForestClassifier(max_depth=6, random_state=seed)),
    ('Ada', AdaBoostClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    # KNeighborsClassifier is k-nearest neighbours, not k-means, hence 'KNN'.
    ('KNN', KNeighborsClassifier()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('MLP', MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=5000, random_state=seed)),
]

# The splitter does not depend on the model, so build it once. With an integer
# random_state it produces the same 10 shuffled folds for every model.
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)

# Evaluate each model in turn, collecting the per-fold scores for plotting.
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Mean and standard deviation of the fold scores.
    msg = f"{name}: {cv_results.mean():.3f} ({cv_results.std():.3f})"
    print(msg)

# Box plot comparing the distribution of fold accuracies per model.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison of Model Accuracy')
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylabel('Accuracy (10-fold CV)')
plt.show()

In [None]:
# Let's load the packages

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Fit a random forest on a 75/25 train split and plot how much each feature
# contributes to the fitted trees.
X = df[features]
y = df["hit"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4373)

# Seed the forest with the same value as the split so the importances are
# reproducible from run to run.
# NOTE(review): the target is a 0/1 label, so RandomForestClassifier may be the
# intended estimator here — confirm before changing.
rf = RandomForestRegressor(n_estimators=100, random_state=4373)
rf.fit(X_train, y_train)

# Horizontal bar chart: one bar per feature, in the order of `features`.
fig, ax = plt.subplots()
ax.barh(features, rf.feature_importances_)
ax.set_xlabel('Feature importance')
ax.set_title('Random forest feature importances')
plt.show()