---

### 1.Problem Definition

Predict `Global_Sales`.
1. Analyse the importance of the features that can help increase global sales.

---

### 2.Dataset Preview

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
sb.set()

In [None]:
videoGames=pd.read_csv("Video_Games.csv")
videoGames.head()

In [None]:
videoGames.info()

---

### 3.Data Cleaning / Preparation

In [None]:
# Columns used in the rest of the analysis
selectedColumns = ['Platform','Genre','Publisher','Critic_Score','Global_Sales','User_Score','Rating']

# Drop rows whose User_Score is the literal string "tbd" (it cannot be parsed as a
# number). Missing scores compare unequal to "tbd", so they are kept at this stage.
# The explicit .copy() makes this an independent frame, so the column assignment
# below is not treated as a write into a slice of `videoGames`.
videoGamesSelected = videoGames.loc[videoGames['User_Score'] != "tbd", selectedColumns].copy()
videoGamesSelected["User_Score"] = pd.to_numeric(videoGamesSelected["User_Score"])
videoGamesSelected

In [None]:
# Missing-value count for every selected column, printed as "<column> <count>"
for column, missingCount in videoGamesSelected.isnull().sum().items():
    print(column, missingCount)

In [None]:
# Keep only the rows that have a value in every column listed here.
# Platform and Global_Sales are not part of the check.
requiredColumns = ['Critic_Score', 'Publisher', 'Genre', 'User_Score', 'Rating']
videoGamesSelectedCleaned = videoGamesSelected.dropna(subset=requiredColumns)
videoGamesSelectedCleaned

In [None]:
# Missing-value count per column after cleaning, printed as "<column> <count>"
for column, missingCount in videoGamesSelectedCleaned.isnull().sum().items():
    print(column, missingCount)

---

### 4.Data Analysis / Visualisation

## For Rating

In [None]:
videoGamesSelectedCleaned['Rating'].unique()

## For Genre

In [None]:
videoGamesSelectedCleaned['Genre'].unique()

## For Publisher

In [None]:
# A cell only renders its last expression, so the count has to be printed
# explicitly; otherwise it is computed and thrown away.
print("Number of distinct publishers:", videoGamesSelectedCleaned['Publisher'].nunique())
videoGamesSelectedCleaned['Publisher'].unique()

In [None]:
# Keep only the rows of publishers that have at least 50 rows in the cleaned data
videoGamesPublisher = (
    videoGamesSelectedCleaned
    .groupby('Publisher')
    .filter(lambda publisherRows: len(publisherRows) >= 50)
)
videoGamesPublisher

In [None]:
# Large canvas: one horizontal box of Global_Sales per publisher
f = plt.figure(figsize=(21, 21))
sb.boxplot(data=videoGamesPublisher, y='Publisher', x='Global_Sales', orient='h')

In [None]:
def right(s, k=1.0):
    """Return the upper outlier fence of a numeric Series: Q3 + k * IQR.

    Parameters
    ----------
    s : pandas.Series
        Numeric values; the quartiles come from ``s.quantile``.
    k : float, default 1.0
        Multiple of the inter-quartile range added to Q3. The default keeps
        the fence this notebook has always used (Q3 + IQR); pass ``k=1.5``
        for the conventional box-plot whisker.

    Returns
    -------
    float
        The upper fence.
    """
    Q1 = s.quantile(0.25)
    Q3 = s.quantile(0.75)
    IQR = Q3 - Q1
    return Q3 + k * IQR
    
def left(s, k=1.0):
    """Return the lower outlier fence of a numeric Series: Q1 - k * IQR.

    Parameters
    ----------
    s : pandas.Series
        Numeric values; the quartiles come from ``s.quantile``.
    k : float, default 1.0
        Multiple of the inter-quartile range subtracted from Q1. The default
        keeps the fence this notebook has always used (Q1 - IQR); pass
        ``k=1.5`` for the conventional box-plot whisker.

    Returns
    -------
    float
        The lower fence.
    """
    Q1 = s.quantile(0.25)
    Q3 = s.quantile(0.75)
    IQR = Q3 - Q1
    return Q1 - k * IQR

## For publishers with at least 50 games

In [None]:
def removeOutliers(df,predictor,response):
    """Box-plot `response` per `predictor` group after dropping each group's outliers.

    For every value of `predictor`, the fences are computed from that group's
    `response` values with `left` and `right`. Rows whose `response` lies
    outside [left, right] of their own group are discarded, and the remaining
    rows are drawn as horizontal box plots ordered by descending group median.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the `predictor` and `response` columns.
    predictor : str
        Name of the grouping column.
    response : str
        Name of the numeric column that is filtered and plotted.

    Returns
    -------
    None
        The function only draws the figure; the filtered rows are not returned.
    """
    # One row per group, with that group's lower and upper fence
    fences = df.groupby(predictor)[response].agg(left=left, right=right)
    # Attach the group's fences to each of its rows (the index level of `fences`
    # carries the predictor name, so it can be used as the merge key)
    withFences = pd.merge(df, fences, on=predictor)
    insideFences = (withFences[response] >= withFences['left']) & (withFences[response] <= withFences['right'])
    inliers = withFences[insideFences]
    medianOrder = inliers.groupby(predictor)[response].median().sort_values(ascending=False).index
    plt.figure(figsize=(21,21))
    sb.boxplot(data=inliers, x=response, y=predictor, orient='h', order=medianOrder)

## skewness only makes sense for a numeric column, so it is computed on `response`
def checkSkew(df,predictor,response='Global_Sales'):
    """Print the skewness of `response` within each group of `predictor`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the `predictor` and `response` columns.
    predictor : str
        Name of the grouping column.
    response : str, default 'Global_Sales'
        Name of the numeric column whose per-group skewness is reported.

    One line "<group> <skew>" is printed per group, in order of first appearance.
    """
    # Select the numeric column before calling skew(). Indexing the groupby with
    # a group *value* (as in groupedDf[i]) is a column lookup and raises KeyError.
    skewByGroup = df.groupby(predictor, sort=False)[response].skew()
    for group, skew in skewByGroup.items():
        print(group, skew)

In [None]:
removeOutliers(videoGamesPublisher,'Publisher','Global_Sales')

## reasonable distinctness?

## For Genre

In [None]:
videoGamesSelectedCleaned['Genre'].nunique()

In [None]:
# Genres listed from highest to lowest median Global_Sales
genreMedianSales = videoGamesSelectedCleaned.groupby('Genre')['Global_Sales'].median()
genreOrder = genreMedianSales.sort_values(ascending=False).index

f = plt.figure(figsize=(21, 21))
sb.boxplot(data=videoGamesSelectedCleaned, y='Genre', x='Global_Sales', orient='h', order=genreOrder)


In [None]:
removeOutliers(videoGamesSelectedCleaned,'Genre','Global_Sales')

## reasonable distinctness?

## For platform

In [None]:
videoGamesSelectedCleaned['Platform'].nunique()

In [None]:
# Platforms listed from highest to lowest median Global_Sales
platformMedianSales = videoGamesSelectedCleaned.groupby('Platform')['Global_Sales'].median()
platformOrder = platformMedianSales.sort_values(ascending=False).index

f = plt.figure(figsize=(21, 21))
sb.boxplot(data=videoGamesSelectedCleaned, y='Platform', x='Global_Sales', orient='h', order=platformOrder)


In [None]:
removeOutliers(videoGamesSelectedCleaned,'Platform','Global_Sales')

## somewhat promising

## For critic score

In [None]:
scoreSales=videoGamesSelectedCleaned[['Critic_Score','Global_Sales']]

In [None]:
scoreSales.corr()

In [None]:
plt.figure(figsize=(20, 20))
# Pass the frame by keyword: `data` is only accepted positionally from
# seaborn 0.12 on, while data=... works on older and newer releases alike.
sb.scatterplot(data=videoGamesSelectedCleaned, y = "Critic_Score", x = "Global_Sales")

In [None]:
sb.kdeplot(data=scoreSales['Critic_Score'])

## Correlation is only about 0.25, which is weak


# User score correlation

In [None]:
# Use a separate name: rebinding `scoreSales` here would drop its Critic_Score
# column, so the earlier critic-score cells would fail if they were re-run.
userScoreSales = videoGamesSelectedCleaned[['User_Score','Global_Sales']]
userScoreSales.corr()


# predict global sales by critic score



In [None]:
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Mean Squared Error (MSE)
def mean_sq_err(actual, predicted):
    '''Mean of the squared element-wise differences between actual and predicted.'''
    residuals = np.array(actual) - np.array(predicted)
    return np.mean(residuals ** 2)

# Create a Linear Regression object
linreg = LinearRegression()

# 80/20 train/test split. The fixed random_state makes the split, and therefore
# every score computed from it below, identical on each Restart & Run All.
train = videoGamesSelectedCleaned.sample(frac=.8, random_state=42)
# Everything that was not sampled into `train`
test = videoGamesSelectedCleaned.drop(train.index)

# Single-column frames: Critic_Score is the predictor, Global_Sales the response
cs_train = pd.DataFrame(train['Critic_Score'])
cs_test = pd.DataFrame(test['Critic_Score'])
gs_train = pd.DataFrame(train['Global_Sales'])
gs_test = pd.DataFrame(test['Global_Sales'])


In [None]:

# Fit Global_Sales as a linear function of Critic_Score on the training split
linreg.fit(cs_train, gs_train)

# Parameters of the fitted line: Global_Sales = a * Critic_Score + b
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predicted Global_Sales for the Critic_Score values of each split
gs_train_pred = linreg.predict(cs_train)
gs_test_pred = linreg.predict(cs_test)

# Goodness of fit, reported the same way for the train and then the test split
for header, criticScores, sales, salesPred in (
    ("Goodness of Fit of Model \tTrain Dataset", cs_train, gs_train, gs_train_pred),
    ("Goodness of Fit of Model \tTest Dataset", cs_test, gs_test, gs_test_pred),
):
    print(header)
    print("Explained Variance (R^2) \t:", linreg.score(criticScores, sales))
    print("Mean Squared Error (MSE) \t:", mean_squared_error(sales, salesPred))
    print()

# Training points (default colour) with the model's predictions overlaid in red
f, ax = plt.subplots(figsize=(16, 8))
ax.scatter(cs_train, gs_train)
ax.scatter(cs_train, gs_train_pred, color = "r")
plt.show()

# Predicting Rating by global sales

In [None]:
# Target: the Rating column of the same train/test rows used for the regression
ra_train = pd.DataFrame(train['Rating'])
ra_test = pd.DataFrame(test['Rating'])
# Import Decision Tree Classifier model from Scikit-Learn
from sklearn.tree import DecisionTreeClassifier

# Create a Decision Tree Classifier object (depth capped at 8)
dectree2 = DecisionTreeClassifier(max_depth = 8)

# Train the Decision Tree Classifier model: Global_Sales is the only feature
dectree2.fit(gs_train,ra_train)

from sklearn.tree import export_graphviz
# class_names must be given in the order of dectree2.classes_ (the sorted labels
# seen during fit). A hand-written list in any other order labels the nodes with
# the wrong rating, and its length may not match the classes actually present.
dot_data2 = export_graphviz(dectree2, out_file=None,
                            feature_names=list(gs_train.columns),
                            filled=True,
                            class_names=[str(label) for label in dectree2.classes_])

import graphviz
# Only the last expression of a cell is rendered automatically, so the tree is
# shown explicitly (display is provided by the notebook kernel).
display(graphviz.Source(dot_data2))

train_pred2 = dectree2.predict(gs_train)
test_pred2 = dectree2.predict(gs_test)

print("Train Classification Accuracy:",dectree2.score(gs_train,ra_train))
print("Test Classification Accuracy:",dectree2.score(gs_test,ra_test))
from sklearn.metrics import confusion_matrix
# Confusion matrices side by side: train on the left, test on the right
f, axes = plt.subplots(1, 2, figsize=(24, 6))
train_m = confusion_matrix(ra_train,train_pred2)
test_m = confusion_matrix(ra_test,test_pred2)
sb.heatmap(train_m,annot = True, fmt=".0f", annot_kws={"size": 18},ax = axes[0])
sb.heatmap(test_m,annot = True, fmt=".0f", annot_kws={"size": 18},ax = axes[1])
axes[0].set_title("Train")
axes[1].set_title("Test")
plt.show()




## Predicting Rating by Genre and Global_Sales

In [None]:


# Features: Genre (integer-encoded below) and Global_Sales.
# The targets ra_train / ra_test come from the previous cell.
gegs_train = train[['Genre','Global_Sales']].copy()
gegs_test = test[['Genre','Global_Sales']].copy()

# One shared Genre -> integer table (same codes as before), applied to both splits
genreCodes = {genre: code for code, genre in enumerate(
    ['Sports', 'Racing', 'Platform', 'Misc', 'Action', 'Puzzle',
     'Shooter', 'Fighting', 'Simulation', 'Role-Playing', 'Adventure',
     'Strategy'])}

# Assign the encoded column back to the frame. Calling replace(..., inplace=True)
# on gegs_train['Genre'] is a chained in-place update on a column selection,
# which does not modify the frame under pandas Copy-on-Write.
gegs_train['Genre'] = gegs_train['Genre'].map(genreCodes)
gegs_test['Genre'] = gegs_test['Genre'].map(genreCodes)
# map() turns a genre missing from the table into NaN; fail loudly if that happens
assert gegs_train['Genre'].notna().all() and gegs_test['Genre'].notna().all(), "Genre value missing from genreCodes"

# Re-fit the classifier from the previous cell on the two-feature data
dectree2.fit(gegs_train,ra_train)

# class_names must follow the order of dectree2.classes_ (sorted labels seen during
# fit); a hand-written list in another order labels nodes with the wrong rating.
dot_data2 = export_graphviz(dectree2, out_file=None,
                            feature_names=list(gegs_train.columns),
                            filled=True,
                            class_names=[str(label) for label in dectree2.classes_])
# Render the tree explicitly; building dot_data2 alone draws nothing
# (graphviz was imported in the previous cell, display comes from the notebook kernel).
display(graphviz.Source(dot_data2))

train_pred2 = dectree2.predict(gegs_train)
test_pred2 = dectree2.predict(gegs_test)
print("Train Classification Accuracy:",dectree2.score(gegs_train,ra_train))
print("Test Classification Accuracy:",dectree2.score(gegs_test,ra_test))
# Confusion matrices side by side: train on the left, test on the right
f, axes = plt.subplots(1, 2, figsize=(24, 6))
train_m = confusion_matrix(ra_train,train_pred2)
test_m = confusion_matrix(ra_test,test_pred2)
sb.heatmap(train_m,annot = True, fmt=".0f", annot_kws={"size": 18},ax = axes[0])
sb.heatmap(test_m,annot = True, fmt=".0f", annot_kws={"size": 18},ax = axes[1])
axes[0].set_title("Train")
axes[1].set_title("Test")
plt.show()

In [None]:
print("helllo world")