In [1]:
# Importe les librairies utiles
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Chemin vers les csv
pathTest = "./test3.csv"
pathTrain = "./train3.csv"

# Importe les jeux de donnees
df_test = pd.read_csv(pathTest, sep=',', index_col=0)
df_train = pd.read_csv(pathTrain, sep=',', index_col=0)

In [2]:
"""
--- Variables Explicatives ---
Name : Nom du jeu
Platform : Console sur laquelle le jeu fonctionne
Year of release : Année de sortie du jeu
Genre
Publisher
JP_sales : Nombre de ventes du jeu au Japon en millions d’unités
Other sales : Nombre de ventes du jeu ailleurs dans le monde : Afrique, Asie sans le Japon, Europe sans l’Union Européenne et Amérique du Sud en millions d’unités
Critic_score : Score donné par Metacritic
Critic_count : Nombre de critiques prises en compte pour estimer le Critic_score
User_Score : Score donné par les usagers de Metacritic
User_Count : Nombre d’usagers considérés pour estimer le User_Score
Developer : Compagnie créatrice du jeu
Rating : Classement ESRB (Entertainment Software Rating Board) ie à qui s’addresse le jeu (tout public, majeur, adolescents, etc) 

--- Variable d'interet ---
NA_sales : Nombre de ventes du jeu en Amérique du Nord en millions d’unités
Global_Sales : Nombre de ventes total du jeu en millions d’unités
"""

df_train

Unnamed: 0_level_0,Name,Platform,Year_of_Release,Genre,Publisher,JP_Sales,Other_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,NA_Sales,Global_Sales
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,Zoo Keeper,DS,2004.0,Puzzle,Ignition Entertainment,0.050524,0.010024,74.0,40.0,7.5,13.0,Buddiez- Inc.,E,0.101432,0.172605
2,Do-Konjou Shougakussei: Bon Bita - Hadaka no C...,DS,2010.0,Action,Namco Bandai Games,0.039211,0.001791,,,,,,,0.000440,0.043251
3,Power Play Pool,DS,2006.0,Sports,System 3 Arcade Software,0.000374,0.001797,,,,,System 3,E,0.030829,0.041271
4,SpongeBob SquarePants: Revenge of the Flying D...,PS2,2002.0,Platform,THQ,0.001382,0.140010,,,6.6,41.0,Big Sky Software,E,0.528330,1.080821
5,Deception IV: Blood Ties,PSV,2014.0,Action,Tecmo Koei,0.069422,0.009535,67.0,19.0,7.5,61.0,Koei Tecmo Games,M,0.020629,0.131074
6,Binary Domain,PS3,2012.0,Action,Sega,0.140390,0.030271,72.0,33.0,8.2,179.0,Sega,M,0.089060,0.331157
7,Jikkyou Powerful Pro Yakyuu 3 '97 Haru,SNES,1997.0,Sports,Konami Digital Entertainment,0.161401,0.000962,,,,,,,0.000934,0.164147
8,Monsters vs. Aliens,X360,2009.0,Action,Activision,0.001686,0.011419,63.0,31.0,6.6,7.0,Beenox,E10+,0.121894,0.168837
9,Shugo Chara! Amunonijiro Chara Change,DS,2008.0,Action,Konami Digital Entertainment,0.031413,0.000182,,,,,,,0.000438,0.032830
10,Buzz! Junior: Jungle Party,PS2,2006.0,Misc,Sony Computer Entertainment,0.000294,0.051709,,,,,,,0.198990,0.412267


In [3]:
"""
--- Evaluation des predictions --- 

Dans le document MTH3302_CriteresProjet-1.pdf on nous informe que la precision
de nos estimations sera evaluee avec le root mean square error (RMSE)

On definit cette fonction ci-bas,

    Y : Variable d'interet
    W : Predictions

"""

def RMSE(Y,W):
    
    # Nombre d'observations
    n = len(Y)
    
    total = 0.0
    for i in range(n):
        total += (Y[i] - W[i])**2
    
    mean = total/float(n)
    
    print("RMSE = %.2f" % (mean))
    

In [4]:
"""
--- Variable Qualitative -> Table Binaire --- 

Afin de traiter nos variables qualitatives, nous les transformons en table binaire.
Nous nous basons sur la fonction get_dummies de la librairies Panda pour arriver a nos fins.

Input,
    trainSet : le dataframe complet contenant le jeu de donnees de training avec toutes ses colonnes
    testSet : le dataframe complet contenant le jeu de donnees de training avec toutes ses colonnes
    trainVar : la variable qualitative que nous souhaitons transformer en tableau de variable binaire
    testVar : la variable qualitative que nous souhaitons transformer en tableau de variable binaire

"""

def binerizedQualitativeVariable(trainSet, testSet, trainVar, testVar):
    
    # Generate Binary Table
    trainVar_binary = pd.get_dummies(trainVar)
    testVar_binary = pd.get_dummies(testVar)

    # We concatenate the new columns
    trainSet = pd.concat([trainSet,pd.DataFrame(trainVar_binary)],axis=1)
    testSet = pd.concat([testSet,pd.DataFrame(testVar_binary)],axis=1)

    # Get missing columns in the test set
    missing_categories = set(trainSet) - set(testSet)

    # Add the missing columns in test set with default value equal to 0
    for c in missing_categories:
        testSet[c] = 0

    # Get missing columns in the train set
    missing_categories = set(testSet) - set(trainSet)

    # Add the missing columns in train set with default value equal to 0
    for c in missing_categories:
        trainSet[c] = 0

    # Ensure the order of columns in the test and train sets are the same
    trainSet, testSet = trainSet.align(testSet, axis=1)
    
    # We return the two set
    return trainSet, testSet

In [5]:
# Separons les variables explicatives qualitatives de celles quantitatives
df_train_quanti = df_train.drop(['Name','Platform','Genre','Publisher','Developer','Rating'], axis=1)
df_test_quanti = df_test.drop(['Platform','Genre','Publisher','Developer','Rating'], axis=1)

In [6]:
# Variable Qualitative #1 : Platform
print("Nbr. of categories (train) : %d" %(df_train.Platform.nunique()))
print("Nbr. of categories (test) : %d" %(df_test.Platform.nunique()))

df_train_quanti, df_test_quanti = binerizedQualitativeVariable(df_train_quanti, df_test_quanti, df_train.Platform, df_test.Platform)

Nbr. of categories (train) : 30
Nbr. of categories (test) : 27


In [7]:
# Variable Qualitative #2 : Genre
print("Nbr. of categories (train) : %d" %(df_train.Genre.nunique()))
print("Nbr. of categories (test) : %d" %(df_test.Genre.nunique()))

df_train_quanti, df_test_quanti = binerizedQualitativeVariable(df_train_quanti, df_test_quanti, df_train.Genre, df_test.Genre)

Nbr. of categories (train) : 12
Nbr. of categories (test) : 12


In [8]:
# Variable Qualitative #3 : Publisher
print("Nbr. of categories (train) : %d" %(df_train.Publisher.nunique()))
print("Nbr. of categories (test) : %d" %(df_test.Publisher.nunique()))

df_train_quanti, df_test_quanti = binerizedQualitativeVariable(df_train_quanti, df_test_quanti, df_train.Publisher, df_test.Publisher)

Nbr. of categories (train) : 552
Nbr. of categories (test) : 280


In [9]:
# Variable Qualitative #4 : Developer
print("Nbr. of categories (train) : %d" %(df_train.Developer.nunique()))
print("Nbr. of categories (test) : %d" %(df_test.Developer.nunique()))

"""
Bon ici on a un probleme, si on ajoute les colonnes binaires generees par cette variable on ajoute 100mb
a notre csv. Ceci rend peu pratique le prototypage, pour l'instant on le laisse tomber. On essayera de ce
donner une raison rationnelle de le domper plus tard dans l'analyse. Hypothese, beaucoup de colinearite avec 
le Publisher.
"""
#df_train_quanti, df_test_quanti = binerizedQualitativeVariable(df_train_quanti, df_test_quanti, df_train.Developer, df_test.Developer)

Nbr. of categories (train) : 1593
Nbr. of categories (test) : 677


"\nBon ici on a un probleme, si on ajoute les colonnes binaires generees par cette variable on ajoute 100mb\na notre csv. Ceci rend peu pratique le prototypage, pour l'instant on le laisse tomber. On essayera de ce\ndonner une raison rationnelle de le domper plus tard dans l'analyse. Hypothese, beaucoup de colinearite avec \nle Publisher.\n"

In [10]:
# Variable Qualitative #5 : Rating
print("Nbr. of categories (train) : %d" %(df_train.Rating.nunique()))
print("Nbr. of categories (test) : %d" %(df_test.Rating.nunique()))

df_train_quanti, df_test_quanti = binerizedQualitativeVariable(df_train_quanti, df_test_quanti, df_train.Rating, df_test.Rating)

Nbr. of categories (train) : 8
Nbr. of categories (test) : 5


In [11]:
# On regarde le nombre de NaN par variables explicatives
print("Nombre de NaN sur %d observations \n" % (len(df_train_quanti)))
print(df_train_quanti.isna().sum())

# On garde seulement les observations ou nous avons des donnees (a ameliorer plus tard)
df_train_quanti = df_train_quanti.dropna()

# On calcule le nombre d'observations restantes
print("\nApres clean-up, nombre d'observation : %d" % (len(df_train_quanti)))

Nombre de NaN sur 14094 observations 

10TACLE Studios                   0
1C Company                        0
20th Century Fox Video Games      0
2600                              0
2D Boy                            0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DO                               0
3DS                               0
49Games                           0
505 Games                         0
5pb                               0
7G//AMES                          0
989 Sports               

In [None]:
# On enleve les colonnes qui ne comportent que des zeros
cols_only0 = (df_train_quanti != 0).any(axis=0)
df_train_quanti = df_train_quanti.loc[:,cols_only0]
df_test_quanti = df_test_quanti.loc[:,cols_only0]

# On enleve toutes les colonnes qui sont en double
duplicates = df_train_quanti.T.duplicated()
duplicates = ~duplicates
df_train_quanti = df_train_quanti.T[duplicates].T
df_test_quanti = df_test_quanti.T[duplicates].T


In [None]:
# On sauvegarde nos deux nouveaux jeux de donnees entierement quantitatif
df_train_quanti.to_csv("train_quanti.csv")
df_test_quanti.to_csv("test_quanti.csv")

In [None]:
# Remplacons tous les NaN par des 0

#df_train_quanti.fillna(0)
#df_train_quanti.Year_of_Release.isna().sum()
#df_train_quanti = df_train_quanti[np.isfinite(df_train_quanti.Year_of_Release)]
#df_train_quanti

In [None]:
def linear_regression(y,x):
    
    # Number of Explicative Variables
    k = 0
    try:
        k = x.shape[1]
    except IndexError:
        k = 1
        
    
    # The variance-covariance
    C = np.linalg.inv(np.dot(x.T,x))
    
    B = np.dot(np.dot(C,x.T),y)
    
    return B, np.dot(x,B)

In [None]:
def R2_adj(Y,W,p):

    SS_tot = 0.0
    SS_reg = 0.0
    SS_res = 0.0    
    
    y_S = np.sum(Y)/len(Y)

    for i in range(len(Y)):
        SS_tot += (Y[i] - y_S)**2
        SS_reg += (W[i] - y_S)**2
        SS_res += (Y[i] - W[i])**2
        
    
    n = len(Y)
    
    Radj = 1 - ((SS_res)/float(n-p))/((SS_tot)/(n-1))

    return Radj

In [None]:
# On definit notre variable d'interet
Y = df_train_quanti.NA_Sales

# On definit notre vecteur de variables explicatives
X = np.array(df_train_quanti.drop(['NA_Sales','Global_Sales'], axis=1))
nbrOfVar = X.shape[1]

try:
    b, data = linear_regression(Y,X)
    print(b)
    
except np.linalg.LinAlgError:
    print("NUL, hahaha determinant = 0")
    
