In [1]:
def PCA(Y,X,varExplained):
    """Principal-component regression of Y on the standardized columns of X.

    X is standardized with StandardScaler, its covariance matrix is
    eigen-decomposed, and the components with the largest eigenvalues are kept
    until their cumulative share of the total variance reaches
    ``varExplained``. Y is then regressed by least squares on the scores of
    the retained components; the mean of Y is used as the intercept.

    Arguments:
        Y : target values, one per row of X
        X : explanatory variables, shape (observations, variables)
        varExplained : share of the total variance (between 0 and 1) that the
            retained components must reach; if it is never reached, every
            component is kept

    Returns:
        Y_new : fitted values for the rows of X, clipped below at 0
        B : regression coefficients, one per retained component
        K_vect : retained eigenvectors stored as columns,
            shape (variables, retained components)
        uY : mean of Y (the intercept added to the predictions)
    """
    # Standardize X to zero mean / unit variance per column
    X_std = StandardScaler().fit_transform(X)
    nbrOfObs = X_std.shape[0]

    # Covariance matrix. The divisor is (observations - 1); dividing by
    # (columns - 1) only rescales the eigenvalues but is a division by zero
    # for a single-column X. max() guards the single-row case.
    E = np.dot(X_std.T, X_std) / max(nbrOfObs - 1, 1)

    # E is symmetric, so eigh applies: it returns real eigenvalues and real,
    # orthonormal eigenvectors (one per column), whereas eig may return
    # complex vectors for a numerically noisy symmetric matrix.
    eig_val, eig_vec = np.linalg.eigh(E)

    # Share of the total variance carried by each eigenvalue
    total_variance = np.sum(eig_val)
    if total_variance > 0:
        eig_val_contribution = eig_val / total_variance
    else:
        # Degenerate X (no variance at all): nothing can be "explained",
        # so the threshold is never reached and every component is kept.
        eig_val_contribution = np.zeros_like(eig_val)

    # Component indices ordered by decreasing contribution
    order = np.argsort(-eig_val_contribution, kind="stable")

    # Number of leading components needed to reach the requested variance
    varCumul = np.cumsum(eig_val_contribution[order])
    reached = np.flatnonzero(varCumul >= varExplained)
    varIndex = reached[0] + 1 if reached.size else len(order)

    # Retained eigenvectors (as columns) and the matching component scores
    K_vect = eig_vec[:, order[:varIndex]]
    Z = np.dot(X_std, K_vect)

    # Least-squares coefficients of Y on the scores. lstsq avoids forming and
    # inverting Z.T Z explicitly and still returns a solution when a retained
    # component has (numerically) zero variance.
    B = np.linalg.lstsq(Z, Y, rcond=None)[0]

    # The scores are centered, so the intercept is simply the mean of Y
    uY = np.mean(Y)

    # Fitted values, clipped so that no prediction is negative
    Y_new = np.dot(Z, B) + uY
    Y_new = Y_new.clip(min=0)

    return Y_new, B, K_vect, uY


In [2]:
def PCA_trained(X,B,K_vect,uY,scaler=None):
    """Predict with a principal-component regression that was already fitted.

    Arguments:
        X : explanatory variables to predict for, shape (observations, variables)
        B : regression coefficients, one per retained component
        K_vect : retained eigenvectors as columns, shape (variables, components)
        uY : intercept added to every prediction
        scaler : optional object exposing ``transform(X)``, already fitted on
            the training matrix. When given, X is standardized with the
            training statistics. When omitted, a fresh StandardScaler is
            fitted on X itself (the historical behaviour), which means the
            rows being predicted are scaled with their own mean/variance
            rather than with those the model was trained on.

    Returns:
        Y_new : predictions for the rows of X, clipped below at 0
    """
    if scaler is None:
        # Historical behaviour: standardize X with its own statistics
        X_std = StandardScaler().fit_transform(X)
    else:
        # Reuse the statistics learned on the training data
        X_std = scaler.transform(X)

    # Project onto the retained components
    Z = np.dot(X_std, K_vect)

    # Linear prediction plus intercept
    Y_new = np.dot(Z, B) + uY

    # Predictions are not allowed to be negative
    return Y_new.clip(min=0)
    

In [3]:
def estimateCol(trainSet,testSet,colName,varExplained):
    """Fit the PCA regression for one column on trainSet and predict it on testSet.

    Arguments:
        trainSet : DataFrame used for fitting; must contain ``colName``
        testSet : DataFrame to predict for; ``colName`` is dropped from it
        colName : name of the column to estimate
        varExplained : variance share forwarded to PCA

    Returns:
        The predictions produced by PCA_trained for the rows of testSet.
    """
    # Target column, and every other column as explanatory variables
    target = np.array(trainSet[colName])
    train_features = np.array(trainSet.drop([colName], axis=1))

    # Fit; the fitted values for the training rows are not needed here
    _fitted, coefficients, components, intercept = PCA(target, train_features, varExplained)

    # Apply the fitted model to the test rows
    test_features = np.array(testSet.drop([colName], axis=1))
    return PCA_trained(test_features, coefficients, components, intercept)

In [4]:
# Evaluation cell: estimate NA_Sales and Global_Sales on the test set, then
# report a predictive R2 for each.
# NOTE(review): the recorded output of this cell is
# "NameError: name 'pd' is not defined" -- pandas has to be imported as `pd`
# (and numpy as `np`) before this cell can run.

# Start from a copy of the two target columns of the test set; both columns
# are overwritten with model estimates just below.
df_test_estimated = pd.DataFrame([df_test.NA_Sales,df_test.Global_Sales]).copy().T


# Estimate each column from the training set, with 0.999 passed as the
# variance share the PCA regression must retain.
df_test_estimated['NA_Sales'] = estimateCol(df_train,df_test,'NA_Sales',0.999)
df_test_estimated['Global_Sales'] = estimateCol(df_train,df_test,'Global_Sales',0.999)

# Build the matrices of explanatory variables
# NOTE(review): `trainSet` is only a parameter name of estimateCol; no global
# of that name is defined in the visible cells. Presumably `df_train` or
# `df_test` was intended -- confirm before running.
X1 = np.array(trainSet.drop(['Global_Sales'], axis=1))
X2 = np.array(trainSet.drop(['NA_Sales'], axis=1))

# Estimated values of each target on the test set
Y1 = df_test_estimated.Global_Sales.values
Y2 = df_test_estimated.NA_Sales.values

# NOTE(review): `R2_prev`, `B1`, `B2` and `X` are not defined anywhere in the
# visible cells, so these two lines cannot run as written; they may rely on
# state from another notebook or a deleted cell -- confirm.
print("Global R2_prev : %.3f" % (R2_prev(Y1,np.dot(X1,B1),X)))
print("NA R2_prev : %.3f" % (R2_prev(Y2,np.dot(X2,B2),X)))

NameError: name 'pd' is not defined

In [1]:
"""

Fonction qui affiche la valeur moyenne de la variable d'interet en fonction des
valeurs d'une variable explicative qualitative.

Arguments:
    dataset : Le jeu de donnees sur lequel nous appliquons la fonction
    Y : la variable d'interet
    colsName : Le prefixe commun des colonnes de la variable qualitative que nous souhaitons etudier
    minNbr : Seules les categories comptant plus de minNbr observations sont affichees (defaut : 0)


"""

def clusterVar(dataset,Y,colsName,minNbr=0):
    """Print the mean of Y for each column whose name starts with ``colsName``.

    For every selected column the function computes
    ``sum(column * Y) / (number of rows where the column is non-zero)`` and
    prints the results ordered by decreasing mean. The columns are presumably
    0/1 dummy columns of one qualitative variable, in which case this is the
    mean of Y inside each category.

    A second listing prints one ``X12_<k> = testSet['<column>'].values`` line
    per displayed column (apparently meant to be copy-pasted into other code).

    Arguments:
        dataset : DataFrame holding the columns to study
        Y : target values, one per row of dataset
        colsName : prefix shared by the columns to study
        minNbr : only columns with strictly more than minNbr non-zero rows
            are printed (default 0)

    Returns:
        None; the function only prints.
    """
    # Keep only the columns whose name starts with the requested prefix
    reducedSet = dataset.loc[:, dataset.columns.str.startswith(colsName)]

    rows = []
    for position, col in enumerate(reducedSet.columns):
        column = reducedSet[col]

        # Number of rows belonging to this category (non-zero entries)
        count = column.astype(bool).sum(axis=0)

        # Divide by the category size. Dividing by the number of non-zero
        # products instead would leave out members whose Y is exactly 0 and
        # disagree with the 'Nbr' column printed next to the mean.
        total = np.sum(column * Y)
        mean = total / count if count else 0.0
        if np.isnan(mean):
            mean = 0.0

        rows.append((position, mean, count, col))

    # Highest mean first (stable: ties keep their column order)
    rows.sort(key=lambda row: row[1], reverse=True)

    # Only categories with enough observations are displayed
    shown = [row for row in rows if row[2] > minNbr]

    print('Index \t Moy \t Nbr \t Name')
    print('------------------------------------------')
    for position, mean, count, name in shown:
        print('%d. \t %.3f \t %d \t %s ' % (position, mean, count, name))

    print('\n\n\n')
    for newIndex, (_, _, _, name) in enumerate(shown, start=1):
        print("X12_%d = testSet['%s'].values" % (newIndex, name))

In [2]:
"""

Regression lineaire par methode des moindres carres 

Arguments :

    y : Variable d'interet (dimension : n x 1)
    x : Matrice contenant nos variables explicatives (dimension : n x (p+1))
    
    ou,
        n : nombre d'observation
        p : nombre de variable explicative
    

Retourne : 

    B = Vecteur de nos coefficients de regression (dimension (p+1) x 1)

    Pour estimer la valeur de y_new avec de nouvelles observations (i.e. x_new) on a,
        y_new = np.dot(x_new,B)

"""

def linear_regression(y,x):
    """Ordinary least-squares regression of y on the columns of x.

    Arguments:
        y : target values, one per row of x
        x : 2-D matrix of explanatory variables, shape (observations, columns);
            include a column of ones if an intercept is wanted

    Returns:
        B : least-squares coefficients, one per column of x, so that new
            predictions are obtained with ``np.dot(x_new, B)``
    """
    # Solve the least-squares problem directly instead of forming and
    # inverting x.T x: this is numerically more stable and, when columns of x
    # are collinear (x.T x singular), returns the minimum-norm solution
    # instead of failing.
    B = np.linalg.lstsq(np.asarray(x), np.asarray(y), rcond=None)[0]

    return B

In [1]:
"""

Ici on a une fonction pour nous aider a visualiser les coefficients de regression.

Puisque notre matrice de variables explicatives est normalisee, un coefficient b_i eleve indique
que la variable X_i est importante.

Par contre nous ne testons pas ici des relations du type : log(X_i), X_i^2, X_(i+1)*X_i, etc.

"""

def cofficientTrier(dataset,Y,l,coeffThreshold):
    """Print the explanatory variables whose regression coefficient matters most.

    Every column of dataset except 'NA_Sales' and 'Global_Sales' is used as an
    explanatory variable (all-zero columns are dropped). The matrix is passed
    through ``scaler`` and then to ``ridge_regression(Y, X_std, l)``. A
    variable is displayed when its share ``|B_i| / sum(|B|)`` is at least
    ``coeffThreshold``; displayed variables are ordered by decreasing share.

    Three listings are printed: the table of shares, one
    ``X<k> = dataset['<column>'].values`` line per displayed variable, and one
    ``X<k>,`` line per displayed variable (apparently meant to be copy-pasted).

    Arguments:
        dataset : DataFrame holding the explanatory variables and the two
            sales columns
        Y : target values, one per row of dataset
        l : third argument forwarded to ridge_regression (presumably the
            ridge penalty -- confirm against its definition)
        coeffThreshold : minimum share of sum(|B|) for a variable to be shown

    Returns:
        None; the function only prints.
    """
    # Explanatory variables: everything except the two sales targets
    X = dataset.drop(['NA_Sales','Global_Sales'],axis=1)

    # Drop columns that contain only zeros
    X = X.loc[:, (X != 0).any(axis=0)]

    # Standardize (mean 0, variance 1) and fit the regression
    X_std = scaler(X.values)
    B = ridge_regression(Y,X_std,l)

    # Work on a flat vector so that each coefficient is a scalar
    # (assumes ridge_regression returns one coefficient per column of X_std,
    # as a vector or a single-column matrix -- TODO confirm)
    coefs = np.asarray(B, dtype=float).ravel()

    # Share of each coefficient in the total absolute weight
    sumB = np.sum(np.abs(coefs))
    if sumB > 0:
        fractions = np.abs(coefs) / sumB
    else:
        fractions = np.zeros_like(coefs)

    # Keep the variables whose share reaches the threshold. The comparison
    # uses the absolute share, the same quantity that is displayed, so that a
    # strongly negative coefficient is not silently discarded.
    kept = [(index, fraction, col)
            for index, (col, fraction) in enumerate(zip(X.columns, fractions))
            if fraction >= coeffThreshold]

    # Largest share first (stable: ties keep their column order)
    kept.sort(key=lambda row: row[1], reverse=True)

    sumBFrac = sum(fraction for _, fraction, _ in kept)

    print("Fraction of B displayed = %.3f %%" % sumBFrac)
    print('Index \t B_i %% \t Name \t (l = %.2f  -  minB = %.3f %%)' % (l,coeffThreshold))
    print('------------------------------------------')
    for index, fraction, name in kept:
        print('%d. \t %.3f \t %s ' % (index, fraction, name))

    print('\n')

    # Ready-to-paste assignments for the displayed variables
    for i, (_, _, name) in enumerate(kept):
        print("X%d = dataset['%s'].values" % (i, name))

    print('\n')

    for i in range(len(kept)):
        print("X%d," % i)
           

In [2]:
def createX_Global(dataset):
    """Build the design matrix of the Global_Sales model from ``dataset``.

    The matrix contains hand-picked terms (found by trial and error): raw,
    squared and product terms of the quantitative columns, individual or
    summed platform / genre / rating / publisher dummy columns, and the
    'Complete_Data' column. The terms are passed through ``scaler`` and a
    leading column of ones is prepended.

    Arguments:
        dataset : DataFrame containing every column read below

    Returns:
        X_std : array of shape (len(dataset), 30); column 0 is all ones
    """
    # --- Platform columns ---
    snes = dataset.Platform_SNES.values
    x360 = dataset.Platform_X360.values
    atari2600 = dataset.Platform_2600.values
    wii = dataset.Platform_Wii.values
    n64 = dataset.Platform_N64.values
    xone = dataset.Platform_XOne.values
    gba = dataset.Platform_GBA.values
    gamecube = dataset.Platform_GC.values
    ds = dataset.Platform_DS.values
    xbox = dataset.Platform_XB.values
    saturn = dataset.Platform_SAT.values

    # --- Genre columns ---
    genre_platform = dataset.Genre_Platform.values
    genre_shooter = dataset.Genre_Shooter.values
    genre_racing = dataset.Genre_Racing.values
    genre_sports = dataset.Genre_Sports.values
    genre_rpg = dataset['Genre_Role-Playing'].values
    genre_strategy = dataset.Genre_Strategy.values

    # --- Rating column ---
    rating_e = dataset.Rating_E.values

    # --- Publisher columns ---
    nintendo = dataset['Publisher_Nintendo'].values
    microsoft = dataset['Publisher_Microsoft Game Studios'].values
    take_two = dataset['Publisher_Take-Two Interactive'].values
    electronic_arts = dataset['Publisher_Electronic Arts'].values
    activision = dataset['Publisher_Activision'].values
    warner = dataset['Publisher_Warner Bros. Interactive Entertainment'].values
    disney = dataset['Publisher_Disney Interactive Studios'].values
    ubisoft = dataset['Publisher_Ubisoft'].values
    thq = dataset['Publisher_THQ'].values
    konami = dataset['Publisher_Konami Digital Entertainment'].values

    # No developer column is used (the 'Developer_Ubisoft' term is left out).

    complete_data = dataset['Complete_Data'].values

    # --- Quantitative columns ---
    jp_sales = dataset.JP_Sales.values
    critic_count = dataset.Critic_Count.values
    critic_score = dataset.Critic_Score.values
    year = dataset.Year_of_Release.values
    other_sales = dataset.Other_Sales.values
    user_score = dataset.User_Score.values
    user_count = dataset.User_Count.values

    # Terms of the model, one per row here, in their final column order
    quantitative_terms = [
        jp_sales,
        jp_sales**2,
        jp_sales*other_sales,
        jp_sales*year,
        other_sales*year,
        other_sales**2,
        year,
        critic_count,
        critic_score,
        user_score,
        user_count,
        user_count**2,
    ]
    platform_terms = [
        snes,
        x360,
        atari2600+wii+n64+xone,
        gba+gamecube+ds,
        xbox,
        saturn,
    ]
    genre_terms = [
        genre_platform,
        genre_shooter,
        genre_racing+genre_sports,
        genre_rpg,
        genre_strategy,
    ]
    publisher_terms = [
        nintendo+microsoft,
        take_two+electronic_arts+activision+warner+disney,
        ubisoft+thq,
        konami,
    ]
    terms = (quantitative_terms + platform_terms + genre_terms
             + [rating_e] + publisher_terms + [complete_data])

    # One column per term
    X = np.array(terms).T

    # Standardize to mean 0 / variance 1
    X_std = scaler(X)

    # Prepend the column of ones
    ones_column = np.ones((len(dataset), 1))
    return np.concatenate((ones_column, X_std), axis=1)

In [None]:
def createX_NA(dataset):
    """Build the design matrix of the NA_Sales model from ``dataset``.

    The matrix contains hand-picked terms: raw, squared and product terms of
    the quantitative columns, individual or summed platform / genre /
    publisher dummy columns, and the 'Complete_Data' column. The terms are
    passed through ``scaler`` and a leading column of ones is prepended.

    Arguments:
        dataset : DataFrame containing every column read below

    Returns:
        X_std : array of shape (len(dataset), 18); column 0 is all ones
    """
    # --- Quantitative columns ---
    # (User_Score and User_Count are not part of this model, so they are no
    # longer read here.)
    X1 = dataset.JP_Sales.values
    X2 = dataset.Critic_Count.values
    X3 = dataset.Critic_Score.values
    X4 = dataset.Year_of_Release.values
    X5 = dataset.Other_Sales.values

    # --- Platform columns ---
    X8_4 = dataset.Platform_2600.values
    X8_5 = dataset.Platform_X360.values
    X8_6 = dataset.Platform_N64.values
    X8_7 = dataset.Platform_Wii.values
    X8_9 = dataset.Platform_XOne.values
    X8_11 = dataset.Platform_SNES.values
    X8_16 = dataset.Platform_GC.values
    X8_18 = dataset.Platform_XB.values
    X8_26 = dataset.Platform_SAT.values

    # --- Genre columns ---
    X9_1 = dataset.Genre_Platform.values
    X9_9 = dataset['Genre_Role-Playing'].values

    # --- Publisher columns ---
    X11_1 = dataset['Publisher_Nintendo'].values
    X11_2 = dataset['Publisher_Microsoft Game Studios'].values
    X11_7 = dataset['Publisher_Activision'].values
    X11_11 = dataset['Publisher_Warner Bros. Interactive Entertainment'].values

    X12 = dataset['Complete_Data'].values

    # The 9th power is evaluated in floating point: with integer-typed score
    # and count columns, (score**9) * count exceeds the int64 range for
    # scores near 100 and would silently wrap around.
    score_power_term = (X3.astype(float)**9)*X2

    X = np.array([X1**2,X1,X1*X5,X5,X1**2*X4**2,X4,X5**2,score_power_term,
                  X8_4+X8_5+X8_6+X8_7+X8_9,X8_11,X8_16+X8_18,X8_26,
                  X9_1,X9_9,
                  X11_1+X11_2,X11_7+X11_11,
                  X12]).T

    # Standardize to mean 0 / variance 1
    X_std = scaler(X)

    # Prepend the column of ones
    Ones = np.ones((len(dataset.JP_Sales), 1))
    X_std = np.concatenate((Ones,X_std),axis = 1)

    return X_std