In [1]:
import h5py 
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold

# Extract Data

In [2]:
# Open the HDF5 dataset read-only. The handle `f` is left open on purpose:
# the loading cell that follows reads its datasets through it.
f = h5py.File('SG24_dataset.h5', 'r')
# Display the names of the entries stored at the root of the file.
f.keys()

<KeysViewHDF5 ['Predictors', 'Target', 'User']>

In [3]:
def _read_dataset(h5_file, name):
    """Copy the HDF5 dataset ``name`` from ``h5_file`` into a new NumPy array.

    The array is allocated with the dataset's stored shape (NumPy's default
    float dtype) and filled in place with ``read_direct``; no reshaping or
    transposition happens here.
    """
    dataset = h5_file[name]
    values = np.zeros(dataset.shape)
    dataset.read_direct(values)
    return values


# Transposed so that each row is one sample and each column one feature.
predictors = _read_dataset(f, 'Predictors').T

# Only the first row of the stored array is kept, giving a flat label vector.
target = _read_dataset(f, 'Target')[0]

# Transposed like the predictors; stays two-dimensional (one column).
user = _read_dataset(f, 'User').T

# The three arrays must describe the same samples, row for row.
assert len(predictors) == len(target) == len(user), (
    "Predictors, Target and User do not have the same number of samples"
)

print(predictors.shape)
print(target.shape)
print(user.shape)

(2400, 29)
(2400,)
(2400, 1)


# Simple Describe

In [5]:
# Per-feature summary statistics of the predictor matrix.
pd.DataFrame(data=predictors).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
count,2400.0,2400.0,2400.0,2400.0,2400.0,2400.0,2400.0,2400.0,2400.0,2400.0,...,2400.0,2400.0,2400.0,2400.0,2400.0,2400.0,2400.0,2400.0,2400.0,2400.0
mean,53.669293,-5.698329,0.203902,0.491011,-0.140472,0.164828,-0.14961,129.851667,137.50375,145.390833,...,110.054167,82.854167,103.005417,91.606667,143.834167,109.237083,120.66875,82.5625,137.099167,83.0475
std,41.520634,32.720964,22.393903,0.249192,0.281654,0.690302,0.268272,32.298842,15.895186,31.776834,...,46.031646,20.323427,45.260604,22.685341,53.253552,17.291823,29.802893,48.056139,21.926745,30.394479
min,0.0,-81.559349,-68.739662,0.0,-0.855817,-1.0,-0.789427,1.0,44.0,36.0,...,0.0,1.0,1.0,25.0,0.0,1.0,1.0,1.0,0.0,0.0
25%,0.0,-26.450032,-15.801244,0.314152,-0.387735,-0.846747,-0.360048,122.0,128.0,121.75,...,63.0,71.0,79.0,75.0,92.0,105.0,104.0,49.0,131.0,64.0
50%,70.621365,0.0,0.0,0.555695,0.0,0.471377,-0.062433,128.0,135.0,152.0,...,132.0,81.0,114.0,92.0,159.0,106.0,126.0,86.0,141.0,78.0
75%,84.528341,0.0,17.166466,0.691598,0.0,0.629917,0.0,134.0,143.0,170.0,...,151.0,93.0,134.0,111.0,192.0,107.0,142.0,116.0,149.0,101.0
max,127.721718,78.185921,59.903419,0.955188,0.727471,1.0,0.78239,238.0,206.0,207.0,...,179.0,170.0,218.0,229.0,248.0,255.0,189.0,216.0,192.0,193.0


In [24]:
# Number of samples per class label (Counter consumes the array directly).
Counter(target)

Counter({1.0: 100,
         2.0: 100,
         3.0: 100,
         4.0: 100,
         5.0: 100,
         6.0: 100,
         7.0: 100,
         8.0: 100,
         9.0: 100,
         10.0: 100,
         11.0: 100,
         12.0: 100,
         13.0: 100,
         14.0: 100,
         15.0: 100,
         16.0: 100,
         17.0: 100,
         18.0: 100,
         19.0: 100,
         20.0: 100,
         21.0: 100,
         22.0: 100,
         23.0: 100,
         24.0: 100})

In [25]:
# Number of samples contributed by each user id (single column of `user`).
Counter(user[:, 0])

Counter({1.0: 600,
         2.0: 600,
         3.0: 360,
         4.0: 240,
         5.0: 240,
         6.0: 120,
         7.0: 120,
         8.0: 120})

# Choix des modèles et des hyperparamètres à tester

In [6]:
# Candidate estimators, each paired with the hyperparameter grid to search.
models = []

# Perceptron: `alpha` only multiplies the regularisation term, and the
# default `penalty=None` disables regularisation altogether. A penalty must
# therefore be set, otherwise both alpha values yield the same model.
percep = Perceptron(penalty='l2')
percep_params = {'alpha': [0.001, 0.0001]}
models.append((percep, percep_params))

# k-nearest neighbours: compare two neighbourhood sizes.
knn = KNeighborsClassifier()
knn_params = {'n_neighbors': [3, 5]}
models.append((knn, knn_params))

# Découpage en folds

In [7]:
# Stratified folds are used (rather than a plain KFold) so that the splits
# respect the class labels passed as `y`.
# The fixed seed makes the shuffled splits repeatable: every grid search that
# receives this splitter is scored on the same folds, and rerunning the
# notebook reproduces the same comparison.
# NOTE(review): the samples come from several users (see the `user` counts
# above); if the model must generalise to unseen users, a group-aware
# splitter may be more appropriate — confirm.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Application des recherches par grille (GridSearchCV) et récupération du meilleur modèle

In [8]:
# Best estimator found so far and its cross-validation score; the score starts
# at -inf so that the first candidate always replaces the placeholder.
bestModel, bestScore = None, -np.inf

for model, params in models:
    # Run the grid search for this estimator using the `kfold` splitter.
    currentBest = GridSearchCV(model, params, cv=kfold).fit(predictors, target)

    # Strict comparison: on a tie, the model that came first in `models` stays.
    if currentBest.best_score_ > bestScore:
        bestScore, bestModel = currentBest.best_score_, currentBest.best_estimator_

# Utilisation du meilleur estimateur

In [11]:
# Train the selected model and hyperparameters on all available data.
# NOTE(review): GridSearchCV is created above without a `refit` argument, and
# its default (refit=True) should already leave `best_estimator_` fitted on the
# full dataset, so this call looks redundant — confirm before removing it.
bestModel.fit(predictors, target)

KNeighborsClassifier(n_neighbors=3)

In [12]:
# Predict on the training data itself. This only shows how well the model
# reproduces the samples it was fitted on; it is not a measure of performance
# on unseen data.
bestModel.predict(predictors)

array([ 1.,  1.,  1., ...,  5., 24.,  7.])