In [10]:
import csv
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection  import  train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# personal imports
import data_extractor as de
import accuracy_extractor as ae
from multi_labels_encoder import ThreeFeaturesEncoder
from Train_n_test import TrainValidateTest

## Extraction and preprocessing

In [15]:
# Cutoff forwarded to the extractor. Presumably the number of seconds of each
# game that is kept -- confirm against data_extractor.get_dataframe.
limit_seconds = 320

# Raw string literal: the Windows path contains backslash sequences
# ("\D", "\I", "\V", "\T") that are invalid escapes in a normal string literal
# and trigger a Deprecation/SyntaxWarning on recent Python versions.
# NOTE(review): this is an absolute, machine-specific path; consider reading it
# from a configurable data directory instead.
df_training = de.get_dataframe(
    r"D:\DATA\Documents\INFO\VideoGames\TRAIN.csv",
    training=True,
    limit_seconds=limit_seconds,  # reuse the constant instead of repeating 320
)

print("------------------------- Raw data --------------------------")
df_training.info()
# Last expression of the cell: rendered as a table by the notebook.
df_training.head()

------------------------- Raw data --------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3052 entries, 0 to 3051
Columns: 1876 entries, id_player to 1873
dtypes: object(1876)
memory usage: 43.7+ MB


Unnamed: 0,id_player,played_race,0,1,2,3,4,5,6,7,...,1864,1865,1866,1867,1868,1869,1870,1871,1872,1873
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,s,s,Base,s,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,s,Base,s,s,Base,s,s,Base,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,Base,s,s,Base,s,s,s,Base,...,,,,,,,,,,
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,


In [16]:
# Turn the raw frame into the "counts version" announced by the banner below.
# In the recorded output this is a 35-column frame: id_player, played_race and
# one int64 column per action name (Base, SingleMineral, s, hotkeyXY, ...).
# The meaning of the second positional argument (True) is not visible here --
# presumably the "training" flag; confirm in data_extractor.transform_sample.
df_training_numerical = de.transform_sample(df_training, True)
print("------------------------- (Encoded) Data in the counts version --------------------------")
# Column dtypes / non-null counts, then a preview of the first rows.
df_training_numerical.info()
df_training_numerical.head()

------------------------- (Encoded) Data in the counts version --------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3052 entries, 0 to 3051
Data columns (total 35 columns):
id_player        3052 non-null object
played_race      3052 non-null object
Base             3052 non-null int64
SingleMineral    3052 non-null int64
hotkey00         3052 non-null int64
hotkey10         3052 non-null int64
hotkey12         3052 non-null int64
hotkey20         3052 non-null int64
hotkey22         3052 non-null int64
hotkey30         3052 non-null int64
hotkey32         3052 non-null int64
hotkey40         3052 non-null int64
hotkey42         3052 non-null int64
hotkey50         3052 non-null int64
hotkey52         3052 non-null int64
hotkey60         3052 non-null int64
hotkey62         3052 non-null int64
hotkey70         3052 non-null int64
s                3052 non-null int64
hotkey80         3052 non-null int64
hotkey72         3052 non-null int64
hotkey02         3052 no

Unnamed: 0,id_player,played_race,Base,SingleMineral,hotkey00,hotkey10,hotkey12,hotkey20,hotkey22,hotkey30,...,hotkey41,hotkey21,hotkey82,hotkey51,hotkey31,hotkey61,hotkey71,hotkey81,hotkey91,hotkey01
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,3,4,1,4,32,3,9,1,...,0,0,0,0,0,0,0,0,0,0
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,4,0,0,6,61,4,23,3,...,0,0,0,0,0,0,0,0,0,0
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,2,2,1,3,23,3,6,1,...,0,0,0,0,0,0,0,0,0,0
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,4,0,0,4,63,3,16,3,...,0,0,0,0,0,0,0,0,0,0
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,2,0,1,4,22,1,12,1,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Build the project's encoder from the counts frame and apply it to that same
# frame. In the recorded output, id_player and played_race change from object
# to int64 columns.
# NOTE(review): in the recorded output, count cells that were 0 before encoding
# are displayed as 3 afterwards (e.g. SingleMineral, hotkey41) -- confirm that
# ThreeFeaturesEncoder is only meant to touch the categorical columns.
encoder = ThreeFeaturesEncoder(df_training_numerical)
encoded_df_training = encoder.encode_df(df_training_numerical)
print("------------------------- Encoded data --------------------------")
encoded_df_training.info()
# Preview of the first 50 encoded rows.
encoded_df_training.head(50)
# Odd that the player indexing goes from 1 to 3 here. Then again, it does not change the results.

------------------------- Encoded data --------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3052 entries, 0 to 3051
Data columns (total 35 columns):
id_player        3052 non-null int64
played_race      3052 non-null int64
Base             3052 non-null int64
SingleMineral    3052 non-null int64
hotkey00         3052 non-null int64
hotkey10         3052 non-null int64
hotkey12         3052 non-null int64
hotkey20         3052 non-null int64
hotkey22         3052 non-null int64
hotkey30         3052 non-null int64
hotkey32         3052 non-null int64
hotkey40         3052 non-null int64
hotkey42         3052 non-null int64
hotkey50         3052 non-null int64
hotkey52         3052 non-null int64
hotkey60         3052 non-null int64
hotkey62         3052 non-null int64
hotkey70         3052 non-null int64
s                3052 non-null int64
hotkey80         3052 non-null int64
hotkey72         3052 non-null int64
hotkey02         3052 non-null int64
hotkey90     

Unnamed: 0,id_player,played_race,Base,SingleMineral,hotkey00,hotkey10,hotkey12,hotkey20,hotkey22,hotkey30,...,hotkey41,hotkey21,hotkey82,hotkey51,hotkey31,hotkey61,hotkey71,hotkey81,hotkey91,hotkey01
0,0,3,3,4,1,4,32,3,9,1,...,3,3,3,3,3,3,3,3,3,3
1,1,3,4,3,3,6,61,4,23,3,...,3,3,3,3,3,3,3,3,3,3
2,0,3,2,2,1,3,23,3,6,1,...,3,3,3,3,3,3,3,3,3,3
3,1,3,4,3,3,4,63,3,16,3,...,3,3,3,3,3,3,3,3,3,3
4,0,3,2,3,1,4,22,1,12,1,...,3,3,3,3,3,3,3,3,3,3
5,1,3,3,3,3,6,57,4,24,3,...,3,3,3,3,3,3,3,3,3,3
6,0,3,2,3,1,3,19,4,7,1,...,3,3,3,3,3,3,3,3,3,3
7,1,3,5,3,3,6,67,2,21,3,...,3,3,3,3,3,3,3,3,3,3
8,2,3,6,3,3,3,23,27,87,1,...,3,3,3,3,3,3,3,3,3,3
9,3,1,3,3,1,8,203,2,73,1,...,3,3,3,3,3,3,3,3,3,3


## Training and testing with k-fold

In [18]:
# Feature matrix: every encoded column except the prediction target.
# The target is dropped by name rather than by position (`iloc[:, 1:]`), so the
# label cannot silently leak into the features if the column order of the
# encoded frame ever changes. This also matches how the label vector is
# selected (by the `id_player` name).
X_training = encoded_df_training.drop(columns=["id_player"])
X_training.info()
# Last expression of the cell: rendered as a table by the notebook.
X_training.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3052 entries, 0 to 3051
Data columns (total 34 columns):
played_race      3052 non-null int64
Base             3052 non-null int64
SingleMineral    3052 non-null int64
hotkey00         3052 non-null int64
hotkey10         3052 non-null int64
hotkey12         3052 non-null int64
hotkey20         3052 non-null int64
hotkey22         3052 non-null int64
hotkey30         3052 non-null int64
hotkey32         3052 non-null int64
hotkey40         3052 non-null int64
hotkey42         3052 non-null int64
hotkey50         3052 non-null int64
hotkey52         3052 non-null int64
hotkey60         3052 non-null int64
hotkey62         3052 non-null int64
hotkey70         3052 non-null int64
s                3052 non-null int64
hotkey80         3052 non-null int64
hotkey72         3052 non-null int64
hotkey02         3052 non-null int64
hotkey90         3052 non-null int64
hotkey92         3052 non-null int64
hotkey11         3052 non-null int64
hotke

Unnamed: 0,played_race,Base,SingleMineral,hotkey00,hotkey10,hotkey12,hotkey20,hotkey22,hotkey30,hotkey32,...,hotkey41,hotkey21,hotkey82,hotkey51,hotkey31,hotkey61,hotkey71,hotkey81,hotkey91,hotkey01
0,3,3,4,1,4,32,3,9,1,151,...,3,3,3,3,3,3,3,3,3,3
1,3,4,3,3,6,61,4,23,3,144,...,3,3,3,3,3,3,3,3,3,3
2,3,2,2,1,3,23,3,6,1,172,...,3,3,3,3,3,3,3,3,3,3
3,3,4,3,3,4,63,3,16,3,121,...,3,3,3,3,3,3,3,3,3,3
4,3,2,3,1,4,22,1,12,1,157,...,3,3,3,3,3,3,3,3,3,3


In [19]:
# Label vector: the encoded player id of every row, as a plain NumPy array.
y_training = encoded_df_training["id_player"].values
print(y_training)

[  0   1   0 ... 199 196 199]


In [20]:
# Random-forest candidate: 256 trees, each allowed to grow up to depth 512.
# random_state pins the bootstrap sampling and feature sub-sampling so that
# re-running the notebook reproduces the same scores.
# NOTE(review): a later cell rebinds `model` to an MLPClassifier, so on a
# top-to-bottom run this forest is not the estimator that gets cross-validated.
model = RandomForestClassifier(n_estimators=256, max_depth=512, random_state=42)

In [23]:
# Neural-network version: rebinds `model`, replacing any estimator bound to
# that name by an earlier cell.
# random_state pins the weight initialisation and the mini-batch sampling used
# by the 'adam' solver, so the cross-validation scores are reproducible.
model = MLPClassifier(solver='adam', random_state=42)

# Standardize every feature column (zero mean, unit variance) and rebind
# X_training to the scaled NumPy array returned by the scaler.
# NOTE(review): the scaler is fitted on the full training set before
# cross-validation, so the validation folds influence the scaling statistics.
# Wrapping the scaler and the classifier in a scikit-learn Pipeline would avoid
# that, but it would change what `X_training` and `scaler` hold for any later
# cell that relies on them.
scaler = StandardScaler()
X_training = scaler.fit_transform(X_training)

In [24]:
# Number of cross-validation folds.
k = 5
# One accuracy-style score per fold for the currently bound `model`.
scores = cross_val_score(estimator=model, X=X_training, y=y_training, cv=k)
# Last expression of the cell: the notebook displays the array of fold scores.
scores



array([0.85817656, 0.8778626 , 0.90082645, 0.89583333, 0.9047619 ])