In [1]:
import csv
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection  import  train_test_split
from sklearn.model_selection import cross_val_score

# personal imports
import data_extractor as de
import accuracy_extractor as ae
from multi_labels_encoder import ThreeFeaturesEncoder
from Train_n_test import TrainValidateTest

## Extraction and preprocessing

In [2]:
# Cut-off forwarded to the extractor. It is defined once and passed by name so
# that changing it here is guaranteed to change what gets loaded.
# NOTE(review): presumably the number of game seconds kept per sample — confirm
# against data_extractor.get_dataframe.
limit_seconds = 320
df_training = de.get_dataframe("../input/minitrain.CSV", training=True, limit_seconds=limit_seconds)
print("------------------------- Raw data --------------------------")
# Summary of the raw frame, then a preview of its first rows.
df_training.info()
df_training.head()

------------------------- Raw data --------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2841 entries, 0 to 2840
Columns: 1876 entries, id_player to 1873
dtypes: object(1876)
memory usage: 40.7+ MB


Unnamed: 0,id_player,played_race,0,1,2,3,4,5,6,7,...,1864,1865,1866,1867,1868,1869,1870,1871,1872,1873
0,http://eu.battle.net/sc2/en/profile/1143713/1/...,Terran,s,hotkey30,s,hotkey32,s,hotkey32,s,s,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3368730/1/...,Protoss,Base,s,s,s,s,hotkey30,Base,s,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/1143713/1/...,Terran,s,hotkey30,s,hotkey32,SingleMineral,s,s,hotkey32,...,,,,,,,,,,
3,http://eu.battle.net/sc2/en/profile/3973341/1/...,Protoss,Base,s,s,Base,s,hotkey20,hotkey30,hotkey60,...,,,,,,,,,,
4,http://eu.battle.net/sc2/en/profile/3368730/1/...,Protoss,Base,s,s,Base,s,s,s,hotkey30,...,,,,,,,,,,


In [3]:
# Convert the raw action-sequence frame into the "counts" frame: the .info()
# output of this cell shows the same 2841 rows with id_player, played_race and
# one int64 column per action type.
# NOTE(review): the positional `True` is presumably the training flag — confirm
# against data_extractor.transform_sample.
df_training_numerical = de.transform_sample(df_training, True)
print("------------------------- (Encoded) Data in the counts version --------------------------")
# Summary of the counts frame, then a preview of its first rows.
df_training_numerical.info()
df_training_numerical.head()

------------------------- (Encoded) Data in the counts version --------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2841 entries, 0 to 2840
Data columns (total 35 columns):
id_player        2841 non-null object
played_race      2841 non-null object
SingleMineral    2841 non-null int64
hotkey10         2841 non-null int64
hotkey12         2841 non-null int64
hotkey20         2841 non-null int64
hotkey22         2841 non-null int64
hotkey30         2841 non-null int64
hotkey32         2841 non-null int64
hotkey40         2841 non-null int64
hotkey42         2841 non-null int64
s                2841 non-null int64
Base             2841 non-null int64
hotkey50         2841 non-null int64
hotkey52         2841 non-null int64
hotkey60         2841 non-null int64
hotkey62         2841 non-null int64
hotkey70         2841 non-null int64
hotkey72         2841 non-null int64
hotkey80         2841 non-null int64
hotkey82         2841 non-null int64
hotkey00         2841 no

Unnamed: 0,id_player,played_race,SingleMineral,hotkey10,hotkey12,hotkey20,hotkey22,hotkey30,hotkey32,hotkey40,...,hotkey92,hotkey41,hotkey31,hotkey61,hotkey11,hotkey21,hotkey51,hotkey71,hotkey91,hotkey01
0,http://eu.battle.net/sc2/en/profile/1143713/1/...,Terran,53,1,71,4,227,2,438,3,...,0,0,0,0,0,0,0,0,0,0
1,http://eu.battle.net/sc2/en/profile/3368730/1/...,Protoss,0,0,0,0,0,3,68,2,...,0,0,0,0,0,0,0,0,0,0
2,http://eu.battle.net/sc2/en/profile/1143713/1/...,Terran,18,1,83,1,239,2,375,2,...,0,0,0,0,0,0,0,0,0,0
3,http://eu.battle.net/sc2/en/profile/3973341/1/...,Protoss,0,2,101,11,125,15,91,1,...,0,0,0,0,0,0,0,0,0,0
4,http://eu.battle.net/sc2/en/profile/3368730/1/...,Protoss,0,1,8,1,0,3,56,2,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Build the project encoder from the counts frame, then encode that same frame.
# The .info() output of this cell shows id_player and played_race as int64
# afterwards (they were object columns before).
encoder = ThreeFeaturesEncoder(df_training_numerical)
encoded_df_training = encoder.encode_df(df_training_numerical)
print("------------------------- Encoded data --------------------------")
encoded_df_training.info()
encoded_df_training.head(50)
# Odd that the player indexing runs from 1 to 3 here. That said, it does not change the results.
# NOTE(review): in the displayed outputs, cells that were 0 in the counts frame
# show up as 3 after encoding (e.g. row 1, SingleMineral), which would collide
# with genuine counts of 3 — verify that ThreeFeaturesEncoder leaves the count
# columns untouched.

------------------------- Encoded data --------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2841 entries, 0 to 2840
Data columns (total 35 columns):
id_player        2841 non-null int64
played_race      2841 non-null int64
SingleMineral    2841 non-null int64
hotkey10         2841 non-null int64
hotkey12         2841 non-null int64
hotkey20         2841 non-null int64
hotkey22         2841 non-null int64
hotkey30         2841 non-null int64
hotkey32         2841 non-null int64
hotkey40         2841 non-null int64
hotkey42         2841 non-null int64
s                2841 non-null int64
Base             2841 non-null int64
hotkey50         2841 non-null int64
hotkey52         2841 non-null int64
hotkey60         2841 non-null int64
hotkey62         2841 non-null int64
hotkey70         2841 non-null int64
hotkey72         2841 non-null int64
hotkey80         2841 non-null int64
hotkey82         2841 non-null int64
hotkey00         2841 non-null int64
hotkey02     

Unnamed: 0,id_player,played_race,SingleMineral,hotkey10,hotkey12,hotkey20,hotkey22,hotkey30,hotkey32,hotkey40,...,hotkey92,hotkey41,hotkey31,hotkey61,hotkey11,hotkey21,hotkey51,hotkey71,hotkey91,hotkey01
0,0,3,53,1,71,4,227,2,438,3,...,3,3,3,3,3,3,3,3,3,3
1,1,1,3,3,3,3,3,3,68,2,...,3,3,3,3,3,3,3,3,3,3
2,0,3,18,1,83,1,239,2,375,2,...,3,3,3,3,3,3,3,3,3,3
3,2,1,3,2,101,11,125,15,91,1,...,3,3,3,3,3,3,3,3,3,3
4,1,1,3,1,8,1,3,3,56,2,...,3,3,3,3,3,3,3,3,3,3
5,2,1,3,1,105,23,186,17,30,1,...,3,3,3,3,3,3,3,3,3,3
6,1,1,3,3,21,3,3,2,89,3,...,3,3,3,3,3,3,3,3,3,3
7,2,1,3,1,116,11,105,15,81,2,...,3,3,3,3,3,3,3,3,3,3
8,1,1,3,1,8,3,3,4,26,2,...,3,3,3,3,3,3,3,3,3,3
9,2,1,3,1,82,6,152,12,69,2,...,3,3,3,3,3,3,3,3,3,3


## Training and testing with k-fold

In [5]:
# Feature matrix: every column except the target. `id_player` is dropped by
# name rather than by slicing off the first column by position, so the label
# cannot end up among the features if the upstream column order ever changes.
X_training = encoded_df_training.drop("id_player", axis=1)
# Summary of the feature frame, then a preview of its first rows.
X_training.info()
X_training.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2841 entries, 0 to 2840
Data columns (total 34 columns):
played_race      2841 non-null int64
SingleMineral    2841 non-null int64
hotkey10         2841 non-null int64
hotkey12         2841 non-null int64
hotkey20         2841 non-null int64
hotkey22         2841 non-null int64
hotkey30         2841 non-null int64
hotkey32         2841 non-null int64
hotkey40         2841 non-null int64
hotkey42         2841 non-null int64
s                2841 non-null int64
Base             2841 non-null int64
hotkey50         2841 non-null int64
hotkey52         2841 non-null int64
hotkey60         2841 non-null int64
hotkey62         2841 non-null int64
hotkey70         2841 non-null int64
hotkey72         2841 non-null int64
hotkey80         2841 non-null int64
hotkey82         2841 non-null int64
hotkey00         2841 non-null int64
hotkey02         2841 non-null int64
hotkey81         2841 non-null int64
hotkey90         2841 non-null int64
hotke

Unnamed: 0,played_race,SingleMineral,hotkey10,hotkey12,hotkey20,hotkey22,hotkey30,hotkey32,hotkey40,hotkey42,...,hotkey92,hotkey41,hotkey31,hotkey61,hotkey11,hotkey21,hotkey51,hotkey71,hotkey91,hotkey01
0,3,53,1,71,4,227,2,438,3,216,...,3,3,3,3,3,3,3,3,3,3
1,1,3,3,3,3,3,3,68,2,108,...,3,3,3,3,3,3,3,3,3,3
2,3,18,1,83,1,239,2,375,2,207,...,3,3,3,3,3,3,3,3,3,3
3,1,3,2,101,11,125,15,91,1,7,...,3,3,3,3,3,3,3,3,3,3
4,1,3,1,8,1,3,3,56,2,144,...,3,3,3,3,3,3,3,3,3,3


In [6]:
# Target vector: the encoded player id of every row, taken as a NumPy array.
y_training = encoded_df_training["id_player"].values
print(y_training)

[  0   1   0 ... 199 196 199]


In [7]:
# Random forest evaluated by the cross-validation cell. `random_state` is
# pinned so that the forest's internal randomness, and therefore the reported
# scores, are reproducible under Restart & Run All.
model = RandomForestClassifier(n_estimators=256, max_depth=512, random_state=42)

In [8]:
k = 5  # number of cross-validation folds
# One score per fold, computed with the estimator's default scorer.
scores = cross_val_score(model, X_training, y_training, cv=k)
scores



array([0.8622291 , 0.89215686, 0.90989399, 0.91992551, 0.9125    ])