In [13]:
import csv
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection  import  train_test_split
from sklearn.model_selection import cross_val_score

# personal imports
import data_extractor as de
import accuracy_extractor as ae
from multi_labels_encoder import ThreeFeaturesEncoder
from Train_n_test import TrainValidateTest

## Extraction and preprocessing

In [7]:
# Single source of truth for the time cut-off handed to the extractor.
# NOTE(review): presumably the number of seconds of each game that are kept -- confirm in data_extractor.
limit_seconds = 320
# Pass the variable (not a second literal) so changing the cut-off above actually takes effect.
df_training = de.get_dataframe(
    "../input/minitrain.CSV", training=True, limit_seconds=limit_seconds
)
print("------------------------- Raw data --------------------------")
# Summarise shape/dtypes, then show the first rows as the cell output.
df_training.info()
df_training.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2841 entries, 0 to 2840
Columns: 1876 entries, id_player to 1873
dtypes: object(1876)
memory usage: 40.7+ MB


Unnamed: 0,id_player,played_race,0,1,2,3,4,5,6,7,...,1864,1865,1866,1867,1868,1869,1870,1871,1872,1873
0,http://eu.battle.net/sc2/en/profile/1143713/1/...,Terran,s,hotkey30,s,hotkey32,s,hotkey32,s,s,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3368730/1/...,Protoss,Base,s,s,s,s,hotkey30,Base,s,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/1143713/1/...,Terran,s,hotkey30,s,hotkey32,SingleMineral,s,s,hotkey32,...,,,,,,,,,,
3,http://eu.battle.net/sc2/en/profile/3973341/1/...,Protoss,Base,s,s,Base,s,hotkey20,hotkey30,hotkey60,...,,,,,,,,,,
4,http://eu.battle.net/sc2/en/profile/3368730/1/...,Protoss,Base,s,s,Base,s,s,s,hotkey30,...,,,,,,,,,,


In [5]:
# Build the encoder from the raw training frame and use it to encode that same frame.
# NOTE(review): presumably maps each categorical value to an integer code -- confirm in multi_labels_encoder.
encoder = ThreeFeaturesEncoder(df_training)
encoded_df_training = encoder.encode_df(df_training)
print("------------------------- Encoded data --------------------------")
# Summarise shape/dtypes, then show the first rows as the cell output.
encoded_df_training.info()
encoded_df_training.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2841 entries, 0 to 2840
Columns: 1876 entries, id_player to 1873
dtypes: int64(1876)
memory usage: 40.7 MB


Unnamed: 0,id_player,played_race,0,1,2,3,4,5,6,7,...,1864,1865,1866,1867,1868,1869,1870,1871,1872,1873
0,0,0,32,20,32,12,32,12,32,32,...,3,3,3,3,3,3,3,3,3,3
1,1,1,13,32,32,32,32,20,13,32,...,3,3,3,3,3,3,3,3,3,3
2,0,0,32,20,32,12,9,32,32,12,...,3,3,3,3,3,3,3,3,3,3
3,2,1,13,32,32,13,32,1,20,19,...,3,3,3,3,3,3,3,3,3,3
4,1,1,13,32,32,13,32,32,32,20,...,3,3,3,3,3,3,3,3,3,3


In [6]:
# Run the encoded frame through the project's transform helper; the second positional argument is True.
# NOTE(review): presumably that flag marks training data and the result holds per-code counts -- confirm in data_extractor.
df_training_numerical = de.transform_sample(encoded_df_training, True)
print("------------------------- (Encoded) Data in the counts version --------------------------")
# Summarise shape/dtypes, then show the first rows as the cell output.
df_training_numerical.info()
df_training_numerical.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2841 entries, 0 to 2840
Data columns (total 35 columns):
id_player      2841 non-null int64
played_race    2841 non-null int64
1              2841 non-null float64
3              2841 non-null float64
4              2841 non-null float64
9              2841 non-null float64
12             2841 non-null float64
16             2841 non-null float64
20             2841 non-null float64
22             2841 non-null float64
24             2841 non-null float64
28             2841 non-null float64
32             2841 non-null float64
10             2841 non-null float64
13             2841 non-null float64
15             2841 non-null float64
0              2841 non-null float64
6              2841 non-null float64
7              2841 non-null float64
19             2841 non-null float64
26             2841 non-null float64
27             2841 non-null float64
5              2841 non-null float64
29             2841 non-null float64
2        

Unnamed: 0,id_player,played_race,1,3,4,9,12,16,20,22,...,11,25,21,17,8,14,18,30,31,23
0,0,0,4.0,493.0,71.0,53.0,438.0,1.0,2.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,0.0,1376.0,0.0,0.0,68.0,0.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,1.0,788.0,83.0,18.0,375.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,1,11.0,866.0,101.0,0.0,91.0,2.0,15.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1,1.0,1239.0,8.0,0.0,56.0,1.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Training and testing with k-fold

In [8]:
# Feature matrix: every column except the label column `id_player`.
# Dropping the label by name (instead of by position with iloc[:, 1:]) keeps this
# correct even if the column order produced upstream changes.
X_training = df_training_numerical.drop("id_player", axis=1)
# Summarise shape/dtypes, then show the first rows as the cell output.
X_training.info()
X_training.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2841 entries, 0 to 2840
Data columns (total 34 columns):
played_race    2841 non-null int64
1              2841 non-null float64
3              2841 non-null float64
4              2841 non-null float64
9              2841 non-null float64
12             2841 non-null float64
16             2841 non-null float64
20             2841 non-null float64
22             2841 non-null float64
24             2841 non-null float64
28             2841 non-null float64
32             2841 non-null float64
10             2841 non-null float64
13             2841 non-null float64
15             2841 non-null float64
0              2841 non-null float64
6              2841 non-null float64
7              2841 non-null float64
19             2841 non-null float64
26             2841 non-null float64
27             2841 non-null float64
5              2841 non-null float64
29             2841 non-null float64
2              2841 non-null float64
11     

Unnamed: 0,played_race,1,3,4,9,12,16,20,22,24,...,11,25,21,17,8,14,18,30,31,23
0,0,4.0,493.0,71.0,53.0,438.0,1.0,2.0,3.0,227.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,1376.0,0.0,0.0,68.0,0.0,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1.0,788.0,83.0,18.0,375.0,1.0,2.0,2.0,239.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,11.0,866.0,101.0,0.0,91.0,2.0,15.0,1.0,125.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1.0,1239.0,8.0,0.0,56.0,1.0,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Target vector: the `id_player` column as a plain NumPy array.
# Bracket access is used so the column lookup cannot be confused with a DataFrame attribute.
y_training = df_training_numerical["id_player"].values
print(y_training)

[  0   1   0 ... 199 196 199]


In [15]:
# Seed the forest so that bootstrap sampling and per-split feature sub-sampling --
# and therefore the cross-validation scores below -- are reproducible across kernel restarts.
model = RandomForestClassifier(n_estimators=256, max_depth=512, random_state=42)

In [16]:
# Number of cross-validation folds.
k = 5
# One accuracy score per fold; with an integer `cv` and a classifier,
# scikit-learn uses stratified folds.
scores = cross_val_score(model, X_training, y_training, cv=k)
scores



array([0.87616099, 0.89052288, 0.91872792, 0.9255121 , 0.9125    ])

# Mini-parameter tuning
As with our fixed cross-validation, we take some candidate values for the parameters (t, n_estimators, max_depth) and evaluate each of them.

**TODO**