In [1]:
import csv
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection  import  train_test_split
from sklearn.model_selection import cross_val_score

# personal imports
import data_extractor as de
import accuracy_extractor as ae
from multi_labels_encoder import ThreeFeaturesEncoder
from Train_n_test import TrainValidateTest

# Extract and preprocess

In [2]:
t = 3600

# Estimated best parameters!
n_estimators = 64
max_depth = 512

In [3]:
df_training = de.get_dataframe("../input/TRAIN.CSV", training=True, limit_seconds=t)
df_training.info()
df_training.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3052 entries, 0 to 3051
Columns: 8313 entries, id_player to 8310
dtypes: object(8313)
memory usage: 193.6+ MB


Unnamed: 0,id_player,played_race,0,1,2,3,4,5,6,7,...,8301,8302,8303,8304,8305,8306,8307,8308,8309,8310
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,s,s,Base,s,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,s,Base,s,s,Base,s,s,Base,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,Base,s,s,Base,s,s,s,Base,...,,,,,,,,,,
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,


In [4]:
df_training_numerical = de.transform_sample(df_training, True)
df_training_numerical.info()
df_training_numerical.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3052 entries, 0 to 3051
Data columns (total 35 columns):
id_player        3052 non-null object
played_race      3052 non-null object
Base             3052 non-null int64
SingleMineral    3052 non-null int64
hotkey00         3052 non-null int64
hotkey02         3052 non-null int64
hotkey10         3052 non-null int64
hotkey12         3052 non-null int64
hotkey20         3052 non-null int64
hotkey22         3052 non-null int64
hotkey30         3052 non-null int64
hotkey32         3052 non-null int64
hotkey40         3052 non-null int64
hotkey42         3052 non-null int64
hotkey50         3052 non-null int64
hotkey52         3052 non-null int64
hotkey60         3052 non-null int64
hotkey62         3052 non-null int64
hotkey70         3052 non-null int64
hotkey90         3052 non-null int64
s                3052 non-null int64
hotkey11         3052 non-null int64
hotkey80         3052 non-null int64
hotkey82         3052 non-null int64
hot

Unnamed: 0,id_player,played_race,Base,SingleMineral,hotkey00,hotkey02,hotkey10,hotkey12,hotkey20,hotkey22,...,hotkey72,hotkey21,hotkey41,hotkey61,hotkey31,hotkey01,hotkey51,hotkey71,hotkey81,hotkey91
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,66,5,5,41,63,350,37,185,...,0,0,0,0,0,0,0,0,0,0
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,12,0,2,83,65,447,15,215,...,0,0,0,0,0,0,0,0,0,0
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,47,3,3,18,17,130,9,126,...,0,0,0,0,0,0,0,0,0,0
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,21,0,2,38,32,259,14,70,...,0,0,0,0,0,0,0,0,0,0
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,2,0,1,0,10,52,6,38,...,0,0,0,0,0,0,0,0,0,0


In [5]:
encoder = ThreeFeaturesEncoder(df_training_numerical)
encoded_df_training = encoder.encode_df(df_training_numerical)
encoded_df_training.info()
encoded_df_training.head(50)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3052 entries, 0 to 3051
Data columns (total 35 columns):
id_player        3052 non-null int64
played_race      3052 non-null int64
Base             3052 non-null int64
SingleMineral    3052 non-null int64
hotkey00         3052 non-null int64
hotkey02         3052 non-null int64
hotkey10         3052 non-null int64
hotkey12         3052 non-null int64
hotkey20         3052 non-null int64
hotkey22         3052 non-null int64
hotkey30         3052 non-null int64
hotkey32         3052 non-null int64
hotkey40         3052 non-null int64
hotkey42         3052 non-null int64
hotkey50         3052 non-null int64
hotkey52         3052 non-null int64
hotkey60         3052 non-null int64
hotkey62         3052 non-null int64
hotkey70         3052 non-null int64
hotkey90         3052 non-null int64
s                3052 non-null int64
hotkey11         3052 non-null int64
hotkey80         3052 non-null int64
hotkey82         3052 non-null int64
hotke

Unnamed: 0,id_player,played_race,Base,SingleMineral,hotkey00,hotkey02,hotkey10,hotkey12,hotkey20,hotkey22,...,hotkey72,hotkey21,hotkey41,hotkey61,hotkey31,hotkey01,hotkey51,hotkey71,hotkey81,hotkey91
0,0,3,66,5,5,41,63,350,37,185,...,3,3,3,3,3,3,3,3,3,3
1,1,3,12,3,2,83,65,447,15,215,...,3,3,3,3,3,3,3,3,3,3
2,0,3,47,3,3,18,17,130,9,126,...,3,3,3,3,3,3,3,3,3,3
3,1,3,21,3,2,38,32,259,14,70,...,3,3,3,3,3,3,3,3,3,3
4,0,3,2,3,1,3,10,52,6,38,...,3,3,3,3,3,3,3,3,3,3
5,1,3,3,3,1,13,20,140,13,54,...,3,3,3,3,3,3,3,3,3,3
6,0,3,2,3,1,3,15,86,10,33,...,3,3,3,3,3,3,3,3,3,3
7,1,3,5,3,3,3,25,159,4,31,...,3,3,3,3,3,3,3,3,3,3
8,2,3,8,3,3,3,24,225,30,111,...,20,3,3,3,3,3,3,3,3,3
9,3,1,3,3,1,3,21,267,9,107,...,3,3,3,3,3,3,3,3,3,3


# Train

In [6]:
X_training = encoded_df_training.iloc[:, 1:]
X_training.info()
X_training.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3052 entries, 0 to 3051
Data columns (total 34 columns):
played_race      3052 non-null int64
Base             3052 non-null int64
SingleMineral    3052 non-null int64
hotkey00         3052 non-null int64
hotkey02         3052 non-null int64
hotkey10         3052 non-null int64
hotkey12         3052 non-null int64
hotkey20         3052 non-null int64
hotkey22         3052 non-null int64
hotkey30         3052 non-null int64
hotkey32         3052 non-null int64
hotkey40         3052 non-null int64
hotkey42         3052 non-null int64
hotkey50         3052 non-null int64
hotkey52         3052 non-null int64
hotkey60         3052 non-null int64
hotkey62         3052 non-null int64
hotkey70         3052 non-null int64
hotkey90         3052 non-null int64
s                3052 non-null int64
hotkey11         3052 non-null int64
hotkey80         3052 non-null int64
hotkey82         3052 non-null int64
hotkey92         3052 non-null int64
hotke

Unnamed: 0,played_race,Base,SingleMineral,hotkey00,hotkey02,hotkey10,hotkey12,hotkey20,hotkey22,hotkey30,...,hotkey72,hotkey21,hotkey41,hotkey61,hotkey31,hotkey01,hotkey51,hotkey71,hotkey81,hotkey91
0,3,66,5,5,41,63,350,37,185,1,...,3,3,3,3,3,3,3,3,3,3
1,3,12,3,2,83,65,447,15,215,7,...,3,3,3,3,3,3,3,3,3,3
2,3,47,3,3,18,17,130,9,126,1,...,3,3,3,3,3,3,3,3,3,3
3,3,21,3,2,38,32,259,14,70,6,...,3,3,3,3,3,3,3,3,3,3
4,3,2,3,1,3,10,52,6,38,1,...,3,3,3,3,3,3,3,3,3,3


In [7]:
y_training = encoded_df_training.id_player.values
print(y_training)

[  0   1   0 ... 199 196 199]


In [8]:
model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
model.fit(X_training, y_training)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=512, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=64, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# K-fold to see if good results

In [9]:
k = 5  # Warning => 5
scores = cross_val_score(model, X_training, y=y_training, cv=k)
print("scores:", scores)
print("mean of scores:", np.mean(scores))



scores: [0.85238784 0.86412214 0.87272727 0.87152778 0.8952381 ]
mean of scores: 0.8712006253705002


# What are the relevant features ?

In [10]:
relevances = pd.DataFrame({"feature":X_training.columns, 
                          "weight":model.feature_importances_})
relevances.head()

Unnamed: 0,feature,weight
0,played_race,0.027042
1,Base,0.046188
2,SingleMineral,0.025544
3,hotkey00,0.024246
4,hotkey02,0.03846


In [11]:
print("With t=", t, ",n_estimators=", n_estimators, "and max_depth=",max_depth )
relevances.sort_values(by='weight', ascending=False)

With t= 3600 ,n_estimators= 64 and max_depth= 512


Unnamed: 0,feature,weight
14,hotkey52,0.066478
12,hotkey42,0.064261
10,hotkey32,0.062319
16,hotkey62,0.053364
8,hotkey22,0.04935
1,Base,0.046188
5,hotkey10,0.045235
6,hotkey12,0.044433
9,hotkey30,0.043784
7,hotkey20,0.040772
