In [1]:
import csv
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection  import  train_test_split

# personal imports
import data_extractor as de
import accuracy_extractor as ae
from multi_labels_encoder import ThreeFeaturesEncoder
from Train_n_test import TrainValidateTest

# Prepare train and test original datasets (+ labels)

In [2]:
# Extract dataset: train and validation and test (though validation and test are done independntly)
df_training = de.get_dataframe("../input/TRAIN.csv", training=True, limit_seconds=8) # OK
df_validation = de.get_dataframe("../input/minitest.csv", training=True, limit_seconds=8) # OK
df_testing = de.get_dataframe("../input/TEST.CSV", training=False, limit_seconds=8)

In [3]:
df_training.info()
df_training.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3052 entries, 0 to 3051
Data columns (total 69 columns):
id_player      3052 non-null object
played_race    3044 non-null object
0              3044 non-null object
1              3044 non-null object
2              3030 non-null object
3              2954 non-null object
4              2909 non-null object
5              2852 non-null object
6              2804 non-null object
7              2723 non-null object
8              2646 non-null object
9              2578 non-null object
10             2484 non-null object
11             2375 non-null object
12             2281 non-null object
13             2184 non-null object
14             2059 non-null object
15             1932 non-null object
16             1830 non-null object
17             1732 non-null object
18             1630 non-null object
19             1530 non-null object
20             1441 non-null object
21             1347 non-null object
22             1285 non-null 

Unnamed: 0,id_player,played_race,0,1,2,3,4,5,6,7,...,57,58,59,60,61,62,63,64,65,66
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,s,s,Base,s,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,s,Base,s,s,Base,s,s,Base,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,Base,s,s,Base,s,s,s,Base,...,,,,,,,,,,
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,


# Getting the datasets we're gonna work on ("counts dataset")

In [4]:
# Convert dataset into another one
df_training_numerical = de.transform_sample(df_training, True)
df_validation_numerical = de.transform_sample(df_validation, True)     # OK(see visualization)
df_testing_numerical =  de.transform_sample(df_testing, False)

In [5]:
df_training_numerical.info()
df_training_numerical.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3052 entries, 0 to 3051
Data columns (total 35 columns):
id_player        3052 non-null object
played_race      3052 non-null object
Base             3052 non-null int64
hotkey00         3052 non-null int64
hotkey30         3052 non-null int64
s                3052 non-null int64
hotkey32         3052 non-null int64
hotkey40         3052 non-null int64
hotkey42         3052 non-null int64
hotkey02         3052 non-null int64
hotkey10         3052 non-null int64
hotkey60         3052 non-null int64
hotkey62         3052 non-null int64
hotkey90         3052 non-null int64
hotkey20         3052 non-null int64
hotkey22         3052 non-null int64
hotkey50         3052 non-null int64
hotkey70         3052 non-null int64
hotkey80         3052 non-null int64
hotkey12         3052 non-null int64
SingleMineral    3052 non-null int64
hotkey52         3052 non-null int64
hotkey11         3052 non-null int64
hotkey21         3052 non-null int64
hot

Unnamed: 0,id_player,played_race,Base,hotkey00,hotkey30,s,hotkey32,hotkey40,hotkey42,hotkey02,...,hotkey41,hotkey51,hotkey61,hotkey71,hotkey82,hotkey81,hotkey91,hotkey92,hotkey01,hotkey72
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,2,1,1,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,3,0,1,11,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,2,1,1,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,3,0,1,8,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,2,1,1,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df_testing_numerical.info()
df_testing_numerical.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 340 entries, 0 to 339
Data columns (total 34 columns):
played_race      340 non-null object
hotkey00         340 non-null int64
hotkey02         340 non-null int64
hotkey10         340 non-null int64
hotkey60         340 non-null int64
hotkey62         340 non-null int64
hotkey90         340 non-null int64
s                340 non-null int64
hotkey30         340 non-null int64
hotkey32         340 non-null int64
Base             340 non-null int64
hotkey20         340 non-null int64
hotkey22         340 non-null int64
hotkey40         340 non-null int64
hotkey70         340 non-null int64
hotkey80         340 non-null int64
hotkey50         340 non-null int64
hotkey12         340 non-null int64
hotkey61         340 non-null int64
SingleMineral    340 non-null int64
hotkey42         340 non-null int64
hotkey52         340 non-null int64
hotkey92         340 non-null int64
hotkey01         340 non-null int64
hotkey41         340 non-null 

Unnamed: 0,played_race,hotkey00,hotkey02,hotkey10,hotkey60,hotkey62,hotkey90,s,hotkey30,hotkey32,...,hotkey41,hotkey51,hotkey71,hotkey81,hotkey91,hotkey21,hotkey72,hotkey82,hotkey11,hotkey31
0,Zerg,1,12,1,1,12,1,13,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Terran,1,0,0,0,0,0,13,1,12,...,0,0,0,0,0,0,0,0,0,0
2,Protoss,0,0,0,1,8,0,9,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Zerg,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Protoss,0,0,0,0,0,0,4,1,0,...,0,0,0,0,0,0,0,0,0,0


# Encoding

In [7]:
# Learning dataset encoding
encoder = ThreeFeaturesEncoder(df_training)

# Encoding dataset : training, validation and testing
encoded_df_training = encoder.encode_df(df_training_numerical)
encoded_df_validation = encoder.encode_df(df_validation_numerical)
encoded_df_testing = encoder.encode_df(df_testing_numerical, False)

In [8]:
encoded_df_training.info()
encoded_df_training.head(50)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3052 entries, 0 to 3051
Data columns (total 35 columns):
id_player        3052 non-null int64
played_race      3052 non-null int64
Base             3052 non-null int64
hotkey00         3052 non-null int64
hotkey30         3052 non-null int64
s                3052 non-null int64
hotkey32         3052 non-null int64
hotkey40         3052 non-null int64
hotkey42         3052 non-null int64
hotkey02         3052 non-null int64
hotkey10         3052 non-null int64
hotkey60         3052 non-null int64
hotkey62         3052 non-null int64
hotkey90         3052 non-null int64
hotkey20         3052 non-null int64
hotkey22         3052 non-null int64
hotkey50         3052 non-null int64
hotkey70         3052 non-null int64
hotkey80         3052 non-null int64
hotkey12         3052 non-null int64
SingleMineral    3052 non-null int64
hotkey52         3052 non-null int64
hotkey11         3052 non-null int64
hotkey21         3052 non-null int64
hotke

Unnamed: 0,id_player,played_race,Base,hotkey00,hotkey30,s,hotkey32,hotkey40,hotkey42,hotkey02,...,hotkey41,hotkey51,hotkey61,hotkey71,hotkey82,hotkey81,hotkey91,hotkey92,hotkey01,hotkey72
0,0,0,2,1,1,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,3,0,1,11,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,2,1,1,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,3,0,1,8,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,2,1,1,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,2,0,1,10,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,2,1,1,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,4,0,1,13,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3,1,0,1,1,17,11,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
encoded_df_testing.info()
encoded_df_testing.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 340 entries, 0 to 339
Data columns (total 34 columns):
played_race      340 non-null int64
hotkey00         340 non-null int64
hotkey02         340 non-null int64
hotkey10         340 non-null int64
hotkey60         340 non-null int64
hotkey62         340 non-null int64
hotkey90         340 non-null int64
s                340 non-null int64
hotkey30         340 non-null int64
hotkey32         340 non-null int64
Base             340 non-null int64
hotkey20         340 non-null int64
hotkey22         340 non-null int64
hotkey40         340 non-null int64
hotkey70         340 non-null int64
hotkey80         340 non-null int64
hotkey50         340 non-null int64
hotkey12         340 non-null int64
hotkey61         340 non-null int64
SingleMineral    340 non-null int64
hotkey42         340 non-null int64
hotkey52         340 non-null int64
hotkey92         340 non-null int64
hotkey01         340 non-null int64
hotkey41         340 non-null i

Unnamed: 0,played_race,hotkey00,hotkey02,hotkey10,hotkey60,hotkey62,hotkey90,s,hotkey30,hotkey32,...,hotkey41,hotkey51,hotkey71,hotkey81,hotkey91,hotkey21,hotkey72,hotkey82,hotkey11,hotkey31
0,2,1,12,1,1,12,1,13,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,13,1,12,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,8,0,9,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,4,1,0,...,0,0,0,0,0,0,0,0,0,0


# Conform validation and testing DataFrame columns to training DataFrame columns

In [10]:
encoded_df_validation_conf = de.conform_test_to_training(encoded_df_training, encoded_df_validation)
encoded_df_validation_conf = encoded_df_validation_conf.fillna(0)

encoded_df_testing_conf = de.conform_test_to_training(encoded_df_training, encoded_df_testing, False)
encoded_df_testing_conf = encoded_df_testing_conf.fillna(0)
print(encoded_df_testing_conf.head())

   played_race  Base  hotkey00  hotkey30   s  hotkey32  hotkey40  hotkey42  \
0            2     0         1         0  13         0         0         0   
1            1     0         1         1  13        12         0         0   
2            0     2         0         1   9         0         1         0   
3            2     0         0         0   5         0         1         0   
4            0     2         0         1   4         0         1         0   

   hotkey02  hotkey10    ...     hotkey41  hotkey51  hotkey61  hotkey71  \
0        12         1    ...            0         0         0         0   
1         0         0    ...            0         0         0         0   
2         0         0    ...            0         0         0         0   
3         0         0    ...            0         0         0         0   
4         0         0    ...            0         0         0         0   

   hotkey82  hotkey81  hotkey91  hotkey92  hotkey01  hotkey72  
0         0     

In [11]:
encoded_df_training.info()
encoded_df_training.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3052 entries, 0 to 3051
Data columns (total 35 columns):
id_player        3052 non-null int64
played_race      3052 non-null int64
Base             3052 non-null int64
hotkey00         3052 non-null int64
hotkey30         3052 non-null int64
s                3052 non-null int64
hotkey32         3052 non-null int64
hotkey40         3052 non-null int64
hotkey42         3052 non-null int64
hotkey02         3052 non-null int64
hotkey10         3052 non-null int64
hotkey60         3052 non-null int64
hotkey62         3052 non-null int64
hotkey90         3052 non-null int64
hotkey20         3052 non-null int64
hotkey22         3052 non-null int64
hotkey50         3052 non-null int64
hotkey70         3052 non-null int64
hotkey80         3052 non-null int64
hotkey12         3052 non-null int64
SingleMineral    3052 non-null int64
hotkey52         3052 non-null int64
hotkey11         3052 non-null int64
hotkey21         3052 non-null int64
hotke

Unnamed: 0,id_player,played_race,Base,hotkey00,hotkey30,s,hotkey32,hotkey40,hotkey42,hotkey02,...,hotkey41,hotkey51,hotkey61,hotkey71,hotkey82,hotkey81,hotkey91,hotkey92,hotkey01,hotkey72
0,0,0,2,1,1,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,3,0,1,11,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,2,1,1,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,3,0,1,8,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,2,1,1,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
encoded_df_testing_conf.info()
encoded_df_testing_conf.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 340 entries, 0 to 339
Data columns (total 34 columns):
played_race      340 non-null int64
Base             340 non-null int64
hotkey00         340 non-null int64
hotkey30         340 non-null int64
s                340 non-null int64
hotkey32         340 non-null int64
hotkey40         340 non-null int64
hotkey42         340 non-null int64
hotkey02         340 non-null int64
hotkey10         340 non-null int64
hotkey60         340 non-null int64
hotkey62         340 non-null int64
hotkey90         340 non-null int64
hotkey20         340 non-null int64
hotkey22         340 non-null int64
hotkey50         340 non-null int64
hotkey70         340 non-null int64
hotkey80         340 non-null int64
hotkey12         340 non-null int64
SingleMineral    340 non-null int64
hotkey52         340 non-null int64
hotkey11         340 non-null int64
hotkey21         340 non-null int64
hotkey31         340 non-null int64
hotkey41         340 non-null i

Unnamed: 0,played_race,Base,hotkey00,hotkey30,s,hotkey32,hotkey40,hotkey42,hotkey02,hotkey10,...,hotkey41,hotkey51,hotkey61,hotkey71,hotkey82,hotkey81,hotkey91,hotkey92,hotkey01,hotkey72
0,2,0,1,0,13,0,0,0,12,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,13,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,0,1,9,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,2,0,1,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Train, validate and test with class 'TrainValidateTest' and already got datasets+encoder from above (so instead of the 3 parts below) 

In [13]:
tvt = TrainValidateTest(100, 10)
tvt.train(encoded_df_training)
tvt.validate(encoded_df_validation_conf)
tvt.test(encoded_df_testing_conf, encoder)

# Train

In [14]:
model = RandomForestClassifier(n_estimators=100, max_depth=10)
model.fit(encoded_df_training.iloc[:, 1:].values, encoded_df_training.id_player.values)
print("Recall features:, ", encoded_df_training.columns)
print("Feature relevances:", model.feature_importances_)

Recall features:,  Index(['id_player', 'played_race', 'Base', 'hotkey00', 'hotkey30', 's',
       'hotkey32', 'hotkey40', 'hotkey42', 'hotkey02', 'hotkey10', 'hotkey60',
       'hotkey62', 'hotkey90', 'hotkey20', 'hotkey22', 'hotkey50', 'hotkey70',
       'hotkey80', 'hotkey12', 'SingleMineral', 'hotkey52', 'hotkey11',
       'hotkey21', 'hotkey31', 'hotkey41', 'hotkey51', 'hotkey61', 'hotkey71',
       'hotkey82', 'hotkey81', 'hotkey91', 'hotkey92', 'hotkey01', 'hotkey72'],
      dtype='object')
Feature relevances: [0.07484539 0.0579948  0.0415314  0.04997344 0.08122103 0.05324084
 0.04868711 0.04872638 0.01086916 0.05390341 0.03440526 0.02255146
 0.03877519 0.05858977 0.04828792 0.04602584 0.02890096 0.0318136
 0.04970898 0.01791108 0.03863494 0.00384866 0.00457821 0.00422194
 0.00399152 0.00269469 0.00300379 0.00162495 0.002134   0.00223989
 0.0146424  0.00756332 0.01155908 0.00129959]


# Validate (doesn't have to do that before test)

In [15]:
nolabel_df_validation = encoded_df_validation_conf.drop(axis=1, labels="id_player")
predicted = model.predict(nolabel_df_validation.values)
labels = encoded_df_validation_conf.id_player.values
print(ae.get_accuracy(labels, predicted))

89.04899135446685


# Test

In [16]:
predicted = model.predict(encoded_df_testing_conf.values)
decoded_predicted = encoder.decode_labels(predicted) # We decode the encoded predictions
indices = range(1, len(predicted) + 1)
output_df = pd.DataFrame({"RowId": indices, "prediction": decoded_predicted})
output_df.to_csv("test_labels.CSV", index=False)