In [1]:
import csv
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection  import  train_test_split

# personal imports
import data_extractor as de
import accuracy_extractor as ae
from multi_labels_encoder import ThreeFeaturesEncoder
from train_n_test import TrainValidateTest

# Prepare train and test original datasets (+ labels)

In [2]:
# Load the three raw datasets, all with limit_seconds=8. The training and
# validation files are both read with training=True; the final test file is
# read with training=False. Validation and testing are done independently.
df_training, df_validation, df_testing = [
    de.get_dataframe(csv_path, training=is_training, limit_seconds=8)
    for csv_path, is_training in (
        ("../input/minitrain.CSV", True),
        ("../input/minitest.CSV", True),
        ("../input/TEST.CSV", False),
    )
]

# Encoding

In [3]:
# Learning dataset encoding: the encoder is built from the training DataFrame only
encoder = ThreeFeaturesEncoder(df_training)

# Encoding dataset : training, validation and testing
# The same encoder instance is applied to all three DataFrames.
encoded_df_training = encoder.encode_df(df_training)
encoded_df_validation = encoder.encode_df(df_validation)
# NOTE(review): the extra positional False is passed for the testing set only —
# presumably because it carries no labels to encode; confirm against
# ThreeFeaturesEncoder.encode_df.
encoded_df_testing = encoder.encode_df(df_testing, False)

# Build the numerical datasets we are going to work on

In [4]:
# Run every encoded DataFrame through de.transform_sample. The boolean flag is
# True for the training and validation sets and False for the testing set.
df_training_numerical, df_validation_numerical, df_testing_numerical = [
    de.transform_sample(encoded_df, flag)
    for encoded_df, flag in (
        (encoded_df_training, True),
        (encoded_df_validation, True),
        (encoded_df_testing, False),
    )
]

# Conform validation and testing DataFrame columns to training DataFrame columns

In [5]:
# Conform the validation columns to the training columns, then replace any
# NaN with 0.
df_validation_numerical = de.conform_test_to_training(
    df_training_numerical, df_validation_numerical
).fillna(0)

# Same treatment for the testing set, which passes an extra False to the helper.
df_testing_numerical = de.conform_test_to_training(
    df_training_numerical, df_testing_numerical, False
).fillna(0)

# Display the first rows of the conformed testing set.
df_testing_numerical.head()

train: True
intersection: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 32, 'played_race', 'id_player'}
extra: {14, 15, 17, 24, 25}
extra after: {14, 15, 17, 24, 25}
testordered_train_cols: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 32, 'played_race', 'id_player', 14, 15, 17, 24, 25]
train: False
intersection: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 'played_race'}
extra: {'id_player'}
extra after: set()
testordered_train_cols: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 'played_race']


Unnamed: 0,played_race,3,5,11,16,26,10,19,27,2,...,17,28,24,1,15,20,30,25,12,18
0,2.0,25.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,38.0,0.0,1.0,12.0,13.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,37.0,0.0,1.0,0.0,9.0,0.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,58.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,56.0,0.0,1.0,0.0,4.0,0.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train, validate and test with the `TrainValidateTest` class, reusing the datasets and encoder prepared above (an alternative to the three manual sections below)

In [6]:
# All-in-one run using the project's TrainValidateTest helper: train on the
# training set, validate on the validation set, then run on the testing set.
# NOTE(review): 100 and 10 presumably are n_estimators and max_depth, matching
# the manual RandomForestClassifier further down — confirm against train_n_test.
tvt = TrainValidateTest(100, 10)
tvt.train(df_training_numerical)
tvt.validate(df_validation_numerical)
# The encoder is passed along with the testing set (the manual test section
# uses it to decode predicted labels; presumably the same here — verify).
tvt.test(df_testing_numerical, encoder)

accuracy: 71.46974063400576


# Train

In [7]:
# Split the training frame into features and labels by column NAME, the same
# way the validation step does, instead of assuming that id_player is the
# first column (a positional slice would leak the label into the features and
# drop a real feature if the column order ever changed).
training_features = df_training_numerical.drop("id_player", axis=1)
training_labels = df_training_numerical["id_player"]

# Random forest with 100 trees of depth at most 10.
# NOTE(review): no random_state is set, so the fitted model (and the accuracy
# measured afterwards) varies from run to run.
model = RandomForestClassifier(n_estimators=100, max_depth=10)
model.fit(training_features.values, training_labels.values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Validate (optional: this step is not required before testing)

In [8]:
# Score the fitted model on the validation set: separate the id_player labels
# from the features, predict, and print what ae.get_accuracy returns.
labels = df_validation_numerical["id_player"].values
nolabel_df_validation = df_validation_numerical.drop("id_player", axis=1)
predicted = model.predict(nolabel_df_validation.values)
print(ae.get_accuracy(labels, predicted))

68.58789625360231


# Test

In [9]:
# Predict on the testing features, then decode the encoded predictions.
predicted = model.predict(df_testing_numerical.values)
decoded_predicted = encoder.decode_labels(predicted)

# Write the output CSV: RowId counts from 1, one row per prediction.
indices = range(1, 1 + len(predicted))
output_df = pd.DataFrame(
    dict(RowId=indices, prediction=decoded_predicted),
)
output_df.to_csv("test_labels.CSV", index=False)