In [1]:
import csv
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection  import  train_test_split

# personal imports
import data_extractor as de
import accuracy_extractor as ae
from multi_labels_encoder import ThreeFeaturesEncoder
from train_n_test import TrainValidateTest

# Prepare train and test original datasets (+ labels)

In [2]:
# Extract dataset: train and validation and test (though validation and test are done independntly)
df_training = de.get_dataframe("../input/minitrain.CSV", training=True, limit_seconds=8) # OK
df_validation = de.get_dataframe("../input/minitest.CSV", training=True, limit_seconds=8) # OK
df_testing = de.get_dataframe("../input/TEST.CSV", training=False, limit_seconds=8)

# Encoding

In [3]:
# Learning dataset encoding
encoder = ThreeFeaturesEncoder(df_training)

# Encoding dataset : training, validation and testing
encoded_df_training = encoder.encode_df(df_training)
encoded_df_validation = encoder.encode_df(df_validation)
encoded_df_testing = encoder.encode_df(df_testing)

# Getting the datasets we're gonna work on

In [4]:
# Convert dataset into another one
df_training_numerical = de.transform_sample(encoded_df_training, True)
df_validation_numerical = de.transform_sample(encoded_df_validation, True)     # OK(see visualization)
df_testing_numerical =  de.transform_sample(encoded_df_testing, False)

# Conform validation and testing DataFrame columns to training DataFrame columns

In [5]:
df_validation_numerical = de.conform_test_to_training(df_training_numerical, df_validation_numerical)
df_validation_numerical = df_validation_numerical.fillna(0)

df_testing_numerical = de.conform_test_to_training(df_training_numerical, df_testing_numerical)
df_testing_numerical = df_testing_numerical.fillna(0)

# Train, validate and test with class 'TrainValidateTest' and already got datasets+encoder from above (so instead of the 3 parts below) 

In [17]:
tvt = TrainValidateTest(100, 10)
tvt.train(df_training_numerical)
tvt.validate(df_validation_numerical)
tvt.test(df_testing_numerical, encoder)

72.91066282420749


# Train

In [22]:
model = RandomForestClassifier(n_estimators=100, max_depth=10)
model.fit(df_training_numerical.iloc[:, 1:].values, df_training_numerical.id_player.values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Validate (doesn't have to do that before test)

In [23]:
nolabel_df_validation = df_validation_numerical.drop(axis=1, labels="id_player")
predicted = model.predict(nolabel_df_validation.values)
labels = df_validation_numerical.id_player.values
print(ae.get_accuracy(labels, predicted))

73.19884726224784


# Test

In [20]:
nolabel_df_testing = df_testing_numerical.drop(axis=1, labels="id_player")
predicted = model.predict(nolabel_df_testing.values)
decoded_predicted = encoder.decode_labels(predicted) # We decode the encoded predictions
indices = range(1, len(predicted) + 1)
output_df = pd.DataFrame({"RowId": indices, "prediction": decoded_predicted})
output_df.to_csv("test_labels.CSV", index=False)