In [27]:
import csv
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection  import  train_test_split

# personal imports
import data_extractor
from encoder_categorical_numerical import Encoder_Categorical_Numerical
import accuracy_extractor

In [28]:
validation = True
index_beginning_actions = 4
upper_limit_seconds = 150
lower_limit_seconds = 5

training_file = "./input/TRAIN.CSV" if not validation else "./input/minitrain.csv"
validation_file = "./input/minitest.csv"
#validation_file
testing_file = "./input/TEST.CSV" if not validation else "./input/minitest.csv"

### Calculer le temps moyen d'une partie

In [29]:
def __ceil_to_five__(seconds):
    """Approximates the number of seconds to the upper multiple of 5.
    :param seconds: number of seconds to approximate
    :return: approxilated seconds
    """
    modulo = seconds % 5
    if modulo == 0:
        return seconds
    else:
        return seconds - modulo + 5

In [30]:
def __remove_tX__(row):
    """Get a new list of words with words of type tX (X a number) removed
    :param myrow: a list of words"""
    tX = re.compile("t\d")
    new_row = []
    for cell in row:
        if tX.match(cell):
            pass
        else:
            new_row.append(cell)
    return new_row

In [31]:
def __get_maximum_time_match__(row, default_time=0):
    """Get the approximate time of a match
    :param myrow: one match"""
    tX = re.compile("t\d")
    for cell in row[::-1]:
        if tX.match(cell):
            return int(cell[1:])
    return default_time

In [32]:
a = ["t5", "a","b","c", "t18","a","b","c", "t150", "a","b"]
__get_maximum_time_match__(a, 0)

150

### Calculer le temps moyen entre deux actions

In [33]:
def __get_speed__(row, max_time):
    tX = re.compile("t\d")
    nb_actions = len(row)
    for cell in row:
        if tX.match(cell):
            nb_actions = nb_actions-1
    return max_time/nb_actions

In [34]:
a = ["t5", "a","b","c", "t18","a","b","c", "t150", "a","b"]
__get_speed__(a, 150)

18.75

### Connaître toutes les actions possibles

In [35]:
def __recognise_possible_actions__(set_of_actions, row):
    for cell in row[index_beginning_actions:]:
        set_of_actions.add(cell)
    return set_of_actions

# Get dataframe

In [36]:
def get_dataframe(path_file, training, upper_limit_seconds, lower_limit_seconds):
    """
    :param path_file:
    :param training: boolean
    :param limit_seconds: int
    :return: A DataFrame with columns named id_player, played_race, 0... n, with n the number of kept actions.
    """
    upper_limit_seconds = __ceil_to_five__(upper_limit_seconds)
    upper_stop_word = "t" + str(upper_limit_seconds)
    lower_limit_seconds = __ceil_to_five__(lower_limit_seconds)
    lower_stop_word = "t" + str(lower_limit_seconds)
    
    extracted = []
    largest_column_count = 0
    possible_actions = set()
    index_new_feature = 2 if training else 1
    lower_column_to_keep = []

    # Loop the data lines
    with open(path_file) as csvfile:
        spamreader = csv.reader(csvfile, delimiter='\n')
        for row in spamreader:
            myrow = row[0].split(',')
            try:
                upper_stop_index = myrow.index(upper_stop_word)
            except:
                upper_stop_index = -1
            try:
                lower_stop_index = myrow.index(lower_stop_word)
            except:
                lower_stop_index = -1
            #compute the time of a match and the average speed between two actions
            time_match = __get_maximum_time_match__(myrow)
            average_speed_match = __get_speed__(myrow, time_match)
            #compute the number of columns to keep for ordered actions
            lower_column_to_keep.append(len(__remove_tX__(myrow[index_new_feature:lower_stop_index]))-1)
            #delete useless tX
            myrow = __remove_tX__(myrow[0:upper_stop_index])
            #know possible set of actions
            possible_actions = __recognise_possible_actions__(possible_actions, myrow)
            #compute the average speed between two actions for a specifical time lapse
            #average_speed_limited_time = __get_speed__(myrow, limit_seconds)
            #insert time_match and average_speed_match as features
            myrow.insert(index_new_feature, time_match)
            myrow.insert(index_new_feature+1, average_speed_match)         
            #myrow.insert(index_time_match+2, average_speed_limited_time)
            #count the number of columns
            largest_column_count = max(len(myrow), largest_column_count)
            extracted.append(myrow)
    column_names = []
    if training:
        column_names = ["id_player", "played_race", "time_match", "average_speed_match"] + [(str(i)+"th_action") for i in range(0, largest_column_count - index_beginning_actions)]
    else:
        column_names = ["played_race", "time_match", "average_speed_match"] + [(str(i)+"th_action") for i in range(0, largest_column_count - index_beginning_actions+1)]
    return pd.DataFrame(extracted, columns = column_names), possible_actions, lower_column_to_keep

In [37]:
df_training, possible_actions_training, lower_column_to_keep_training = get_dataframe(training_file, training=True, upper_limit_seconds=upper_limit_seconds, lower_limit_seconds=lower_limit_seconds)
df_testing, possible_actions_testing, lower_column_to_keep_testing = get_dataframe(testing_file, training=validation, upper_limit_seconds=upper_limit_seconds, lower_limit_seconds=lower_limit_seconds)
possible_actions = list(possible_actions_training.union(possible_actions_testing))
index_column_to_delete = max((max(lower_column_to_keep_training)+index_beginning_actions), (max(lower_column_to_keep_testing)+index_beginning_actions-1))+1

In [38]:
df_training.head()
df_testing.head()

Unnamed: 0,id_player,played_race,time_match,average_speed_match,0th_action,1th_action,2th_action,3th_action,4th_action,5th_action,...,1024th_action,1025th_action,1026th_action,1027th_action,1028th_action,1029th_action,1030th_action,1031th_action,1032th_action,1033th_action
0,http://eu.battle.net/sc2/en/profile/250458/1/V...,Zerg,880,0.466102,s,s,s,s,hotkey20,s,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3538115/1/...,Zerg,1130,0.261635,s,s,s,hotkey10,s,hotkey60,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/250458/1/V...,Zerg,1125,0.465068,Base,s,s,s,hotkey20,Base,...,,,,,,,,,,
3,http://eu.battle.net/sc2/en/profile/3973341/1/...,Protoss,1655,0.368843,Base,s,s,Base,s,hotkey20,...,,,,,,,,,,
4,http://eu.battle.net/sc2/en/profile/2452136/1/...,Protoss,1650,0.337907,s,Base,s,hotkey90,hotkey00,s,...,,,,,,,,,,


## Aligner le nombre de colonnes pour training and testing datasets

In [39]:
columns_to_add = (set(df_training.columns) - set(df_testing.columns))-set(["id_player"])
for column in columns_to_add: 
    df_testing[column] = np.nan
if validation:
    training_columns_l = len(df_training.columns)-1
    testing_columns_l = len(df_testing.columns)
    if testing_columns_l>training_columns_l:
        columns_to_add = (set(df_testing.columns) - set(df_training.columns))-set(["id_player"])
        for column in columns_to_add: 
            df_training[column] = np.nan

index_end_actions = len(df_training.columns)

## Calculer l'occurence de chaque action pendant une durée donnée

In [40]:
def __row_of_occurence_actions__(actions, counts, list_of_all_actions):
    dict_of_occurence = {action: 0 for action in list_of_all_actions}
    for action, count in zip(actions, counts):
        dict_of_occurence[action] = count   
    return [int(dict_of_occurence[action]) for action in list_of_all_actions]
        
def __compute_occurence_actions__(df, list_of_all_actions, training=True):
    df_occurence_actions = pd.DataFrame(0, index=df.index, columns=list_of_all_actions, dtype=int)
    for index, row in df.iterrows():
        column_index_start = index_beginning_actions if training else index_beginning_actions-1
        row_to_examine = df.iloc[index,column_index_start:]
        actions, counts = np.unique(row_to_examine.dropna().values, return_counts=True)
        df_occurence_actions.iloc[index,] = __row_of_occurence_actions__(actions, counts, list_of_all_actions)
    return df.join(df_occurence_actions)

In [41]:
df_training = __compute_occurence_actions__(df_training, possible_actions)
df_testing = __compute_occurence_actions__(df_testing, possible_actions, validation)

In [42]:
df_testing.head()

Unnamed: 0,id_player,played_race,time_match,average_speed_match,0th_action,1th_action,2th_action,3th_action,4th_action,5th_action,...,hotkey60,hotkey21,hotkey41,hotkey82,hotkey71,hotkey90,hotkey51,s,hotkey30,hotkey81
0,http://eu.battle.net/sc2/en/profile/250458/1/V...,Zerg,880,0.466102,s,s,s,s,hotkey20,s,...,0,0,0,0,0,0,0,127,0,0
1,http://eu.battle.net/sc2/en/profile/3538115/1/...,Zerg,1130,0.261635,s,s,s,hotkey10,s,hotkey60,...,1,0,0,0,0,1,0,126,0,0
2,http://eu.battle.net/sc2/en/profile/250458/1/V...,Zerg,1125,0.465068,Base,s,s,s,hotkey20,Base,...,0,0,0,0,0,0,0,119,0,0
3,http://eu.battle.net/sc2/en/profile/3973341/1/...,Protoss,1655,0.368843,Base,s,s,Base,s,hotkey20,...,1,0,0,0,0,0,0,142,5,0
4,http://eu.battle.net/sc2/en/profile/2452136/1/...,Protoss,1650,0.337907,s,Base,s,hotkey90,hotkey00,s,...,0,0,0,0,0,1,0,198,2,0


## Delete colums to have lower ordered actions

In [43]:
def __delete_columns_for_lower_limit__(df, begin_index, end_index):
    columns_to_delete = list(df.columns[begin_index:end_index])
    return df.drop(columns_to_delete, axis=1)

In [44]:
index_column_to_delete = index_beginning_actions+5
df_training = __delete_columns_for_lower_limit__(df_training, index_column_to_delete, index_end_actions)
df_testing = __delete_columns_for_lower_limit__(df_testing, index_column_to_delete, index_end_actions)
index_end_actions = index_column_to_delete

## Encode values

In [45]:
#Encodage
encoder = Encoder_Categorical_Numerical(df_training, index_beginning_actions, index_end_actions)
df_encoded_training = encoder.encode_df(df_training)
df_encoded_testing = encoder.encode_df(df_testing, validation)

In [46]:
df_encoded_training.head()
df_encoded_testing.head()

Unnamed: 0,id_player,played_race,time_match,average_speed_match,0th_action,1th_action,2th_action,3th_action,4th_action,hotkey72,...,hotkey60,hotkey21,hotkey41,hotkey82,hotkey71,hotkey90,hotkey51,s,hotkey30,hotkey81
0,18,2,880,0.466102,25,25,25,25,11,0,...,0,0,0,0,0,0,0,127,0,0
1,30,2,1130,0.261635,25,25,25,1,25,0,...,1,0,0,0,0,1,0,126,0,0
2,18,2,1125,0.465068,9,25,25,25,11,0,...,0,0,0,0,0,0,0,119,0,0
3,2,1,1655,0.368843,9,25,25,9,25,1,...,1,0,0,0,0,0,0,142,5,0
4,16,1,1650,0.337907,25,9,25,23,7,0,...,0,0,0,0,0,1,0,198,2,0


### For ordered actions, fill with -1 after lower time limit

In [47]:
df_encoded_training = df_encoded_training.fillna(-1)
df_encoded_testing = df_encoded_testing.fillna(-1)

In [48]:
def __remove_values_to_agree_lower_limit__(df, training, lower_limit_colums, end_index):
    for index_row, begin_index_column in enumerate(lower_limit_colums):
        begin_index_column = begin_index_column + index_beginning_actions
        if training:
            begin_index_column += 1
        df.iloc[index_row, begin_index_column:end_index] = -1
    return df

In [49]:
df_encoded_training = __remove_values_to_agree_lower_limit__(df_encoded_training, True, lower_column_to_keep_training, index_end_actions)
df_encoded_testing = __remove_values_to_agree_lower_limit__(df_encoded_testing, validation, lower_column_to_keep_testing, index_end_actions)

In [50]:
df_encoded_training.head()
df_encoded_testing.head()

Unnamed: 0,id_player,played_race,time_match,average_speed_match,0th_action,1th_action,2th_action,3th_action,4th_action,hotkey72,...,hotkey60,hotkey21,hotkey41,hotkey82,hotkey71,hotkey90,hotkey51,s,hotkey30,hotkey81
0,18,2,880,0.466102,25,25,25,25,11,0,...,0,0,0,0,0,0,0,127,0,0
1,30,2,1130,0.261635,25,25,25,1,25,0,...,1,0,0,0,0,1,0,126,0,0
2,18,2,1125,0.465068,9,25,25,25,11,0,...,0,0,0,0,0,0,0,119,0,0
3,2,1,1655,0.368843,9,25,25,9,25,1,...,1,0,0,0,0,0,0,142,5,0
4,16,1,1650,0.337907,25,9,25,23,7,0,...,0,0,0,0,0,1,0,198,2,0


# Training

In [51]:
decision_tree = RandomForestClassifier(n_estimators = 150, max_depth = 500)
decision_tree.fit(df_encoded_training.iloc[:, 1:].values, df_encoded_training.id_player.values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=500, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Testing

In [165]:
predicted = decision_tree.predict(df_encoded_testing.values)
decoded_predicted = encoder.decode_labels(predicted)  # We decode the encoded predictions
indices = range(1, len(predicted) + 1)
output_df = pd.DataFrame({"RowId": indices, "prediction": decoded_predicted})
output_df.to_csv("test_labels.CSV", index=False)

# Small validation

In [52]:
nolabel_df_validation = df_encoded_testing.drop(axis=1, labels="id_player")
nolabel_df_validation.fillna(0)
predicted = decision_tree.predict(nolabel_df_validation.values)
labels = df_encoded_testing.id_player.values
print("accuracy:", accuracy_extractor.get_accuracy(labels, predicted))

accuracy: 87.89625360230548
