In [1]:
import csv
from pandas import DataFrame
import sklearn as sk

# personal imports
import data_extractor as de
from MultiColumnLabelEncoder import MultiColumnLabelEncoder

# --------- FOR THE TRAINING PART ----------

# Extracting training data ('till 5 seconds for now)

In [2]:
mycsv = open("data/TRAIN.CSV")
limit_seconds = 5
extracted_df, labels = de.extract_data_till_time_df(mycsv, limit_seconds)
#print("extracted_df:\n", extracted_df)
#print("labels:\n", labels)

# Encoding training features and labels (needed for using method fit from the predictors). Need to keep/save the encoders for the decoding part.

## Encoding labels = encoding a list

In [3]:
from sklearn.preprocessing import LabelEncoder

#print("labels:", labels)
le = LabelEncoder()
le.fit(labels) # Fill the Label Encoders with given duplicate labels (that are for example in a list)
#print(le.classes_) # Corresponding set of these labels
encoded_labels = le.transform(labels) # A list of the labels that in their encoding form using le
#print("encoded_labels", encoded_labels)

## Encoding features = encoding a DF

In [4]:
print("extracted:\n", extracted_df)

mle = MultiColumnLabelEncoder()
mle.fit(extracted_df) # Fill the Label Encoders with given duplicate labels (that are for example in a list)
#print("mle.classes:", mle.classes) # Corresponding set of these labels
encoded_features = mle.transform(extracted_df) # A list of the labels that in their encoding form using le
#print("encoded_features", encoded_features)

extracted:
            0     1         2         3         4         5         6   \
0     Protoss  Base         s         s         s         s         s   
1     Protoss     s      Base         s         s      Base         s   
2     Protoss  Base         s         s         s      Base         s   
3     Protoss  Base         s         s      Base         s         s   
4     Protoss  Base         s         s         s      Base         s   
5     Protoss     s      Base         s         s      Base         s   
6     Protoss  Base         s         s         s      Base         s   
7     Protoss  Base         s         s      Base         s         s   
8     Protoss  Base         s  hotkey40       unk       unk       unk   
9      Terran     s  hotkey30  hotkey00       unk       unk       unk   
10    Protoss  Base         s  hotkey40       unk       unk       unk   
11     Terran     s  hotkey30  hotkey00       unk       unk       unk   
12    Protoss  Base         s  hotkey40

# Training a model on the encoded training set

In [5]:
from sklearn import tree

model = tree.DecisionTreeClassifier()
model.fit(encoded_features.values, encoded_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

# --------------- FOR THE TESTING PART -----------------

# Extract test data (features + labels) (from training set) ('till 5 seconds') in an encoded form

In [6]:
# TODO: Beware when extracting test data: of the size of the elements must be the same size as the size of the train elements!!!

# 1/ Extract
n_features = len(extracted_df.columns.values)
mycsv_test = open("data/minitest.csv")
extracted_df_test, labels_test = de.extract_data_till_time_df(mycsv_test, limit_seconds, data_role=1, n_features=n_features)
print("extracted_df for test:\n", extracted_df_test)
print("test_labels:\n", labels_test)

# 2/ Encode: We use the encoders contruted from elements of training set
# 2.1 Labels
encoded_labels_test = le.transform(labels_test)
# 2.2 Features
encoded_features = mle.transform(extracted_df_test) 

extracted_df for test:
        0  1  2         3         4         5         6         7         8   \
0    Zerg  s  s         s         s  hotkey40         s  hotkey20  hotkey42   
1  Terran  s  s  hotkey10         s  hotkey20         s         s  hotkey12   
2    Zerg  s  s         s         s         s  hotkey40  hotkey20         s   
3  Terran  s  s         s  hotkey10  hotkey20  hotkey30  hotkey40  hotkey50   
4  Terran  s  s  hotkey10         s  hotkey20         s         s  hotkey12   
5  Terran  s  s         s         s         s  hotkey10  hotkey20  hotkey30   
6  Terran  s  s  hotkey10         s  hotkey20         s         s  hotkey12   
7  Terran  s  s         s  hotkey10  hotkey20  hotkey30  hotkey40  hotkey50   
8  Terran  s  s  hotkey10         s  hotkey20         s         s  hotkey12   
9  Terran  s  s         s  hotkey10  hotkey20  hotkey30  hotkey40  hotkey50   

         9  ...    29   30   31   32   33   34   35   36   37   38  
0  hotkey22 ...   unk  unk  unk  unk 

# Test on the test data from training set

In [13]:
import numpy as np

# 3/ Predict
encoded_predicted = model.predict(encoded_features.values)

# 4/ Display score
comparison_list = encoded_predicted == encoded_labels_test
print("Nb exacts:", np.sum(comparison_list), "/", len(comparison_list))

Nb exacts: 9 / 10


# Decode predicted (on test data from train data) and display them

In [15]:
decoded_predicted = le.inverse_transform(encoded_predicted)
print("predicted:", decoded_predicted)

predicted: ['http://kr.battle.net/sc2/en/profile/2342120/1/Soulkey/'
 'http://xx.battle.net/sc2/en/profile/410/1/STBomber/'
 'http://xx.battle.net/sc2/en/profile/401/1/Soulkey/'
 'http://xx.battle.net/sc2/en/profile/405/1/MMA/'
 'http://xx.battle.net/sc2/en/profile/410/1/STBomber/'
 'http://xx.battle.net/sc2/en/profile/405/1/MMA/'
 'http://xx.battle.net/sc2/en/profile/410/1/STBomber/'
 'http://xx.battle.net/sc2/en/profile/405/1/MMA/'
 'http://xx.battle.net/sc2/en/profile/410/1/STBomber/'
 'http://xx.battle.net/sc2/en/profile/405/1/MMA/']


  if diff:


# ------------------ FOR THE APPLYING PART -------------------

# Extract and encode

In [16]:
mycsv_guess = open("data/TEST.CSV")
extracted_df_guess, _ = de.extract_data_till_time_df(mycsv_guess, limit_seconds, data_role = 2, n_features=n_features)
encoded_features_guess = mle.transform(extracted_df_guess) 

encoded_predicted_guess = model.predict(encoded_features_guess.values)

ValueError: y contains new labels: ['Base' 'SingleMineral' 's']

## Applying (with test file)