**Initialisation**

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import csv
from sklearn import tree
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import re
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


Format data for training set and test set

In [2]:
def formatTrain(file):
    """Load the labelled training CSV into a DataFrame.

    Every non-empty line is read as: profile, race, then a variable number of
    move tokens (actions and time markers). Blank lines are skipped.

    Parameters
    ----------
    file : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``Profile``, ``Race`` and ``Moves`` (a list of str per row),
        one row per non-empty line of the file.

    Raises
    ------
    IndexError
        If a non-empty line has fewer than two fields.
    """
    profile = []
    race = []
    moves = []
    # The csv module asks for newline='' so that it handles line endings itself.
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar="'")
        for row in csvreader:
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise.
                continue
            profile.append(row[0])
            race.append(row[1])
            moves.append(row[2:])
    return pd.DataFrame(data={'Profile': profile, 'Race': race, 'Moves': moves})

def formatTest(file):
    """Load the unlabelled test CSV into a DataFrame.

    Every non-empty line is read as: race, then a variable number of move
    tokens (actions and time markers). Blank lines are skipped.

    Parameters
    ----------
    file : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``Race`` and ``Moves`` (a list of str per row), one row per
        non-empty line of the file.
    """
    race = []
    moves = []
    # The csv module asks for newline='' so that it handles line endings itself.
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar="'")
        for row in csvreader:
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise.
                continue
            race.append(row[0])
            moves.append(row[1:])
    return pd.DataFrame(data={'Race': race, 'Moves': moves})

In [3]:
# Parse the raw CSV files into DataFrames; paths are relative to the notebook's working directory.
train_data = formatTrain('../input/train.csv/TRAIN.CSV')
# The test file has no profile column, hence the separate loader.
test_data = formatTest('../input/test.csv/TEST.CSV')

Display unique players

In [4]:
# List each distinct profile once; these profile strings are the class labels predicted later.
print("Different profiles:", train_data["Profile"].unique(), sep="\n" )


Different profiles:
['http://eu.battle.net/sc2/en/profile/4234852/1/First/'
 'http://eu.battle.net/sc2/en/profile/3074362/1/Stardust/'
 'http://eu.battle.net/sc2/en/profile/3401218/1/Welmu/'
 'http://eu.battle.net/sc2/en/profile/2896854/1/MǂForGG/'
 'http://eu.battle.net/sc2/en/profile/3538115/1/Golden/'
 'http://eu.battle.net/sc2/en/profile/250458/1/VortiX/'
 'http://eu.battle.net/sc2/en/profile/3973341/1/yoeFWSan/'
 'http://eu.battle.net/sc2/en/profile/2452136/1/MinChul/'
 'http://eu.battle.net/sc2/en/profile/2222468/1/dTefel/'
 'http://eu.battle.net/sc2/en/profile/4341883/1/Patience/'
 'http://eu.battle.net/sc2/en/profile/950504/1/Grubby/'
 'http://eu.battle.net/sc2/en/profile/2898004/1/MMA/'
 'http://eu.battle.net/sc2/en/profile/251061/1/LiveZerg/'
 'http://eu.battle.net/sc2/en/profile/1021189/1/Dayshi/'
 'http://eu.battle.net/sc2/en/profile/326029/1/LiquidTLO/'
 'http://eu.battle.net/sc2/en/profile/1058669/1/Happy/'
 'http://eu.battle.net/sc2/en/profile/1139573/1/BabyKnight/'
 'ht

<h3>Get every unique move for all the players</h3>

In [5]:
def get_unique_moves(df):
    """Collect the distinct action names that appear in ``df['Moves']``.

    Tokens are stripped of surrounding whitespace. Tokens starting with
    ``'t'`` (the time markers such as ``t60``) and empty tokens are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Moves`` column holding one iterable of str per row.

    Returns
    -------
    set of str
        Every distinct non-marker token found in the column.
    """
    unique_moves = set()
    for move in df['Moves']:
        for a in move:
            a = a.strip()
            # An empty token (e.g. from a trailing comma) used to raise
            # IndexError on a[0]; skip it together with the 't...' markers.
            if a and not a.startswith('t'):
                unique_moves.add(a)
    return unique_moves

# Sorted so the feature columns derived from this list always come out in the same order.
unique_moves = sorted(get_unique_moves(train_data))
print(unique_moves)


['Base', 'SingleMineral', 'hotkey00', 'hotkey01', 'hotkey02', 'hotkey10', 'hotkey11', 'hotkey12', 'hotkey20', 'hotkey21', 'hotkey22', 'hotkey30', 'hotkey31', 'hotkey32', 'hotkey40', 'hotkey41', 'hotkey42', 'hotkey50', 'hotkey51', 'hotkey52', 'hotkey60', 'hotkey61', 'hotkey62', 'hotkey70', 'hotkey71', 'hotkey72', 'hotkey80', 'hotkey81', 'hotkey82', 'hotkey90', 'hotkey91', 'hotkey92', 's']


<h3>Define features</h3>

In [6]:
def define_features(df, moves=None):
    """Turn each game's move list into a row of count features.

    For every row of ``df`` the result holds:

    * ``Race``: copied from the input row;
    * one column per action with its total number of occurrences;
    * ``t60_moves``: number of known actions before the ``"t60"`` marker;
    * ``t120_moves``: number of known actions between ``"t60"`` and ``"t120"``;
    * one ``t60_<action>`` column per action with its count before ``"t60"``.

    The windowed features stay at 0 when the corresponding marker is absent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Race`` column and a ``Moves`` column (list of str).
    moves : iterable of str, optional
        Action vocabulary, in column order. Defaults to the module-level
        ``unique_moves`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per input row. An empty input gives an empty frame that
        still carries all feature columns.
    """
    actions = list(unique_moves if moves is None else moves)
    t60_actions = ["t60_" + move for move in actions]
    columns = ["Race", *actions, "t60_moves", "t120_moves", *t60_actions]

    features = []
    for _, row in df.iterrows():
        race = row["Race"]
        player_moves = row["Moves"]
        # The dict keys double as the action vocabulary for O(1) membership tests.
        moves_count = {move: 0 for move in actions}
        moves_60_count = {move: 0 for move in t60_actions}
        t60_moves = 0
        t120_moves = 0

        for action in player_moves:
            if action in moves_count:
                moves_count[action] += 1
        if "t60" in player_moves:
            for action in player_moves[:player_moves.index("t60")]:
                if action in moves_count:
                    t60_moves += 1
                    moves_60_count["t60_" + action] += 1
        if "t120" in player_moves:
            # Assumes a "t60" marker is present whenever "t120" is; index()
            # raises ValueError otherwise, as in the original code.
            window = player_moves[player_moves.index("t60"):player_moves.index("t120")]
            for action in window:
                if action in moves_count:
                    t120_moves += 1
        features.append([
            race,
            *[moves_count[move] for move in actions],
            t60_moves,
            t120_moves,
            *[moves_60_count[move] for move in t60_actions],
        ])

    # Build the frame once, after the loop. It used to be rebuilt on every
    # iteration and was left undefined for an empty input.
    return pd.DataFrame(features, columns=columns)



<h3>Define features of training set and test set</h3>

Define features for training data

In [7]:
# Build the per-game count features, then one-hot encode the categorical "Race" column.
train_features = pd.get_dummies(define_features(train_data), columns=["Race"])

train_features.head()

Unnamed: 0,Base,SingleMineral,hotkey00,hotkey01,hotkey02,hotkey10,hotkey11,hotkey12,hotkey20,hotkey21,hotkey22,hotkey30,hotkey31,hotkey32,hotkey40,hotkey41,hotkey42,hotkey50,hotkey51,hotkey52,hotkey60,hotkey61,hotkey62,hotkey70,hotkey71,hotkey72,hotkey80,hotkey81,hotkey82,hotkey90,hotkey91,hotkey92,s,t60_moves,t120_moves,t60_Base,t60_SingleMineral,t60_hotkey00,t60_hotkey01,t60_hotkey02,t60_hotkey10,t60_hotkey11,t60_hotkey12,t60_hotkey20,t60_hotkey21,t60_hotkey22,t60_hotkey30,t60_hotkey31,t60_hotkey32,t60_hotkey40,t60_hotkey41,t60_hotkey42,t60_hotkey50,t60_hotkey51,t60_hotkey52,t60_hotkey60,t60_hotkey61,t60_hotkey62,t60_hotkey70,t60_hotkey71,t60_hotkey72,t60_hotkey80,t60_hotkey81,t60_hotkey82,t60_hotkey90,t60_hotkey91,t60_hotkey92,t60_s,Race_Protoss,Race_Terran,Race_Zerg
0,66,5,5,0,41,63,0,350,37,0,185,1,0,401,5,0,255,4,0,114,3,0,18,1,0,0,0,0,0,4,0,0,674,35,102,2,1,1,0,0,0,0,0,0,0,0,1,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,1,0,0
1,12,0,2,0,83,65,3,448,15,0,215,7,0,342,3,0,308,0,0,0,0,0,0,0,0,0,1,0,16,1,0,18,538,88,119,3,0,0,0,0,2,0,3,0,0,0,1,0,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,41,1,0,0
2,47,3,3,0,18,17,0,130,9,0,126,1,0,320,7,0,130,3,0,88,1,0,11,0,0,0,0,0,0,2,0,2,430,75,110,2,0,1,0,0,0,0,0,0,0,0,1,0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,38,1,0,0
3,21,0,2,0,39,32,1,259,14,0,70,6,0,245,3,0,284,0,0,0,0,0,0,0,0,0,0,0,0,1,0,14,397,76,101,3,0,0,0,0,1,0,2,0,0,0,1,0,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,37,1,0,0
4,2,0,1,0,0,10,0,52,6,0,38,1,0,200,2,0,84,1,0,14,0,0,0,0,0,0,0,0,0,0,0,0,208,72,85,2,0,1,0,0,0,0,0,0,0,0,1,0,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,1,0,0


Define features for test data

In [8]:
test_features = define_features(test_data)
test_features = pd.get_dummies(test_features, columns = ["Race"])
# get_dummies only creates columns for the races present in *this* frame.
# Align to the training columns so the classifiers always receive the same
# features in the same order; a race missing from the test set becomes an
# all-zero column, and a column unknown to the training set is dropped.
test_features = test_features.reindex(columns=train_features.columns, fill_value=0)

test_features.head()

Unnamed: 0,Base,SingleMineral,hotkey00,hotkey01,hotkey02,hotkey10,hotkey11,hotkey12,hotkey20,hotkey21,hotkey22,hotkey30,hotkey31,hotkey32,hotkey40,hotkey41,hotkey42,hotkey50,hotkey51,hotkey52,hotkey60,hotkey61,hotkey62,hotkey70,hotkey71,hotkey72,hotkey80,hotkey81,hotkey82,hotkey90,hotkey91,hotkey92,s,t60_moves,t120_moves,t60_Base,t60_SingleMineral,t60_hotkey00,t60_hotkey01,t60_hotkey02,t60_hotkey10,t60_hotkey11,t60_hotkey12,t60_hotkey20,t60_hotkey21,t60_hotkey22,t60_hotkey30,t60_hotkey31,t60_hotkey32,t60_hotkey40,t60_hotkey41,t60_hotkey42,t60_hotkey50,t60_hotkey51,t60_hotkey52,t60_hotkey60,t60_hotkey61,t60_hotkey62,t60_hotkey70,t60_hotkey71,t60_hotkey72,t60_hotkey80,t60_hotkey81,t60_hotkey82,t60_hotkey90,t60_hotkey91,t60_hotkey92,t60_s,Race_Protoss,Race_Terran,Race_Zerg
0,19,3,14,0,847,7,31,352,10,0,67,3,0,2,0,0,0,0,0,0,3,1,981,4,0,43,0,0,0,6,0,29,848,242,203,0,0,5,0,80,1,0,2,1,0,4,0,0,0,0,0,0,0,0,0,1,0,89,0,0,0,0,0,0,1,0,0,58,0,0,1
1,0,0,2,0,33,14,0,336,31,0,150,1,0,467,3,0,402,1,0,157,0,0,0,0,0,0,0,0,0,1,0,47,572,118,243,0,0,1,0,0,1,0,11,1,0,0,1,0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,47,0,1,0
2,20,0,1,0,0,10,0,479,53,0,628,60,0,251,2,0,217,0,0,0,1,0,1130,3,0,32,5,0,49,0,0,0,556,170,196,2,0,0,0,0,0,0,0,4,0,14,4,0,6,1,0,0,0,0,0,1,0,76,1,0,0,1,0,0,0,0,0,60,1,0,0
3,17,2,0,0,0,68,89,525,13,6,129,4,2,30,3,1,528,2,0,3,1,0,3,0,0,0,0,0,0,0,0,0,1588,86,104,0,0,0,0,0,0,0,0,1,0,13,0,0,0,1,0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,38,0,0,1
4,21,0,3,0,0,4,7,74,8,0,30,11,0,16,5,0,24,1,0,83,0,0,0,1,0,3,4,0,1,0,0,0,236,48,61,2,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,19,0,0,0,0,0,0,0,0,0,0,0,0,24,1,0,0


Input features

In [9]:
# Target: the profile string of each training game.
Y =  train_data['Profile']
# Inputs: the engineered feature matrix (X is the same object as train_features, not a copy).
X = train_features

Split train data for local test

In [10]:
# Hold out 20% of the labelled games for local evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
# Sanity check: features and labels must have matching row counts in each part.
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(2441, 71) (2441,)
(611, 71) (611,)


<h3>Decision Tree Classifier</h3>

In [11]:
# Fit on the complete training set; these are the predictions exported further down.
dt_clf = tree.DecisionTreeClassifier(min_samples_split=4, random_state=99)
dt_clf.fit(X, Y)
predicted_dt = dt_clf.predict(test_features)
print(predicted_dt[:10])

['http://eu.battle.net/sc2/en/profile/3538115/1/Golden/'
 'http://eu.battle.net/sc2/en/profile/2896854/1/MǂForGG/'
 'http://eu.battle.net/sc2/en/profile/3973341/1/yoeFWSan/'
 'http://eu.battle.net/sc2/en/profile/250458/1/VortiX/'
 'http://eu.battle.net/sc2/en/profile/1139573/1/BabyKnight/'
 'http://eu.battle.net/sc2/en/profile/2896854/1/MǂForGG/'
 'http://eu.battle.net/sc2/en/profile/4234852/1/First/'
 'http://eu.battle.net/sc2/en/profile/884897/1/LiquidSnute/'
 'http://eu.battle.net/sc2/en/profile/2526293/1/Krr/'
 'http://eu.battle.net/sc2/en/profile/377576/1/LiquidMaNa/']


In [12]:
# Local check: refit on the training part of the split and score on the held-out part.
dt_clf = tree.DecisionTreeClassifier(min_samples_split=4, random_state=99)
dt_clf.fit(X_train, Y_train)
predicted_dt_local = dt_clf.predict(X_test)
print(metrics.f1_score(Y_test, predicted_dt_local, average='micro'))

0.7659574468085105


In [13]:
# 3-fold cross-validation of the decision tree on the full training features.
scores = cross_val_score(dt_clf, train_features, Y, cv=3)
print(scores)

[0.7273559  0.76063304 0.71940928]


<h3>Random forest</h3>

Random forest using the best parameters from `RandomizedSearchCV`

In [14]:
# Fit on the complete training set; these are the predictions exported further down.
rd_forest_clf = RandomForestClassifier(
    n_estimators=200,
    bootstrap=False,
    criterion='entropy',
    max_depth=18,
    max_features=5,
    min_samples_split=2,
    random_state=11,
)
rd_forest_clf.fit(X, Y)
predicted_rd_forest = rd_forest_clf.predict(test_features)
print(predicted_rd_forest[:10])

['http://eu.battle.net/sc2/en/profile/3538115/1/Golden/'
 'http://eu.battle.net/sc2/en/profile/2896854/1/MǂForGG/'
 'http://eu.battle.net/sc2/en/profile/3973341/1/yoeFWSan/'
 'http://eu.battle.net/sc2/en/profile/250458/1/VortiX/'
 'http://eu.battle.net/sc2/en/profile/950504/1/Grubby/'
 'http://eu.battle.net/sc2/en/profile/2896854/1/MǂForGG/'
 'http://eu.battle.net/sc2/en/profile/4234852/1/First/'
 'http://eu.battle.net/sc2/en/profile/884897/1/LiquidSnute/'
 'http://eu.battle.net/sc2/en/profile/2526293/1/Krr/'
 'http://eu.battle.net/sc2/en/profile/377576/1/LiquidMaNa/']


In [15]:
# Base estimator handed to the randomized search below, which samples its other settings.
clf = RandomForestClassifier(n_estimators=100)
# Helper that prints the top-ranked candidates of a hyper-parameter search
def report(results, n_top=3):
    """Print score and parameters of the best-ranked search candidates.

    Parameters
    ----------
    results : mapping
        Search results with the keys ``rank_test_score``,
        ``mean_test_score``, ``std_test_score`` and ``params``
        (for example the ``cv_results_`` of a fitted search).
    n_top : int, default 3
        Highest rank to print; ranks ``1..n_top`` are reported, and every
        candidate tied on a rank is printed.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")


# specify parameters and distributions to sample from
# NOTE(review): "random_state" here is the forest's own seed, sampled like a
# hyper-parameter; a candidate that wins on it may just be a lucky seed.
param_dist = {"max_depth": sp_randint(10, 25),
              "max_features": sp_randint(5, 15),
              "min_samples_split": sp_randint(2, 10),
              "bootstrap": [False],
              "criterion": ["gini", "entropy"],
             "random_state":sp_randint(0, 13)}

# run randomized search
n_iter_search = 20
# Seed the search itself so the same candidates are sampled on every run;
# without it the reported ranking changes from one execution to the next.
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5,
                                   random_state=0)

start = time()
random_search.fit(X, Y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)




RandomizedSearchCV took 427.81 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.919 (std: 0.013)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 13, 'max_features': 11, 'min_samples_split': 2, 'random_state': 12}

Model with rank: 2
Mean validation score: 0.918 (std: 0.015)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 12, 'min_samples_split': 7, 'random_state': 5}

Model with rank: 3
Mean validation score: 0.917 (std: 0.013)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 11, 'max_features': 9, 'min_samples_split': 5, 'random_state': 10}

Model with rank: 3
Mean validation score: 0.917 (std: 0.011)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 18, 'max_features': 9, 'min_samples_split': 5, 'random_state': 1}



Local test with the best parameters from the randomized search

In [16]:
# Local check of the tuned forest: refit on the training part of the split, score on the held-out part.
rd_forest_clf = RandomForestClassifier(
    n_estimators=200,
    bootstrap=False,
    criterion='entropy',
    max_depth=18,
    max_features=5,
    min_samples_split=2,
    random_state=11,
)
rd_forest_clf.fit(X_train, Y_train)
predicted_rd_forest_local = rd_forest_clf.predict(X_test)
print(metrics.f1_score(Y_test, predicted_rd_forest_local, average='micro'))

0.9099836333878888


<h3>Convert results to csv</h3>

In [17]:
def convert(output):
    """Wrap predictions in a submission-shaped DataFrame.

    Parameters
    ----------
    output : array-like
        Predicted labels, one per test row.

    Returns
    -------
    pandas.DataFrame
        Single ``prediction`` column, indexed from 1 under the index name
        ``RowId``.
    """
    submission = pd.DataFrame(output, columns=['prediction'])
    # Relabel the rows 1..n in one step instead of setting index and name separately.
    submission.index = pd.RangeIndex(start=1, stop=len(submission) + 1, name='RowId')
    return submission

def save(output_df, name):
    """Write ``output_df`` as CSV to ``./out_<name>.csv`` in the working directory.

    Parameters
    ----------
    output_df : pandas.DataFrame
        Frame to export; its index is written as the first column.
    name : str
        Suffix inserted into the output file name.
    """
    destination = './out_' + name + '.csv'
    output_df.to_csv(destination)

Convert results of Decision Tree

In [18]:
# Shape the decision-tree predictions for submission and write them to ./out_decisionTree_prediction.csv.
predicted_dt_converted = convert(predicted_dt)
save(predicted_dt_converted, "decisionTree_prediction")
# Preview the first rows of what was written.
predicted_dt_converted.head()

Unnamed: 0_level_0,prediction
RowId,Unnamed: 1_level_1
1,http://eu.battle.net/sc2/en/profile/3538115/1/...
2,http://eu.battle.net/sc2/en/profile/2896854/1/...
3,http://eu.battle.net/sc2/en/profile/3973341/1/...
4,http://eu.battle.net/sc2/en/profile/250458/1/V...
5,http://eu.battle.net/sc2/en/profile/1139573/1/...


Convert results of Random Forest

In [19]:
# Shape the random-forest predictions for submission and write them to ./out_rndForest_prediction.csv.
predicted_rd_fr_converted = convert(predicted_rd_forest)
save(predicted_rd_fr_converted, "rndForest_prediction")
# Preview the first rows of what was written.
predicted_rd_fr_converted.head()

Unnamed: 0_level_0,prediction
RowId,Unnamed: 1_level_1
1,http://eu.battle.net/sc2/en/profile/3538115/1/...
2,http://eu.battle.net/sc2/en/profile/2896854/1/...
3,http://eu.battle.net/sc2/en/profile/3973341/1/...
4,http://eu.battle.net/sc2/en/profile/250458/1/V...
5,http://eu.battle.net/sc2/en/profile/950504/1/G...
