In [1]:
import warnings
import os
import json
import datetime
import functools as fn
import collections as cc
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold, ShuffleSplit, KFold, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

%matplotlib inline
# Let pandas display up to 250 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 250)
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/sklearn warnings that may point at real problems
# (e.g. chained-assignment or convergence warnings). Consider narrowing the
# filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Read the three competition tables from the working directory, in the order
# train features, train targets, test features.
train_features, train_targets, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        './train_features.csv',
        './train_targets.csv',
        './test_features.csv',
    )
)

In [3]:
# Drop the first column of each feature table to obtain the model inputs.
# NOTE(review): presumably the first column is the match identifier
# (write_to_submission_file uses test_features.iloc[:, 0] as the index) and the
# rows of train_features and train_targets are in the same order — confirm.
#
# .copy() makes X_train / X_test independent frames: add_new_features later
# assigns new columns to them in place, which on a bare iloc slice triggers
# pandas' SettingWithCopyWarning (currently hidden by the global warning filter).
X_train = train_features.iloc[:, 1:].copy()
y_train = train_targets['radiant_win'].astype('int')
X_test = test_features.iloc[:, 1:].copy()

In [4]:
def complex_holdout_score(y, prediction):
    """Print and return accuracy and ROC AUC for hold-out predictions.

    Parameters
    ----------
    y : true labels of the hold-out set.
    prediction : predicted scores; values above 0.5 are counted as the
        positive class for the accuracy computation.

    Returns
    -------
    dict with keys ``'Accuracy'`` and ``'ROC AUC'``.
    """
    scores = {
        'Accuracy': accuracy_score(y, prediction > 0.5),
        'ROC AUC': roc_auc_score(y, prediction),
    }
    print('Accuracy score(holdout): ', str(scores['Accuracy']))
    print('ROC AUC score(holdout): ', str(scores['ROC AUC']))
    return scores


def complex_cv_score(cv_score):
    """Print and return the mean and standard deviation of CV scores.

    Parameters
    ----------
    cv_score : object exposing ``.mean()`` and ``.std()`` (e.g. the array
        returned by ``cross_val_score``).

    Returns
    -------
    tuple ``(mean, std)``.
    """
    summary = (cv_score.mean(), cv_score.std())
    print('ROC AUC mean score(cv): ', str(summary[0]))
    print('ROC AUC score std(cv): ', str(summary[1]))
    return summary


def compare_cv_scores(score_1, score_2):
    """Return whether ``score_2`` is strictly greater than ``score_1``."""
    second_is_higher = score_2 > score_1
    return second_is_higher


# Prefer ujson when it is installed; otherwise fall back to the standard json
# module. ImportError (the parent of ModuleNotFoundError) also covers an
# installed-but-broken package.
try:
    import ujson as json
except ImportError:
    import json
    print ('Please install ujson to read JSON oblects faster')

# Use tqdm's notebook progress bar when available; otherwise substitute a
# no-op wrapper with a compatible call signature.
try:
    from tqdm import tqdm_notebook
except ImportError:
    def tqdm_notebook(iterable, *args, **kwargs):
        """Fallback progress wrapper: return ``iterable`` unchanged.

        Extra positional/keyword arguments (such as ``total=``) are accepted
        and ignored so that call sites written for tqdm keep working.
        """
        return iterable
    print ('Please install tqdm to track progress with Python loops')

    
def read_matches(matches_file):
    """Lazily yield one parsed JSON object per non-blank line of a JSONL file.

    Parameters
    ----------
    matches_file : path to a JSON-lines file.

    Yields
    ------
    The object decoded from each line by ``json.loads``.

    Notes
    -----
    The progress bar total is looked up by file name in ``MATCHES_COUNT``;
    for any other file name the total is ``None``.
    """
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    filename = os.path.basename(matches_file)
    total_matches = MATCHES_COUNT.get(filename)

    # Explicit UTF-8 so decoding does not depend on the platform's default
    # locale encoding.
    with open(matches_file, encoding='utf-8') as fin:
        for line in tqdm_notebook(fin, total=total_matches):
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing inside the JSON parser.
            if not line.strip():
                continue
            yield json.loads(line)
            
def write_to_submission_file(predicted_labels, out_file, target='radiant_win_prob', index_label='match_id_hash'):
    """Write predictions to a CSV file indexed by the test set's first column.

    Parameters
    ----------
    predicted_labels : values for the single output column.
    out_file : destination passed to ``DataFrame.to_csv``.
    target : name of the prediction column.
    index_label : header written for the index column.

    Notes
    -----
    Reads the notebook-level ``test_features`` frame; its first column
    becomes the index of the written table.
    """
    row_ids = test_features.iloc[:, 0]
    submission = pd.DataFrame(data=predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [5]:
def get_coefficients(model, model_type, data):
    """Return a per-feature weight/importance table for a fitted model.

    Parameters
    ----------
    model : fitted estimator; ``coef_`` is read for ``'lr'`` and
        ``feature_importances_`` for ``'cb'``.
    model_type : ``'lr'`` or ``'cb'``.
    data : frame whose ``columns`` supply the feature names.

    Returns
    -------
    For ``'lr'``: a frame indexed by feature name with one ``coef`` column,
    ordered by descending absolute coefficient.
    For ``'cb'``: a frame with ``feature_name`` and ``importance`` columns,
    ordered by descending importance.
    For any other ``model_type``: ``None``.
    """
    if model_type == 'lr':
        weights = pd.DataFrame(model.coef_.T, index=data.columns, columns=["coef"])
        # Sort on a temporary magnitude column, then discard it.
        return (
            weights
            .assign(**{"abs": weights["coef"].abs()})
            .sort_values(by="abs", ascending=False)
            .drop(columns=["abs"])
        )

    if model_type == 'cb':
        importances = pd.DataFrame(
            {'feature_name': data.columns, 'importance': model.feature_importances_}
        )
        return importances.sort_values(by=['importance'], ascending=False)

    return None

In [6]:
def model_encoder(code):
    """Translate a short model code into its human-readable description.

    Returns ``None`` when ``code`` is not one of ``'lr'``, ``'rf'``, ``'cb'``.
    """
    descriptions = (
        ('lr', 'logistic regression(sklearn)'),
        ('rf', 'random forest(sklearn)'),
        ('cb', 'gradient boosting(Catboost)'),
    )
    for known_code, description in descriptions:
        if code == known_code:
            return description
    return None
    
    
def train_model(X, y, model_type='lr', params=None, test_size=0.3, cv=None, model=None, gs=False, gs_params=None, random_state=1):
    """Fit a model on a hold-out split, cross-validate it, and report scores.

    Parameters
    ----------
    X, y : features and labels.
    model_type : ``'lr'`` (LogisticRegression, liblinear solver),
        ``'rf'`` (RandomForestClassifier) or ``'cb'`` (CatBoostClassifier).
    params : optional dict of extra keyword arguments for the estimator
        constructor. ``None`` means no extra arguments.
    test_size : hold-out fraction passed to ``train_test_split``.
    cv : cross-validation splitter; defaults to a shuffled 5-fold
        ``StratifiedKFold`` seeded with ``random_state``.
    model, gs, gs_params : accepted for backward compatibility; currently
        unused (a fresh estimator is always built, no grid search is run).
    random_state : seed for the split, the default CV splitter and the model.

    Returns
    -------
    tuple ``(holdout_score, cv_score, coefs)`` where ``holdout_score`` is the
    dict from ``complex_holdout_score``, ``cv_score`` is the array returned by
    ``cross_val_score`` (ROC AUC per fold) and ``coefs`` is whatever
    ``get_coefficients`` returns for this model type.

    Raises
    ------
    ValueError
        If ``model_type`` is not supported. Previously such a call silently
        returned ``(None, None, 0)`` without training anything.
    """
    # Avoid a shared mutable default; None means "no extra arguments".
    params = {} if params is None else params

    # One constructor per supported code; built lazily so only the requested
    # estimator is instantiated.
    estimator_factories = {
        'lr': lambda: LogisticRegression(random_state=random_state, solver='liblinear', **params),
        'rf': lambda: RandomForestClassifier(random_state=random_state, **params),
        'cb': lambda: CatBoostClassifier(random_seed=random_state, silent=True, **params),
    }
    if model_type not in estimator_factories:
        raise ValueError(
            'Unsupported model_type {!r}; expected one of {}'.format(
                model_type, sorted(estimator_factories)
            )
        )

    print('Selected model type: {}'.format(model_encoder(model_type)))

    if cv is None:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    # Local names deliberately differ from the notebook-level X_train/y_train.
    X_fit, X_valid, y_fit, y_valid = train_test_split(X, y, test_size=test_size, random_state=random_state)

    estimator = estimator_factories[model_type]()
    estimator.fit(X_fit, y_fit)
    prediction = estimator.predict_proba(X_valid)[:, 1]
    holdout_score = complex_holdout_score(y_valid, prediction)

    # Cross-validation runs on the full X, y (not just the fit split).
    cv_score = cross_val_score(estimator, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
    complex_cv_score(cv_score)

    coefs = get_coefficients(estimator, model_type, X_fit)
    return holdout_score, cv_score, coefs

In [7]:
def add_new_features(df_features, matches_file):
    """Add per-match tower-kill counts to ``df_features`` in place.

    For every match yielded by ``read_matches(matches_file)``, objectives of
    type ``'CHAT_MESSAGE_TOWER_KILL'`` are counted separately for team 2
    (stored as radiant) and team 3 (stored as dire).

    Three columns are written to ``df_features``: ``radiant_tower_kills``,
    ``dire_tower_kills`` and ``diff_tower_kills`` (radiant minus dire).
    Nothing is returned.

    NOTE(review): counts are assigned positionally, so the matches in the
    file are assumed to be in the same order as the rows of ``df_features``
    — confirm against the data.
    """
    radiant_counts = []
    dire_counts = []
    for match in read_matches(matches_file):
        tower_kill_teams = [
            objective['team']
            for objective in match['objectives']
            if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL'
        ]
        radiant_counts.append(sum(team == 2 for team in tower_kill_teams))
        dire_counts.append(sum(team == 3 for team in tower_kill_teams))

    df_features['radiant_tower_kills'] = radiant_counts
    df_features['dire_tower_kills'] = dire_counts
    df_features['diff_tower_kills'] = (
        df_features['radiant_tower_kills'] - df_features['dire_tower_kills']
    )

In [8]:
# Add the tower-kill columns to both design matrices (modified in place).
add_new_features(df_features=X_train, matches_file='train_matches.jsonl')
add_new_features(df_features=X_test, matches_file='test_matches.jsonl')

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [9]:
# Team gold totals for the training matches and the radiant/dire ratio.
# NOTE(review): fillna(0) only covers the 0/0 case; a zero dire total with a
# non-zero radiant total produces inf, which is left as-is — confirm whether
# that can occur in the data.
r_columns = [f'r{slot}_gold' for slot in range(1, 6)]
d_columns = [f'd{slot}_gold' for slot in range(1, 6)]

gold_df = (
    pd.DataFrame({
        'r_total_gold': train_features[r_columns].sum(axis=1),
        'd_total_gold': train_features[d_columns].sum(axis=1),
    })
    .assign(total_gold_ratio=lambda totals: totals['r_total_gold'] / totals['d_total_gold'])
    .fillna(0)
)

In [10]:
# Attach the gold ratio to the training matrix.
# gold_df is reused later for the test set, so make sure the frame currently
# bound to that name really belongs to X_train; a mismatched index would make
# concat silently pad both frames with NaN rows.
assert gold_df.index.equals(X_train.index), 'gold_df is not aligned with X_train; re-run the training gold cell'
# Dropping any existing copy of the column keeps this cell idempotent:
# re-running it no longer appends a duplicate 'total_gold_ratio' column.
X_train = pd.concat(
    [
        X_train.drop(columns=['total_gold_ratio'], errors='ignore'),
        gold_df.loc[:, ['total_gold_ratio']],
    ],
    axis=1,
)

In [11]:
# Team gold totals for the test matches and the radiant/dire ratio.
# NOTE(review): fillna(0) only covers the 0/0 case; a zero dire total with a
# non-zero radiant total produces inf, which is left as-is — confirm whether
# that can occur in the data.
r_columns = [f'r{slot}_gold' for slot in range(1, 6)]
d_columns = [f'd{slot}_gold' for slot in range(1, 6)]

gold_df = (
    pd.DataFrame({
        'r_total_gold': test_features[r_columns].sum(axis=1),
        'd_total_gold': test_features[d_columns].sum(axis=1),
    })
    .assign(total_gold_ratio=lambda totals: totals['r_total_gold'] / totals['d_total_gold'])
    .fillna(0)
)

In [12]:
# Attach the gold ratio to the test matrix.
# gold_df is also used for the training set, so make sure the frame currently
# bound to that name really belongs to X_test; a mismatched index would make
# concat silently pad both frames with NaN rows.
assert gold_df.index.equals(X_test.index), 'gold_df is not aligned with X_test; re-run the test gold cell'
# Dropping any existing copy of the column keeps this cell idempotent:
# re-running it no longer appends a duplicate 'total_gold_ratio' column.
X_test = pd.concat(
    [
        X_test.drop(columns=['total_gold_ratio'], errors='ignore'),
        gold_df.loc[:, ['total_gold_ratio']],
    ],
    axis=1,
)

In [13]:
# Team experience totals for the training matches and the radiant/dire ratio.
# NOTE(review): fillna(0) only covers the 0/0 case; a zero dire total with a
# non-zero radiant total produces inf, which is left as-is — confirm whether
# that can occur in the data.
r_columns = [f'r{slot}_xp' for slot in range(1, 6)]
d_columns = [f'd{slot}_xp' for slot in range(1, 6)]

xp_df = (
    pd.DataFrame({
        'r_total_xp': train_features[r_columns].sum(axis=1),
        'd_total_xp': train_features[d_columns].sum(axis=1),
    })
    .assign(total_xp_ratio=lambda totals: totals['r_total_xp'] / totals['d_total_xp'])
    .fillna(0)
)

In [14]:
# Attach the experience ratio to the training matrix.
# xp_df is reused later for the test set, so make sure the frame currently
# bound to that name really belongs to X_train; a mismatched index would make
# concat silently pad both frames with NaN rows.
assert xp_df.index.equals(X_train.index), 'xp_df is not aligned with X_train; re-run the training xp cell'
# Dropping any existing copy of the column keeps this cell idempotent:
# re-running it no longer appends a duplicate 'total_xp_ratio' column.
X_train = pd.concat(
    [
        X_train.drop(columns=['total_xp_ratio'], errors='ignore'),
        xp_df.loc[:, ['total_xp_ratio']],
    ],
    axis=1,
)

In [15]:
# Team experience totals for the test matches and the radiant/dire ratio.
# NOTE(review): fillna(0) only covers the 0/0 case; a zero dire total with a
# non-zero radiant total produces inf, which is left as-is — confirm whether
# that can occur in the data.
r_columns = [f'r{slot}_xp' for slot in range(1, 6)]
d_columns = [f'd{slot}_xp' for slot in range(1, 6)]

xp_df = (
    pd.DataFrame({
        'r_total_xp': test_features[r_columns].sum(axis=1),
        'd_total_xp': test_features[d_columns].sum(axis=1),
    })
    .assign(total_xp_ratio=lambda totals: totals['r_total_xp'] / totals['d_total_xp'])
    .fillna(0)
)

In [16]:
# Attach the experience ratio to the test matrix.
# xp_df is also used for the training set, so make sure the frame currently
# bound to that name really belongs to X_test; a mismatched index would make
# concat silently pad both frames with NaN rows.
assert xp_df.index.equals(X_test.index), 'xp_df is not aligned with X_test; re-run the test xp cell'
# Dropping any existing copy of the column keeps this cell idempotent:
# re-running it no longer appends a duplicate 'total_xp_ratio' column.
X_test = pd.concat(
    [
        X_test.drop(columns=['total_xp_ratio'], errors='ignore'),
        xp_df.loc[:, ['total_xp_ratio']],
    ],
    axis=1,
)

In [17]:
%%time
# Extra CatBoost constructor arguments: the columns at positions 1 and 2 of
# the feature matrix are declared categorical.
# NOTE(review): positional indices are fragile if the column order changes —
# confirm which columns these refer to.
cat_params = dict(cat_features=[1, 2])

# Hold-out + cross-validated evaluation of CatBoost on the training data.
cb_holdout_score, cv_score, cb_coefs = train_model(
    X=X_train,
    y=y_train,
    model_type='cb',
    params=cat_params,
)
# Show the 30 highest-ranked features.
cb_coefs.head(30)

Selected model type: gradient boosting(Catboost)
Accuracy score(holdout):  0.7279677392254054
ROC AUC score(holdout):  0.8188527025218526
ROC AUC mean score(cv):  0.8240870216069741
ROC AUC score std(cv):  0.0019844209121544195
CPU times: user 3min 15s, sys: 8.64 s, total: 3min 23s
Wall time: 2min 32s


Unnamed: 0,feature_name,importance
248,total_gold_ratio,12.376007
249,total_xp_ratio,6.476346
247,diff_tower_kills,2.574544
149,d2_hero_id,1.231482
246,dire_tower_kills,1.166449
101,r5_hero_id,1.096733
183,d3_max_mana,1.081994
77,r4_hero_id,1.064837
53,r3_hero_id,1.05183
173,d3_hero_id,1.02384


In [18]:
# The feature cells above build X_train and X_test separately, and cat_features
# is given by position, so verify both matrices have identical columns in the
# same order before fitting.
assert list(X_train.columns) == list(X_test.columns), 'X_train and X_test columns differ; re-run the feature cells from a fresh kernel'

# Refit CatBoost on the full training set and score the test set with the
# probability of the positive class (second column of predict_proba).
model = CatBoostClassifier(random_seed=1, silent=True, cat_features=[1, 2])
model.fit(X_train, y_train)
prediction = model.predict_proba(X_test)[:, 1]

In [19]:
# Persist the test-set probabilities as the submission file.
write_to_submission_file(predicted_labels=prediction, out_file='./first.csv')