In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

### Loading DATA

In [None]:
data_train = pd.read_csv('../data/train_features.csv', index_col='match_id_hash')
# data_test = pd.read_csv('../data/test_features.csv', index_col='match_id_hash')
train_targets = pd.read_csv('../data/train_targets.csv', index_col='match_id_hash')
feature_afk = pd.read_csv('afk_feature.csv')
data_train_engineered = pd.read_csv('data_train_engineered.csv', index_col='match_id_hash')

### Features

In [None]:
# afk_features
data_train = data_train.join(feature_afk.set_index('match_id_hash'))
print(data_train.shape)

data_train['afk'] = data_train['afk'].fillna('0')

data_train.info()
data_train.head(3)
# [print(n) for n in list(train_targets.isna().sum()) if n != 0]  # Проверка на пропущенные данные


 ### Team stats 

In [5]:
import re

for col in list(data_train.columns):
    r_regexp = re.compile(r'r[0-9]_(.*)').findall(col)
    d_regexp = re.compile(r'd[0-9]_(.*)').findall(col)

    if len(r_regexp) > 0:
        data_train['r_' + r_regexp[0]] = 0
    if len(d_regexp) > 0:
        data_train['d_' + d_regexp[0]] = 0  

for col in list(data_train.columns):
    r_regexp = re.compile(r'r[0-9]_(.*)').findall(col)
    d_regexp = re.compile(r'd[0-9]_(.*)').findall(col)

    if len(r_regexp) > 0:
        data_train['r_' + r_regexp[0]] = data_train['r_' + r_regexp[0]] + data_train[re.compile(r'r[0-9]_.*').findall(col)[0]]
        data_train.drop(columns=[re.compile(r'r[0-9]_.*').findall(col)[0]], inplace=True)
        
    if len(d_regexp) > 0:
        data_train['d_' + d_regexp[0]] = data_train['d_' + d_regexp[0]] + data_train[re.compile(r'd[0-9]_.*').findall(col)[0]]
        data_train.drop(columns=[re.compile(r'd[0-9]_.*').findall(col)[0]], inplace=True)

data_train.drop(columns=['r_hero_id', 'd_hero_id', 'r_x', 'r_y', 'd_x', 'd_y'], inplace=True)

In [6]:
# Проверяем, есть ли NA
[print(n) for n in list(data_train.isna().sum()) if n != 0]

[]

In [7]:
# data_train.to_csv('data_train_engineered.csv')


In [19]:
len(list(data_train.columns))

48

## Features Engineering from raw data

In [2]:
import json
matches = []
with open('../data/train_matches.jsonl') as fin:
    # read the 18-th line
    for i in range(500):
        line = fin.readline()
        matches.append(json.loads(line))
    # read JSON into a Python object 
    

In [13]:
matches[0]['players'][i]['hero_inventory']

[{'id': 'item_tango', 'num_charges': 2, 'cooldown': 0},
 {'id': 'item_wraith_band', 'cooldown': 0},
 {'id': 'item_enchanted_mango', 'cooldown': 0},
 {'id': 'item_clarity', 'num_charges': 1, 'cooldown': 0},
 {'id': 'item_tpscroll', 'num_charges': 1, 'cooldown': 0}]

In [15]:
afk_players = pd.DataFrame()
for match in matches:
    for i, game in enumerate(match):
        if len(match['players'][i]['actions']) < 3:
            afk_players = afk_players.append(pd.DataFrame(data={'match_id_hash': match['match_id_hash'], 'afk': 1}, index=[i]))
        # else:
        #     afk_players = afk_players.append(pd.DataFrame(data={'afk_true': 0}, index=[match['match_id_hash']]))
            # print(f"player_slot {match['players'][i]['player_slot']} in match {match['match_id_hash']} was AFK")
afk_players

Unnamed: 0,match_id_hash,afk
0,6db558535151ea18ca70a6892197db41,1
5,6db558535151ea18ca70a6892197db41,1
5,7e9ee83a71001fec75bacbe4d61b2724,1
1,cb8782d6c6bc0398c228625c20617d04,1


In [10]:
afk_players = afk_players.groupby('match_id_hash').sum()
afk_players

Unnamed: 0_level_0,afk
match_id_hash,Unnamed: 1_level_1
6db558535151ea18ca70a6892197db41,2
7e9ee83a71001fec75bacbe4d61b2724,1
cb8782d6c6bc0398c228625c20617d04,1


In [94]:
%time
import os

try:
    import ujson as json
except ModuleNotFoundError:
    import json
    print ('Please install ujson to read JSON oblects faster')
    
try:
    from tqdm import tqdm_notebook
except ModuleNotFoundError:
    tqdm_notebook = lambda x: x
    print ('Please install tqdm to track progress with Python loops')

def read_matches(matches_file):
    
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)
    
    with open(matches_file) as fin:
        for line in tqdm_notebook(fin, total=total_matches):
            yield json.loads(line)
            
afk_players = pd.DataFrame()   
# processing each game
for match in read_matches('../data/train_matches.jsonl'):
    
   # processing each player
    for i, player in enumerate(match['players']):
        if len(match['players'][i]['actions']) < 5:
            afk_players = afk_players.append(pd.DataFrame(data={'match_id_hash': match['match_id_hash'], 'afk': 1}, index=[i]))

Wall time: 0 ns



HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))

In [95]:
afk_players = afk_players.groupby('match_id_hash').sum().reset_index()
print(afk_players.info())
afk_players.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 679 entries, 0 to 678
Data columns (total 2 columns):
match_id_hash    679 non-null object
afk              679 non-null int64
dtypes: int64(1), object(1)
memory usage: 10.7+ KB
None


Unnamed: 0,match_id_hash,afk
0,004f258cb0aec3c2612ec04ab4544d6a,1
1,0073eaa0a66ec205f8a22c0550454085,1
2,00e163a7f515c783325582b020232a7d,1
3,02368d70de1312c29776b51f539da917,1
4,02f1484e8e50a21c186431f8492bf6d3,1


In [110]:
afk_players.set_index('match_id_hash').to_csv('afk_feature.csv')

## Teaching models

In [90]:
X = data_train[data_train['game_time'] >= 600]#.drop(
    # ['d1_x', 'd1_y', 'd2_x', 'd2_y', 'd3_x', 'd3_y', 'd4_x', 'd4_y', 'd5_x','d5_y','r1_x', 'r1_y', 'r2_x', 'r2_y', 'r3_x', 'r3_y', 'r4_x', 'r4_y', 
    #  'r5_x', 'r5_y'], axis=1)

y = train_targets[train_targets['game_time'] >= 600]['radiant_win']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

In [91]:
# scaler = StandardScaler()
# 
# X_train_norm = scaler.fit_transform(X_train)
y_train = y_train.replace({False: 0, True: 1})

In [92]:
%%time
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=22, n_estimators=500)
rfc.fit(X=X_train, y=y_train)
y_pred = rfc.predict(X_test)
roc_auc_score(y_true=y_test, y_score=y_pred)

Wall time: 25.1 s


0.7584150217439245

In [93]:
%%time
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier()
knc.fit(X_train, y_train)
# knc.predict_proba(X_test)
roc_auc_score(y_test, knc.predict(X_test))

Wall time: 980 ms


0.7339611671748086

In [94]:
%%time
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
roc_auc_score(y_test, qda.predict(X_test))

Wall time: 93 ms


0.7399817298005074

In [95]:
%%time
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=22, solver='lbfgs', max_iter=2000)
lr.fit(X_train, y_train)
roc_auc_score(y_test, lr.predict(X_test))

Wall time: 3.24 s


0.7672925059098985

### Grid Search

In [111]:
best_scores_list = []

In [115]:
%%time
from sklearn.model_selection import GridSearchCV, StratifiedKFold
for i in range(3):
    i += 1
    # rfc = RandomForestClassifier(n_estimators = 50, n_jobs=-1, random_state=22, oob_score=True, class_weight='balanced')
    logistic_reg = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1500, n_jobs=-1)
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    # parameters = {'max_features': [5], 'min_samples_leaf': [10], 'max_depth': [37]}
    parameters = {'C': [0.75], 'tol': [0.25]}
    gsc = GridSearchCV(estimator=logistic_reg, cv=skf, param_grid=parameters, n_jobs=-1, scoring='roc_auc')
    gsc.fit(X_train, y_train)
    print(gsc.best_score_)#, gsc.best_params_)
    best_scores_list.append(gsc.best_score_)
    

0.853165647904147
0.8537117350876302
0.8534438231942715
Wall time: 1min 10s


In [114]:
best_scores_list

[]

# Logistic Regression
params:
{class_weight='balanced', random_state=22, solver='lbfgs', max_iter=1500, n_jobs=-1, {'C': 0.75, 'tol': 0.25}}

In [106]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(class_weight='balanced', random_state=22, solver='lbfgs', max_iter=1500, n_jobs=-1, C=0.75, tol=0.25)
lr.fit(X_train, y_train)
roc_auc_score(y_test, lr.predict(X_test))

0.7668417186964427

In [None]:
# pd.DataFrame(rfc.feature_importances_, X_train.columns.values, columns = ['Coef']).sort_values(by='Coef', ascending=False)
X_train