In [None]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation and convergence warnings.
warnings.filterwarnings("ignore")

In [3]:
def explode(df, lst_cols, fill_value='', preserve_index=False):
    """Unnest list-valued columns into one row per list element.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Every column named in `lst_cols` must hold list-likes, and
        within a row all of those lists must have the same length (the length
        of the first listed column drives the row repetition).
    lst_cols : str or list-like of str
        Column(s) to unnest. A single name is wrapped into a list.
    fill_value : scalar, default ''
        Value written into the unnested columns for rows whose list is empty.
    preserve_index : bool, default False
        If True the original index labels are kept (repeated once per element);
        otherwise the result gets a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        The non-list columns (in sorted order, as produced by
        `Index.difference`) followed by the unnested columns.
    """
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # list length per row, taken from the first list column
    lens = df[lst_cols[0]].str.len()
    # original index labels, repeated once per list element
    idx = np.repeat(df.index.values, lens)
    non_empty = lens > 0

    def _flatten(col):
        # np.concatenate raises on an empty sequence, so guard the case where
        # no row has any element to contribute.
        chunks = df.loc[non_empty, col].values
        if len(chunks) == 0:
            return np.array([], dtype=object)
        return np.concatenate(chunks)

    # create the "exploded" frame
    res = (pd.DataFrame({col: np.repeat(df[col].values, lens) for col in idx_cols},
                        index=idx)
             .assign(**{col: _flatten(col) for col in lst_cols}))
    # re-attach rows whose list was empty; their list columns take `fill_value`
    if (lens == 0).any():
        # DataFrame.append was removed in pandas 2.0 -- pd.concat is the replacement
        res = (pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False)
                 .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)
    return res

In [4]:
# Load the tournament table (one row per tournament) from CSV.
data = pd.read_csv('bs1_tourn.csv')

In [5]:
# Preview the first rows of the tournament table.
data.head()

Unnamed: 0,tournament_key,tournament_name,date,questionQty
0,4628,Семь сорок,2020-12-30T16:00:00+03:00,"{'1': 12, '2': 12, '3': 12}"
1,4772,Синхрон северных стран. Зимний выпуск,2019-01-05T19:00:00+03:00,"{'1': 12, '2': 12, '3': 12}"
2,4957,Синхрон Биркиркары,2020-02-21T00:00:00+03:00,"{'1': 13, '2': 13, '3': 13}"
3,4973,Балтийский Берег. 3 игра,2019-01-25T19:05:00+03:00,"{'1': 12, '2': 12, '3': 12}"
4,4974,Балтийский Берег. 4 игра,2019-03-01T19:05:00+03:00,"{'1': 12, '2': 12, '3': 12}"


In [6]:
# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder().fit(data['tournament_key'].to_frame())

In [8]:
# Load the raw results object from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
with open('results.pkl', 'rb') as f:
    results = pickle.load(f)

In [9]:
def is_good(tournament):
    """Return True when every team entry has a non-empty roster and a mask.

    `tournament` is an iterable of team dicts. A team is rejected when its
    'teamMembers' value equals [] or when its 'mask' key is missing or None.
    Every entry is inspected (no early exit), matching the original loop.
    """
    rejected = [
        (entry['teamMembers'] == []) or (entry.get('mask') is None)
        for entry in tournament
    ]
    return not any(rejected)

In [None]:
# Keep only the tournaments in which every team entry passes is_good.
results_corrected = {
    tourn_key: team_entries
    for tourn_key, team_entries in results.items()
    if is_good(team_entries)
}

In [None]:
# Restrict the results to tournaments that appear in the tournament table.
# The key set is built once; the original re-ran data.tournament_key.unique()
# (and a linear array scan) for every key in the comprehension.
known_tournament_keys = set(data.tournament_key.unique().tolist())
results2 = {
    tourn_key: team_entries
    for tourn_key, team_entries in results_corrected.items()
    if tourn_key in known_tournament_keys
}

In [7]:
# Flatten tournament -> team -> player into one record per roster entry.
# The records are collected first and the frame is built once: growing a
# DataFrame row by row is quadratic, and DataFrame.append no longer exists
# in pandas >= 2.0.
tp_records = []
for key in tqdm(results2.keys()):
    for item in results2[key]:
        team_id = item['team']['id']
        for subitem in item['teamMembers']:
            tp_records.append({
                'tournament_key': key,
                'team_id': team_id,
                'player_id': subitem['player']['id'],
            })

# `columns=` keeps the three expected columns even when there are no records.
df_tp = pd.DataFrame(tp_records, columns=['tournament_key', 'team_id', 'player_id'])

In [70]:
# Write the tournament/team/player table to CSV (with header, without the index).
df_tp.to_csv('bs2_players.csv', header=True, index=False)

In [8]:
# Read the tournament/team/player table back from the CSV file.
df_tp = pd.read_csv('bs2_players.csv')

In [9]:
# Preview the first rows of the tournament/team/player table.
df_tp.head()

Unnamed: 0,tournament_key,team_id,player_id
0,4772,45556,6212
1,4772,45556,18332
2,4772,45556,18036
3,4772,45556,22799
4,4772,45556,15456


In [10]:
# Load per-team tournament results. `mask` is read explicitly as text so that
# CSV type inference can never turn an all-digit mask into a number (which
# would drop leading zeros or switch to float notation).
df = pd.read_csv('bs1_data.csv', dtype={'mask': str})
# One int per mask character; 'X' and '?' are counted as 0 (not answered).
df['mask'] = df['mask'].apply(lambda x: list(map(int, str(x).replace('X', '0').replace('?', '0'))))
df = df[['tournament_key', 'team_id', 'mask', 'questionsTotal', 'position']]
df.head()

Unnamed: 0,tournament_key,team_id,mask,questionsTotal,position
0,4772,45556,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",28,1.0
1,4772,1030,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...",25,5.5
2,4772,4252,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...",25,5.5
3,4772,5444,"[1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",25,5.5
4,4772,40931,"[1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, ...",25,5.5


In [11]:
df_w = pd.read_csv('bs1_weights.csv')

# Tournaments without stored weights get the one-element placeholder '[0]'.
df_w['weights_arr'] = df_w['weights_arr'].fillna('[0]')
# Drop newlines and the surrounding brackets, then split on whitespace:
# each cell becomes a list of numeric strings.
df_w['weights_arr'] = df_w['weights_arr'].apply(lambda x: x.replace('\n', '')[1:-1].split())

# Number of weights per tournament. `.str.len()` works element-wise on list
# cells and replaces the original row-by-row slicing loop.
df_w['len'] = df_w['weights_arr'].str.len()

df_w['tournament_key'] = df_w['tournament_key'].astype(int)

df_w.head()

Unnamed: 0,tournament_key,weights_arr,len
0,4628,[0],1
1,4772,"[0.88311688, 0.77922078, 0.44588745, 0.5194805...",36
2,4957,"[0.2826087, 0.11956522, 0.85869565, 0.57608696...",39
3,4973,"[0.56183057, 0.80331061, 0.77312561, 0.5394352...",36
4,4974,"[0.3164557, 0.48880234, 0.55890944, 0.57935735...",36


In [12]:
# One row per (team, question) after exploding the masks; the mean of the 0/1
# outcomes per tournament is the share of correctly answered questions.
df_w_tourn = (
    explode(df, ['mask'], fill_value='')
    .groupby(['tournament_key'])['mask']
    .mean()
    .reset_index()
)

In [13]:
# Rename the aggregated mask column to avg_tournament.
df_w_tourn.columns = ['tournament_key', 'avg_tournament']

In [14]:
# Left-join the per-tournament average onto the weights table.
# No `on=` is given, so pandas joins on the columns the two frames have in common.
df_w = df_w.merge(df_w_tourn, how='left')

In [15]:
# Preview the weights table with the joined avg_tournament column.
df_w.head()

Unnamed: 0,tournament_key,weights_arr,len,avg_tournament
0,4628,[0],1,
1,4772,"[0.88311688, 0.77922078, 0.44588745, 0.5194805...",36,0.46633
2,4957,"[0.2826087, 0.11956522, 0.85869565, 0.57608696...",39,0.330546
3,4973,"[0.56183057, 0.80331061, 0.77312561, 0.5394352...",36,0.468409
4,4974,"[0.3164557, 0.48880234, 0.55890944, 0.57935735...",36,0.462323


Как часто мигрируют игроки?

In [82]:
# Number of distinct teams each player has appeared in, most mobile players first.
migr = (
    df_tp[['team_id', 'player_id']]
    .drop_duplicates()
    .groupby('player_id')['team_id']
    .count()
    .reset_index()
)
migr.sort_values(by='team_id', ascending=False)

Unnamed: 0,player_id,team_id
2455,16939,69
6636,54035,64
10473,96304,56
5887,42511,54
3315,23020,52
...,...,...
26023,178127,1
26024,178128,1
26025,178132,1
26026,178135,1


In [89]:
# (players seen in more than one team, players seen in more than three teams)
int((migr['team_id'] > 1).sum()), int((migr['team_id'] > 3).sum())

(13910, 4428)

In [50]:
# Spot check: first three roster rows for player_id 15.
df_tp[df_tp['player_id'] == 15].head(3)

Unnamed: 0,tournament_key,team_id,player_id
4540,4973,51584,15
57299,5128,51584,15
101090,5393,48051,15


In [91]:
# (rows, columns) of the tournament table.
data.shape

(1109, 4)

In [93]:
# Ten most common questionQty layouts; the printed number is how many
# tournaments those ten layouts cover together.
top_question_layouts = data['questionQty'].value_counts().head(10)
print(top_question_layouts.sum())
top_question_layouts

974


{'1': 12, '2': 12, '3': 12}                               598
{'1': 15, '2': 15, '3': 15}                                86
{'1': 13, '2': 13, '3': 13}                                54
{'1': 12, '2': 12, '3': 12, '4': 12, '5': 12, '6': 12}     41
{'1': 12, '2': 12, '3': 12, '4': 12}                       40
{'1': 15, '2': 15, '3': 15, '4': 15, '5': 15, '6': 15}     39
{'1': 18, '2': 18}                                         32
{'1': 15, '2': 15, '3': 15, '4': 15}                       30
{'1': 15, '2': 15, '3': 15, '4': 15, '5': 15}              29
{'1': 12, '2': 12}                                         25
Name: questionQty, dtype: int64

In [111]:
# df[df['team_id'] == 45556].merge(df_w, on='tournament_key')

In [126]:
from sklearn.linear_model import LinearRegression

In [127]:
# LogisticRegression(fit_intercept=False)

In [51]:
teams = df['team_id'].unique()
# teams = [45556,  1030,  4252]

# team_id -> slope of a no-intercept linear fit of the 0/1 answer mask on the
# per-question weight. The dict is initialised here: the original initialiser
# was commented out, so this cell raised NameError on a fresh kernel.
res = {}
for team_id in tqdm(teams):
    # attach the per-tournament weight vectors to this team's results
    df_sample = df[df['team_id'] == team_id].merge(df_w, on='tournament_key')

    # only tournaments with more than one stored weight are usable
    df_scored = df_sample[df_sample['len'] > 1]
    if df_scored.empty:
        continue

    # one row per (tournament, question), then add the tournament date
    df_team = explode(df_scored, ['mask', 'weights_arr'], fill_value='')
    df_team = df_team[['team_id', 'tournament_key', 'mask', 'weights_arr']]\
                    .merge(data[['tournament_key', 'date']], on='tournament_key')

    # train on everything dated before 2020; the dates are compared as strings
    train_team = df_team[df_team['date'] < '2020']
    if train_team.empty:
        continue

    lr = LinearRegression(fit_intercept=False)
    # the weights were parsed from text and are still strings -> cast explicitly
    lr.fit(train_team['weights_arr'].astype(float).values.reshape(-1, 1),
           train_team['mask'].values)
    res[team_id] = lr.coef_[0]

In [174]:
# Per-team coefficients as a two-column frame. It is built in one call
# instead of appending row by row: DataFrame.append was removed in
# pandas 2.0 and made the loop quadratic.
lr_ser = pd.DataFrame(list(res.items()), columns=['team_id', 'w1'])

In [175]:
# Number of distinct teams in the results table.
len(df.team_id.unique())

12099

In [176]:
# Preview the per-team coefficients.
lr_ser.head()

Unnamed: 0,team_id,w1
0,45556.0,1.459771
1,1030.0,1.268769
2,4252.0,1.294882
3,5444.0,1.137762
4,40931.0,1.35843


In [196]:
# Spot check: distinct teams listed for tournament 6456.
df_tp[df_tp['tournament_key'] == 6456][['tournament_key', 'team_id']].drop_duplicates()

Unnamed: 0,tournament_key,team_id
19270478,6456,71625
19270712,6456,27285
19270946,6456,55612
19271180,6456,68457
19271219,6456,43261
19271414,6456,69918
19271648,6456,63129


In [185]:
# Teams per tournament with their fitted coefficient attached (NaN when the
# team has no coefficient), joined onto the tournaments dated 2020 or later.
team_weights = (
    df_tp[['tournament_key', 'team_id']]
    .drop_duplicates()
    .merge(lr_ser, on='team_id', how='left')
)
ans = data[data['date'] >= '2020'].merge(team_weights, on='tournament_key', how='left')

In [215]:
# scipy.stats is imported here because the notebook's only scipy import sits
# in a much later cell, so on a top-to-bottom run this cell raised NameError.
import scipy.stats

# Spearman correlation between actual and predicted placings, one value per tournament.
res_fin = []
for key in tqdm(ans.tournament_key.unique()):
    # rank the tournament's teams by coefficient, best first (NaN sorts last)
    ans_sample = ans[ans['tournament_key'] == key].sort_values(by=['tournament_key', 'w1'], ascending=False)
    ans_sample['position_pred'] = np.arange(len(ans_sample))+1
    # inner join: only teams with a recorded actual position are compared
    ans_sample = ans_sample.merge(df[['tournament_key', 'team_id', 'position']], on=['tournament_key', 'team_id'])
    res_fin.append(scipy.stats.spearmanr(ans_sample['position'], ans_sample['position_pred'])[0])
    







  0%|          | 0/422 [00:00<?, ?it/s][A[A[A[A[A[A





  0%|          | 2/422 [00:00<00:21, 19.14it/s][A[A[A[A[A[A





  1%|▏         | 6/422 [00:00<00:18, 22.11it/s][A[A[A[A[A[A





  2%|▏         | 10/422 [00:00<00:16, 25.08it/s][A[A[A[A[A[A





  3%|▎         | 14/422 [00:00<00:14, 27.23it/s][A[A[A[A[A[A





  4%|▍         | 17/422 [00:00<00:14, 27.60it/s][A[A[A[A[A[A





  5%|▍         | 21/422 [00:00<00:14, 28.53it/s][A[A[A[A[A[A





  6%|▌         | 25/422 [00:00<00:13, 30.25it/s][A[A[A[A[A[A





  7%|▋         | 29/422 [00:00<00:12, 32.10it/s][A[A[A[A[A[A





  8%|▊         | 33/422 [00:01<00:12, 32.36it/s][A[A[A[A[A[A





  9%|▉         | 37/422 [00:01<00:12, 31.47it/s][A[A[A[A[A[A





 10%|▉         | 41/422 [00:01<00:11, 31.84it/s][A[A[A[A[A[A





 11%|█         | 45/422 [00:01<00:11, 32.69it/s][A[A[A[A[A[A





 12%|█▏        | 49/422 [00:01<00:11, 33.65it/s][A[A[A[A[A[A




In [210]:
# ans_sample = ans[ans['tournament_key'] == 6456].sort_values(by=['tournament_key', 'w1'], ascending=False).head(15)
# ans_sample['position_pred'] = np.arange(len(ans_sample))+1
# ans_sample = ans_sample.merge(df[['tournament_key', 'team_id', 'position']], on=['tournament_key', 'team_id'])
# spearmanr(ans_sample['position'], ans_sample['position_pred'])

In [219]:
# Number of per-tournament correlation values collected (NaN included).
len(res_fin)

422

In [221]:
# Discard tournaments whose correlation came out as NaN, then count the rest.
res_fin = [corr for corr in res_fin if not np.isnan(corr)]
len(res_fin)

163

In [222]:
# Mean of the remaining per-tournament correlations.
np.mean(res_fin)

0.6505723868480768

In [225]:
# scipy.stats is imported here because the notebook's only scipy import sits
# in a much later cell, so on a top-to-bottom run this cell raised NameError.
import scipy.stats

# Kendall tau between actual and predicted placings, one value per tournament.
res_fin = []
for key in tqdm(ans.tournament_key.unique()):
    # rank the tournament's teams by coefficient, best first (NaN sorts last)
    ans_sample = ans[ans['tournament_key'] == key].sort_values(by=['tournament_key', 'w1'], ascending=False)
    ans_sample['position_pred'] = np.arange(len(ans_sample))+1
    # inner join: only teams with a recorded actual position are compared
    ans_sample = ans_sample.merge(df[['tournament_key', 'team_id', 'position']], on=['tournament_key', 'team_id'])
    res_fin.append(scipy.stats.kendalltau(ans_sample['position'], ans_sample['position_pred'])[0])







  0%|          | 0/422 [00:00<?, ?it/s][A[A[A[A[A[A





  1%|          | 3/422 [00:00<00:14, 28.65it/s][A[A[A[A[A[A





  2%|▏         | 7/422 [00:00<00:13, 30.56it/s][A[A[A[A[A[A





  3%|▎         | 11/422 [00:00<00:12, 32.54it/s][A[A[A[A[A[A





  4%|▎         | 15/422 [00:00<00:11, 34.11it/s][A[A[A[A[A[A





  5%|▍         | 19/422 [00:00<00:11, 35.44it/s][A[A[A[A[A[A





  5%|▌         | 23/422 [00:00<00:11, 33.75it/s][A[A[A[A[A[A





  6%|▋         | 27/422 [00:00<00:11, 33.89it/s][A[A[A[A[A[A





  7%|▋         | 31/422 [00:00<00:11, 34.70it/s][A[A[A[A[A[A





  8%|▊         | 35/422 [00:01<00:13, 29.36it/s][A[A[A[A[A[A





  9%|▉         | 38/422 [00:01<00:13, 27.62it/s][A[A[A[A[A[A





 10%|▉         | 41/422 [00:01<00:13, 27.89it/s][A[A[A[A[A[A





 11%|█         | 45/422 [00:01<00:13, 28.96it/s][A[A[A[A[A[A





 11%|█▏        | 48/422 [00:01<00:14, 25.77it/s][A[A[A[A[A[A




In [226]:
# Discard tournaments whose correlation came out as NaN, then count the rest.
res_fin = [corr for corr in res_fin if not np.isnan(corr)]
len(res_fin)

163

In [227]:
# Mean of the remaining per-tournament correlations.
np.mean(res_fin)

0.51081376028116

In [200]:
# Preview the actual placings used as ground truth in the correlation loops.
df[['tournament_key', 'team_id', 'position']].head()

Unnamed: 0,tournament_key,team_id,position
0,4772,45556,1.0
1,4772,1030,5.5
2,4772,4252,5.5
3,4772,5444,5.5
4,4772,40931,5.5


Ряд турниров пропал, поэтому попробуем представить команды или игроков через OHE и обучить на всей выборке

In [16]:
# Team results joined with tournament weights, keeping only tournaments that
# have more than one stored weight; the last line shows the row count.
df_fin = (
    df.merge(df_w, on='tournament_key')
      .loc[lambda merged: merged['len'] > 1]
)
len(df_fin)

91128

In [17]:
# Keep the modelling columns and attach each tournament's date.
modelling_columns = ['tournament_key', 'team_id', 'mask', 'weights_arr', 'avg_tournament']
tournament_dates = data[['tournament_key', 'date']]
df_fin = df_fin[modelling_columns].merge(tournament_dates, on='tournament_key', how='inner')

In [18]:
# Unnest mask and weights_arr together: one row per (tournament, team, question).
df_fin = explode(df_fin, ['mask', 'weights_arr'], fill_value='')

In [19]:
# Attach the roster: every question row is repeated once per player of the team.
df_fin = df_fin.merge(df_tp, on=['tournament_key', 'team_id'])

In [20]:
# Preview the player-level modelling table.
df_fin.head()

Unnamed: 0,avg_tournament,date,team_id,tournament_key,mask,weights_arr,player_id
0,0.46633,2019-01-05T19:00:00+03:00,45556,4772,1,0.88311688,6212
1,0.46633,2019-01-05T19:00:00+03:00,45556,4772,1,0.88311688,18332
2,0.46633,2019-01-05T19:00:00+03:00,45556,4772,1,0.88311688,18036
3,0.46633,2019-01-05T19:00:00+03:00,45556,4772,1,0.88311688,22799
4,0.46633,2019-01-05T19:00:00+03:00,45556,4772,1,0.88311688,15456


In [22]:
# Write the player-level modelling table to CSV (with header, without the index).
df_fin.to_csv('bs2_final.csv', header=True, index=False)

In [22]:
# Preview the columns used for the player-level model.
df_fin[['player_id', 'date', 'avg_tournament', 'weights_arr', 'mask']].head()

Unnamed: 0,player_id,date,avg_tournament,weights_arr,mask
0,6212,2019-01-05T19:00:00+03:00,0.46633,0.88311688,1
1,18332,2019-01-05T19:00:00+03:00,0.46633,0.88311688,1
2,18036,2019-01-05T19:00:00+03:00,0.46633,0.88311688,1
3,22799,2019-01-05T19:00:00+03:00,0.46633,0.88311688,1
4,15456,2019-01-05T19:00:00+03:00,0.46633,0.88311688,1


In [23]:
# Training rows: everything dated before 2020 (the dates are compared as strings).
train = df_fin[df_fin['date'] < '2020']

In [24]:
# Preview the training rows.
train.head()

Unnamed: 0,avg_tournament,date,team_id,tournament_key,mask,weights_arr,player_id
0,0.46633,2019-01-05T19:00:00+03:00,45556,4772,1,0.88311688,6212
1,0.46633,2019-01-05T19:00:00+03:00,45556,4772,1,0.88311688,18332
2,0.46633,2019-01-05T19:00:00+03:00,45556,4772,1,0.88311688,18036
3,0.46633,2019-01-05T19:00:00+03:00,45556,4772,1,0.88311688,22799
4,0.46633,2019-01-05T19:00:00+03:00,45556,4772,1,0.88311688,15456


In [25]:
from sklearn.preprocessing import OneHotEncoder
# Fit a one-hot encoder on the player ids that occur in the training rows.
encoder = OneHotEncoder().fit(train['player_id'].to_frame())

In [29]:
from scipy.sparse import csr_matrix, hstack

In [30]:
# `train` is a filtered slice of df_fin, so assigning a column into it writes
# to a possible copy (pandas' SettingWithCopy situation, hidden here because
# warnings are silenced). Rebinding through assign() yields a fresh frame and
# makes the cell safe to re-run.
train = train.assign(weights_arr=train['weights_arr'].astype(float))

In [31]:
# Check the dtypes of the two dense feature columns.
train[['avg_tournament', 'weights_arr']].dtypes

avg_tournament    float64
weights_arr       float64
dtype: object

In [32]:
# Design matrix: the two dense numeric columns first, then the one-hot player columns.
dense_features = csr_matrix(train[['avg_tournament', 'weights_arr']].values)
player_features = encoder.transform(train['player_id'].to_frame())
X_train = hstack([dense_features, player_features])

# Target: the 0/1 outcome for the question.
y_train = train['mask']

In [33]:
from sklearn.linear_model import LogisticRegression

In [36]:
# Fit a logistic regression with scikit-learn's default settings on the sparse design matrix.
# NOTE(review): warnings are silenced at the top of the notebook, so a convergence
# warning would not be visible here -- confirm that the solver converged.
lr = LogisticRegression().fit(X_train, y_train)

In [47]:
df_rates = pd.DataFrame()
# One label per one-hot column, in the 'x0_<category>' form that a later cell
# parses. The labels are built from the fitted categories because
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2, and its
# replacement names the columns differently for DataFrame input.
df_rates['players'] = pd.Series(['x0_' + str(cat) for cat in encoder.categories_[0]])

In [48]:
# Shape of the fitted coefficient matrix.
lr.coef_.shape

(1, 53768)

In [49]:
# Skip the first two coefficients: they belong to the two dense columns
# (avg_tournament, weights_arr) stacked ahead of the one-hot block in X_train.
df_rates['rates'] = pd.Series(lr.coef_[0][2:])

In [63]:
# Recover the numeric player id from the one-hot column label. Splitting on
# the last underscore, instead of slicing off a fixed 3-character prefix, works
# for 'x0_<id>' and '<column>_<id>' labels alike; the astype(str) keeps the
# line safe to re-run after the column has already been converted to int.
df_rates['players'] = (
    df_rates['players']
    .astype(str)
    .str.rsplit('_', n=1)
    .str[-1]
    .astype(int)
)

In [68]:
# Rename the columns to player_id / rates.
df_rates.columns = ['player_id', 'rates']

In [69]:
# Preview the per-player coefficients.
df_rates.head()

Unnamed: 0,player_id,rates
0,15,-0.242241
1,16,0.830317
2,23,0.143789
3,31,0.72621
4,35,0.722905


In [66]:
# List the columns currently held by `ans`.
ans.columns

Index(['tournament_key', 'tournament_name', 'date', 'questionQty', 'team_id',
       'player_id'],
      dtype='object')

In [70]:
# Tournaments dated 2020 or later -> their rosters -> each player's coefficient.
# Both joins are left joins, so tournaments without a roster and players
# without a coefficient stay in the table with NaN.
tournaments_2020 = data[data['date'] >= '2020']
rosters = df_tp[['tournament_key', 'team_id', 'player_id']]
ans = (
    tournaments_2020
    .merge(rosters, on='tournament_key', how='left')
    .merge(df_rates, on='player_id', how='left')
)

ans.head()

Unnamed: 0,tournament_key,tournament_name,date,questionQty,team_id,player_id,rates
0,4628,Семь сорок,2020-12-30T16:00:00+03:00,"{'1': 12, '2': 12, '3': 12}",,,
1,4957,Синхрон Биркиркары,2020-02-21T00:00:00+03:00,"{'1': 13, '2': 13, '3': 13}",49804.0,30152.0,2.127926
2,4957,Синхрон Биркиркары,2020-02-21T00:00:00+03:00,"{'1': 13, '2': 13, '3': 13}",49804.0,30270.0,2.604733
3,4957,Синхрон Биркиркары,2020-02-21T00:00:00+03:00,"{'1': 13, '2': 13, '3': 13}",49804.0,27822.0,3.041757
4,4957,Синхрон Биркиркары,2020-02-21T00:00:00+03:00,"{'1': 13, '2': 13, '3': 13}",49804.0,28751.0,2.899224


In [72]:
# Team score per tournament: the sum of its players' coefficients.
ans_gr = (
    ans[['tournament_key', 'team_id', 'rates']]
    .groupby(['tournament_key', 'team_id'])['rates']
    .sum()
    .reset_index()
)
ans_gr.head()

Unnamed: 0,tournament_key,team_id,rates
0,4957,2.0,11.114558
1,4957,84.0,7.07484
2,4957,312.0,8.429892
3,4957,928.0,11.170735
4,4957,1799.0,4.783479


In [74]:
import scipy

In [75]:
# Spearman correlation between actual and predicted placings, one value per tournament.
res_fin = []
for key in tqdm(ans.tournament_key.unique()):
    # teams of this tournament ordered by summed player coefficient, best first
    ranked = ans_gr[ans_gr['tournament_key'] == key]
    ranked = ranked.sort_values(by=['tournament_key', 'rates'], ascending=False)
    ranked = ranked.assign(position_pred=np.arange(1, len(ranked) + 1))
    # inner join: only teams with a recorded actual position are compared
    ans_sample = ranked.merge(
        df[['tournament_key', 'team_id', 'position']],
        on=['tournament_key', 'team_id'],
    )
    rho = scipy.stats.spearmanr(ans_sample['position'], ans_sample['position_pred'])[0]
    res_fin.append(rho)

100%|██████████| 422/422 [00:09<00:00, 44.03it/s]


In [80]:
# res_fin

In [77]:
# Discard tournaments whose correlation came out as NaN, then count the rest.
res_fin = [corr for corr in res_fin if not np.isnan(corr)]
len(res_fin)

163

In [78]:
# Mean of the remaining per-tournament correlations.
np.mean(res_fin)

0.7366978492721348

In [79]:
# Kendall tau between actual and predicted placings, one value per tournament.
res_fin = []
for key in tqdm(ans.tournament_key.unique()):
    # teams of this tournament ordered by summed player coefficient, best first
    ranked = ans_gr[ans_gr['tournament_key'] == key]
    ranked = ranked.sort_values(by=['tournament_key', 'rates'], ascending=False)
    ranked = ranked.assign(position_pred=np.arange(1, len(ranked) + 1))
    # inner join: only teams with a recorded actual position are compared
    ans_sample = ranked.merge(
        df[['tournament_key', 'team_id', 'position']],
        on=['tournament_key', 'team_id'],
    )
    tau = scipy.stats.kendalltau(ans_sample['position'], ans_sample['position_pred'])[0]
    res_fin.append(tau)

100%|██████████| 422/422 [00:11<00:00, 37.70it/s]


In [81]:
# Discard tournaments whose correlation came out as NaN, then count the rest.
res_fin = [corr for corr in res_fin if not np.isnan(corr)]
len(res_fin)

163

In [82]:
# Mean of the remaining per-tournament correlations.
np.mean(res_fin)

0.5735228829637771