In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import json
import numpy as np
import pandas as pd

import scipy as sp
import scipy.stats as st
from scipy.special import expit as sigmoid
import scipy.stats as sts
from scipy import sparse
from scipy.stats import multivariate_normal

from sklearn.linear_model import LogisticRegression
from copy import copy
from IPython.display import Image
import pickle
from datetime import datetime, timedelta
import json
import gc
import re
from tqdm import tqdm
from collections import defaultdict, Counter


sns.set_style("whitegrid")
sns.set_palette("colorblind")
palette = sns.color_palette()
figsize = (15,8)
legend_fontsize = 16

from matplotlib import rc
rc('font',**{'family':'sans-serif'})
rc('figure', **{'dpi': 300})

# 1. Прочитайте и проанализируйте данные, выберите турниры, в которых есть данные о составах команд и повопросных результатах

In [2]:
results = pickle.load(open("chgk/results.pkl", "rb"))
tournaments_df = pd.DataFrame(pickle.load(open("chgk/tournaments.pkl", "rb"))).transpose()
tournaments_df.dateStart = pd.to_datetime(tournaments_df.dateStart)
players_df = pd.DataFrame(pickle.load(open("chgk/players.pkl", "rb"))).transpose()

In [3]:
tournaments_df.head()

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty
1,1,Чемпионат Южного Кавказа,2003-07-25 00:00:00+04:00,2003-07-27T00:00:00+04:00,"{'id': 2, 'name': 'Обычный'}",/seasons/1,[],,
2,2,Летние зори,2003-08-09 00:00:00+04:00,2003-08-09T00:00:00+04:00,"{'id': 2, 'name': 'Обычный'}",/seasons/1,[],,
3,3,Турнир в Ижевске,2003-11-22 00:00:00+03:00,2003-11-24T00:00:00+03:00,"{'id': 2, 'name': 'Обычный'}",/seasons/2,[],,
4,4,Чемпионат Украины. Переходной этап,2003-10-11 00:00:00+04:00,2003-10-12T00:00:00+04:00,"{'id': 2, 'name': 'Обычный'}",/seasons/2,[],,
5,5,Бостонское чаепитие,2003-10-10 00:00:00+04:00,2003-10-13T00:00:00+04:00,"{'id': 2, 'name': 'Обычный'}",/seasons/2,[],,


In [4]:
players_df.head()

Unnamed: 0,id,name,patronymic,surname
1,1,Алексей,,Абабилов
10,10,Игорь,,Абалов
11,11,Наталья,Юрьевна,Абалымова
12,12,Артур,Евгеньевич,Абальян
13,13,Эрик,Евгеньевич,Абальян


In [5]:
train_start_date = pd.to_datetime("2019/01/01 00:00:00", utc=True)
train_end_date = pd.to_datetime("2019/12/31 23:59:59", utc=True)
train_tournaments_index = tournaments_df[tournaments_df.dateStart.between(train_start_date, train_end_date)].id
train = []

for tour_id in train_tournaments_index:
    tournament = results.get(tour_id, -1)
    if tournament == -1:
        continue
        
    for team in tournament:
        mask = team.get("mask", "")
        if not mask or re.sub("[0-1]+", "", mask):
            continue
                  
        team_id = int(team["team"]["id"])
        
        for member in team["teamMembers"]:
            player_id = member['player']['id']
            for quest_idx, answ in enumerate([int(i) for i in mask]):
                train.append([tour_id, team_id, player_id, str(tour_id) + "_" + str(quest_idx), int(answ)])
    

df_train = pd.DataFrame(train)
df_train.columns = ['tournament_id', 'team_id', 'player_id', 'question_id', 'answer']


In [6]:
df_train.head()

Unnamed: 0,tournament_id,team_id,player_id,question_id,answer
0,4772,45556,6212,4772_0,1
1,4772,45556,6212,4772_1,1
2,4772,45556,6212,4772_2,1
3,4772,45556,6212,4772_3,1
4,4772,45556,6212,4772_4,1


In [7]:
test_start_date = pd.to_datetime("2020/01/01 00:00:00", utc=True)
test_end_date = pd.to_datetime("2020/12/31 23:59:59", utc=True)
test_tournaments_index = tournaments_df[tournaments_df.dateStart.between(test_start_date, test_end_date)].id
test = []
test_position = []
for tour_id in test_tournaments_index:
    tournament = results.get(tour_id, -1)
    if tournament == -1:
        continue
    
    for team in tournament:
        mask = team.get("mask", "")
        if not mask or re.sub("[0-1]+", "", mask):
            continue
        team_id = int(team["team"]["id"])
        test_position.append([tour_id, team_id, team["position"]])
        for member in team["teamMembers"]:
            player_id = member['player']['id']
            for quest_idx, answ in enumerate([int(i) for i in mask]):
                test.append([tour_id, team_id, player_id, str(tour_id) + "_" + str(quest_idx), int(answ)])
    

    

In [8]:
df_test_position = pd.DataFrame(test_position)
df_test_position.columns = ['tournament_id', 'team_id', 'position']

In [9]:
df_test_position.head()

Unnamed: 0,tournament_id,team_id,position
0,5414,66120,1.0
1,5414,312,2.5
2,5414,2,2.5
3,5414,72338,5.0
4,5414,4032,5.0


In [10]:
df_test = pd.DataFrame(test)
df_test.columns = ['tournament_id', 'team_id', 'player_id', 'question_id', 'answer']

In [11]:
df_test.head()

Unnamed: 0,tournament_id,team_id,player_id,question_id,answer
0,5414,66120,18490,5414_0,1
1,5414,66120,18490,5414_1,1
2,5414,66120,18490,5414_2,1
3,5414,66120,18490,5414_3,1
4,5414,66120,18490,5414_4,1


# Baseline model

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline

In [13]:
one_hot = ColumnTransformer(
    transformers=[
            ('OneHot', OneHotEncoder(), ['tournament_id', 'player_id'])
    ],
        remainder='drop',
        sparse_threshold=1
    )
pipe = Pipeline(
        verbose=True,
        steps=[
            ('one_hot', one_hot),
            ('logreg', LogisticRegression(solver='liblinear', max_iter=100))
        ]
    )
pipe.fit(df_train[['tournament_id', 'player_id']], df_train['answer'])

[Pipeline] ........... (step 1 of 2) Processing one_hot, total=   4.3s
[Pipeline] ............ (step 2 of 2) Processing logreg, total= 5.1min


Pipeline(steps=[('one_hot',
                 ColumnTransformer(sparse_threshold=1,
                                   transformers=[('OneHot', OneHotEncoder(),
                                                  ['tournament_id',
                                                   'player_id'])])),
                ('logreg', LogisticRegression(solver='liblinear'))],
         verbose=True)

In [14]:
player_features_start_pos = df_train.nunique()['tournament_id']
player_features_names = pipe['one_hot'].get_feature_names()[player_features_start_pos:]
player_ids = [int(name[11:]) for name in player_features_names]
    



In [15]:
player_weights = pipe['logreg'].coef_[0][player_features_start_pos:]

In [16]:
map_id_to_weight = dict(zip(player_ids, player_weights))

In [17]:
map_id_to_weight = dict(sorted(map_id_to_weight.items(), key=lambda item: item[1], reverse=True))

In [18]:
i = 0
top_100 = []
for player_id, score in map_id_to_weight.items():
    if i >= 100:
        break
    player = players_df[players_df.id == player_id]
    player_name = player.name.values[0] + " " + player.patronymic.values[0] + " " + player.surname.values[0]
    top_100.append([player_name, score])
    i += 1

In [19]:
df_top_100 = pd.DataFrame(top_100)
df_top_100.columns = ['player', 'score']

In [20]:
print(df_top_100.to_string())

                                player     score
0              Максим Михайлович Руссо  3.051347
1       Александра Владимировна Брутер  2.960198
2              Иван Николаевич Семушин  2.930721
3         Павел Константинович Щербина  2.921337
4        Михаил Владимирович Савченков  2.838481
5             Артём Сергеевич Сорожкин  2.816663
6            Сергей Леонидович Спешков  2.814304
7            Михаил Ильич Левандовский  2.737535
8            Сергей Игоревич Николенко  2.718894
9          Антон Владимирович Саксонов  2.717776
10          Ирина Сергеевна Прокофьева  2.701985
11   Станислав Григорьевич Мереминский  2.690272
12          Александр Витальевич Либер  2.667702
13         Наталья Евгеньевна Горелова  2.658509
14      Александр Владимирович Мосягин  2.653666
15              Илья Сергеевич Новиков  2.641685
16             Сусанна Марковна Бровер  2.631253
17          Алексей Владимирович Гилёв  2.625826
18              Игорь Викторович Мокин  2.625558
19              Мари

# 3 Качество рейтинг системы

In [21]:
from scipy import stats

In [22]:
def get_prediction(tournament, player_to_weight, orginal_position_dict):
    predict_weight_dict = defaultdict(lambda : 0)
    avg_weight = np.mean([v for v in player_to_weight.values()])
    for idx, team in enumerate(tournament):
        weight = 0
        for player_info in team['teamMembers']:
            p_id = player_info['player']['id']
            try:
                weight += player_to_weight[p_id]
            except:
                weight += avg_weight
        predict_weight_dict[team["team"]["id"]] = weight
    predict_weight_dict = dict(sorted(predict_weight_dict.items(), key=lambda item: item[1], reverse=True))
    predict_position_dict = dict(zip(predict_weight_dict.keys(), range(1, len(predict_weight_dict) + 1)))
    return [predict_position_dict[team_id] for team_id in orginal_position_dict.keys()]

In [23]:
kendalltau_corr = []
spearmanr_corr = []

for tour_id in df_test.tournament_id.unique():
    tournament = results.get(tour_id, -1)
    if tournament == -1:
        continue
    
    original_position = [team['position'] for team in tournament]
    teams = [team["team"]["id"] for team in tournament]
    orginal_position_dict = dict(zip(teams, original_position))
    prediction_position = get_prediction(tournament, map_id_to_weight, orginal_position_dict)


    kendalltau_corr.append(stats.kendalltau(original_position, prediction_position).correlation)
    spearmanr_corr.append(stats.spearmanr(original_position, prediction_position).correlation)



In [24]:
np.mean(kendalltau_corr), np.mean(spearmanr_corr)

(0.6240738447088694, 0.7783292841670338)

# 4 EM

## M step

In [25]:
one_hot = ColumnTransformer(
    transformers=[('OneHot', OneHotEncoder(), ['player_id', 'question_id'])],
    remainder='drop',
    sparse_threshold=1
)

pipe = Pipeline(
    verbose=True,
    steps=[
        ('one_hot', one_hot),
        ('logreg', LogisticRegression(solver='saga', max_iter=100))
    ]
)

def m_step(model, X, y):
    model.fit(X, y)
    return model, model.predict(X)

## E step

In [None]:
def e_step(df, preds):
    df['new_target'] = preds
    label_zero_idx = df['target'] == 0
    df.loc[label_zero_idx, 'new_target'] = 0
    # изменяем только метки для вопросов, на которые команда ответила
    # поскольку p(z_ij = 1 | team_ij = 0) = 0 в силу предположений
    label_one_idx = df['target'] == 1
    e_step_denom = df.loc[label_one_idx].groupby(['team_id', 'question_id'])['new_target']
    e_step_denom = e_step_denom.transform(lambda x : 1 - np.prod(1 - x.values))
    df.loc[label_one_idx, 'new_target'] = df.loc[label_one_idx, 'new_target'] / e_step_denom
    new_y = df['new_target'].fillna(0)
    return new_y



In [None]:
# z - вероятность ответить игроком на вопрос, команда ответила на вопрос если 20 процентов команды ответили правильно
# сначала вероятность ответа на вопрос каждого игрока равна 50 процентам

In [26]:
z = []
for player_id in df_train.player_id.unique():
    z.append([player_id, 0.5])
df_z = pd.DataFrame(z)
df_z.columns = ['player_id', 'prob_get_right_answ']

In [42]:
dict_teams_by_tour_train = {}
for tour_id in df_train.tournament_id:
    dict_teams_by_tour_train[tour_id] = df_train[df_train.tournament_id == tour_id].team_id.unique()

KeyboardInterrupt: 

In [41]:
new_preds = []
for tour_id in df_train.tournament_id:
    teams = dict_teams_by_tour_train[tour_id]
    for team_id in teams:
        team_members = df_train[df_train.team_id == team_id].player_id.unique()
        count_right_answ = 0
        for player_id in team_members:
            p = df_z[df_z.player_id == player_id].prob_get_right_answ.values[0]
            q = 1 - p
            count_right_answ += np.random.choice([0, 1], p=[p, q])
        if count_right_answ / len(team_members) >= 0.2:
            for _ in team_members:
                new_preds.append(1)
        else:
            for _ in team_members:
                new_preds.append(0)
        

KeyboardInterrupt: 

In [None]:
def e_step(df, preds):

## EM

In [None]:
for i in range(n_epoch):
    y = e_step(df_train, preds)
    pipe, preds = m_step(pipe, X, y)
    weights = save_player_weights(X, pipe)
    save_obj(weights, f'em_weights_epoch_{i}')

In [None]:
X, y = df_train[['player_id', 'question_id']], df_train['answer']