In [1]:
import pandas as pd
import numpy as np
import json, pickle
import collections
from sklearn import linear_model
from scipy.stats import spearmanr, kendalltau
from scipy.sparse import csr_matrix
from scipy.special import expit, logit

In [2]:
tournaments = pickle.load(open('chgk/tournaments.pkl', 'rb'))
results = pickle.load(open('chgk/results.pkl', 'rb'))
players = pickle.load(open('chgk/players.pkl', 'rb'))

# 1. Подготовка данных

In [3]:
data_train_test = {2019: {}, 
                   2020: {}}
num_of_question = collections.Counter()

for index in tournaments.keys():
    cur_tourney = tournaments[index]
    year = int(cur_tourney['dateStart'][:4])
    
    try:
        questionQty = sum(cur_tourney.get('questionQty').values())
    except BaseException:
        questionQty = 0
        
    if year in data_train_test.keys():
        teamQty = 0
        result = {}
        answerQty = np.array([0] * questionQty)
        
        for cur_team in results[index]:
            mask = cur_team.get('mask')
            
            try:
                if len(mask) == questionQty:
                    teamQty += 1
                    mask = mask.replace('X', '0')
                    mask = mask.replace('?', '0')
                    mask = np.array([int(p) for p in mask])
                    answerQty += mask
                    result[cur_team['team']['id']] = {'answer': sum(mask),
                                                      'position': cur_team['position'],
                                                      'mask': mask,
                                                      'players': [p['player']['id'] for p in cur_team['teamMembers']]}
                        
                        
            except BaseException:
                continue

        data_train_test[year][index] = {'name' : cur_tourney['name'],
                             'questionQty': questionQty,
                             'answerQty': answerQty,
                             'teamQty': teamQty,
                             'result': result}
    elif year < 2019:
        for cur_team in results[index]:
            mask = cur_team.get('mask')
            try:
                if len(mask) == questionQty:
                    for i in [p['player']['id'] for p in cur_team['teamMembers']]:
                        num_of_question[i] += questionQty
            except BaseException:
                continue

In [4]:
del tournaments, results

In [5]:
def get_dataframe (data, num_of_question):
    result = {'tourney id': [], 
              'team id': [],
              'player id': [],
              'questions played':[],
              'question number': [],
              'question id': [],
              'true position': [],
              'result': []}

    ttl_question = 0
    new_question = collections.Counter()
    answer = collections.Counter()
    for tourney_id, tourney in data.items():
        questionQty = tourney['questionQty']
        question_number = list(range(questionQty))
        question_id = list(np.array(question_number) + ttl_question)
        ttl_question += questionQty
        
        for team_id, team in tourney['result'].items():
            playersQty = len(team['players'])
            result['tourney id'].extend([tourney_id] * playersQty * questionQty)
            result['team id'].extend([team_id] * playersQty * questionQty)
            result['result'].extend(list(team['mask']) * playersQty)
            result['question number'].extend(question_number * playersQty)
            result['question id'].extend(question_id * playersQty)
            result['true position'].extend([team['position']] * playersQty * questionQty)

            for player_id in team['players']:
                questions_played = list(num_of_question[player_id] + new_question[player_id]
                                        + np.array(question_number))
                new_question[player_id] += questionQty
                answer[player_id] += team['answer']
                result['player id'].extend([player_id] * questionQty)
                result['questions played'].extend(questions_played)
                
    
    return pd.DataFrame(result), new_question, answer

In [6]:
data_train, question_2019, answer_2019 = get_dataframe(data_train_test[2019], num_of_question)

In [7]:
data_test, question_2020, answer_2020 = get_dataframe(data_train_test[2020], num_of_question + question_2019)

In [8]:
data_train.head()

Unnamed: 0,tourney id,team id,player id,questions played,question number,question id,true position,result
0,4772,45556,6212,19741,0,0,1.0,1
1,4772,45556,6212,19742,1,1,1.0,1
2,4772,45556,6212,19743,2,2,1.0,1
3,4772,45556,6212,19744,3,3,1.0,1
4,4772,45556,6212,19745,4,4,1.0,1


# 2. Baseline-модель на основе логистической регрессии

$Q_{t}$ - множество вопросов из турнира t

$C_{q}$ - уровень сложности вопроса q

$T_{t}$ - множество команд из турнира t

$S_{i}$ - сила игрока i


Для baseline-модели будем считать, что если команда $\tau$ ответила на вопрос $C_{q}$ , то и каждый ее участник $(i ∈ \tau)$ ответил на этот вопрос:

$P(X_{iq} = 1 | C_{q}, S_{i}) = P(X_{\tau q} = 1| C_{q}) ∼ σ(μ + S_{i} + C_{q})$

In [9]:
border_question = 100

In [10]:
new_players = []
old_players = []

for i in (question_2019 + question_2020).keys():
    if num_of_question[i] + question_2019[i] < border_question:
        new_players += [i]
    else:
        old_players += [i]
    
new_index = {player_id:0 for player_id in new_players}
old_index = {player_id:index + 1 for index, player_id in enumerate(old_players)}
player_id_index = {**new_index, **old_index}
index_player_id = {index:player_id for player_id, index in player_id_index.items()}

In [11]:
num_of_players_col = len(old_players) + 1
len_data = data_train.shape[0]

data = np.array([1] * len_data * 2)
row = np.array(list(range(len_data)) * 2)

players_col = [player_id_index[index] for index in data_train['player id']]
questions_col = list(np.array(data_train['questions played']) + num_of_players_col)
col = np.array(players_col + questions_col)

X_train = csr_matrix((data, (row, col)), shape=(len_data, max(questions_col) + 1))
y_train = np.array(data_train['result'])

In [12]:
logregr = linear_model.LogisticRegression(solver='saga', C=10)
logregr.fit(X_train, y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
len_data = data_test.shape[0]

data = np.array([1] * len_data)
row = np.array(list(range(len_data)))
col = np.array([player_id_index[index] for index in data_test['player id']])

X_test = csr_matrix((data, (row, col)), shape=(len_data, max(questions_col) + 1))

# 3. Качество предсказаний результатов турниров

Качество рейтинг-системы оценивается качеством предсказаний результатов турниров. 
Предскажем результаты турнира с известными командами и их сосавами:

1. Вероятность, что команда $\tau$ ответит на вопрос $C_{q}$ равна вероятности, что хотябы 1 из ее участников $(i ∈ \tau)$ ответи на этот вопрос. 
Следовательно, это (1 - вероятность того, что никто из участников команды не ответит на вопрос)

$P(X_{\tau q} = 1| C_{q}) = (1 - \prod\limits_{i ∈ \tau}(1 - P(X_{iq}  = 1| C_{q}, S_{i})) )$

2. Для построения рейтинга будем использовать вероятность, что команда $\tau$ ответит на все вопросы $C_{qt}$ из турнира $t$

$\prod\limits_{q ∈ t} P(X_{\tau q} = 1| C_{q})$

In [14]:
predict_train = logregr.predict_proba(X_train)
predict_test = logregr.predict_proba(X_test)

data_train['prob_answer'] = predict_train[:, 1]
data_train['prob_team_answer'] = data_train['result']

data_test['prob_answer'] = predict_test[:, 1]
data_test['prob_team_answer'] = data_test['result']

In [15]:
def get_team_results(data):
    data['prob_not_answer'] = 1 - data['prob_answer']
    team_result = data[['tourney id', 'team id', 'question number', 'prob_not_answer']].groupby(['tourney id', 'team id', 'question number']).agg('prod').reset_index()
    team_result['prob_team_answer'] = 1 - team_result['prob_not_answer']
    team_result = team_result.drop(columns='prob_not_answer')
    
    return team_result.reset_index()

def get_standings (data):
    standings = get_team_results(data).groupby(['tourney id', 'team id']).agg('prod').reset_index()
    standings['true position'] = data.groupby(['tourney id', 'team id']).mean().reset_index()['true position']
    standings = standings.sort_values(['tourney id','prob_team_answer'], ascending=[False, False])
    standings['predict position'] = standings.groupby('tourney id').cumcount() + 1
    return standings.drop(columns='question number')

def get_metrics (standings):
    Kendall = []
    Spearman = []
    for tourney_id in standings['tourney id'].unique():
        pred_pos = standings.loc[standings['tourney id'] == tourney_id]['predict position'].values
        true_pos = standings.loc[standings['tourney id'] == tourney_id]['true position'].values
        
        Kendall += [kendalltau(pred_pos, true_pos).correlation]
        Spearman += [spearmanr(pred_pos, true_pos).correlation]
        
        Kendall = np.array(Kendall)
        Spearman = np.array(Spearman)
        
        Kendall[np.isnan(Kendall)] = 0
        Spearman[np.isnan(Spearman)] = 0
    
    return np.array(Kendall).mean(), np.array(Spearman).mean()

In [16]:
standings_train = get_standings(data_train)
standings_test = get_standings(data_test)

standings_test.head()

Unnamed: 0,tourney id,team id,index,prob_team_answer,true position,predict position
21508,6456,71625,9.599871999999999e+230,0.982385,1.0,1
21507,6456,69918,9.582438e+230,0.894857,6.0,2
21502,6456,27285,9.495732e+230,0.796628,2.0,3
21504,6456,55612,9.530322e+230,0.673397,3.0,4
21503,6456,43261,9.513011e+230,0.430905,4.5,5


In [17]:
Kendall, Spearman = get_metrics(standings_test)
print('Коэф-т Спирмена:', Spearman)
print('Коэф-т Кендалла:', Kendall)

Коэф-т Спирмена: 72.0230290110904
Коэф-т Кендалла: 52.59811276087381


# 4. EM-алгоритм

#### E-шаг

$Z_{iq}$ - игрок $i$ из команды $\tau$ xответил на вопрос q.

1. Сделаем допущение, что если команда $\tau$ не ответила на вопрос $C_{q}$, то за столом не было правильной версии и ни один из игроков команды $\tau$ не ответил на этот вопрос.

2. Тогда как, если команда $\tau$ ответила на вопрос $C_{q}$, это значит, что хотя бы один из ее участников ответил на этот вопрос 

$ E[Z_{iq}] = \begin{equation*}
 \begin{cases}
   0, X_{\tau q} = 0
   \\
   \frac{P(X_{iq}  = 1| C_{q}, S_{i})}{(1 - \prod\limits_{j ∈ \tau}(1 - P(X_{jq}  = 1| C_{q}, S_{j})))}, X_{\tau q} = 1
 \end{cases}
\end{equation*} $

#### M-шаг

Обучим логистическую модель с целевой переменной $E[Z_{iq}]$, вычисленной на E-шаге

In [18]:
EPS = 1e-8
def e_step(data):
    team_result = get_team_results(data)
    data = data.drop(columns='prob_team_answer').merge(team_result,
                                                       on=['tourney id', 'team id', 'question number'])
    z = (data['prob_answer'] / data['prob_team_answer']).values
    z[data_train['result'].values == 0] = 0
    
    z[z < EPS] = EPS
    z[z > 1 - EPS] = 1 - EPS
    return z

In [20]:
# EM-алгоритм
for iIter in range(4):
    #E-шаг
    z = e_step(data_train)
    
    #M-шаг
    model = linear_model.Ridge(alpha=1/20)
    y = logit(z)
    model.fit(X_train, y)
    data_train['prob_answer'] = expit(model.predict(X_train))
    data_test['prob_answer'] = expit(model.predict(X_test))
    
    
    standings_train = get_standings(data_train)
    standings_test = get_standings(data_test)
    Kendall, Spearman = get_metrics(standings_test)
    
    print('Итерация:', iIter + 1)
    print('Коэф-т Спирмена: ', Spearman)
    print('Коэф-т Кендалла: ', Kendall, '\n')

Итерация: 1
Коэф-т Спирмена:  66.03923138862741
Коэф-т Кендалла:  47.69528764748931 

Итерация: 2
Коэф-т Спирмена:  72.89721920529898
Коэф-т Кендалла:  53.19508367277897 

Итерация: 3
Коэф-т Спирмена:  72.23395939985545
Коэф-т Кендалла:  52.60000977896257 

Итерация: 4
Коэф-т Спирмена:  72.30641520483641
Коэф-т Кендалла:  52.68490746388562 



# 5.1. ТОП турниров по сложности вопросов

In [21]:
rating_question = pd.DataFrame({'question id' : range(len(model.coef_) - num_of_players_col),
                                'rating' : model.coef_[num_of_players_col:]})

rating_question = rating_question.merge(data_train[['question id', 'tourney id', 'question number']]
                                                .groupby(['question id'])
                                                .mean()
                                                .reset_index(),
                                        on=['question id'])

rating_question['tourney name'] = [data_train_test[2019][i]['name'] for i in rating_question['tourney id']]
rating_question['teamQty'] = [data_train_test[2019][i]['teamQty'] for i in rating_question['tourney id']]
rating_question['answerQty'] = [data_train_test[2019][i]['answerQty'][j] 
                                for i,j in zip(rating_question['tourney id'], rating_question['question number'])]
rating_question['share of answers'] = rating_question['answerQty'] / rating_question['teamQty']

rating_question = rating_question.sort_values('rating', ascending=False)

In [22]:
print('ТОП-10 сложных вопросов')
rating_question.head(10)

ТОП-10 сложных вопросов


Unnamed: 0,question id,rating,tourney id,question number,tourney name,teamQty,answerQty,share of answers
32395,33067,13.045328,6150,175,Чемпионат Санкт-Петербурга. Высшая лига,12,11,0.916667
32902,33574,12.471005,6249,205,Школьный синхрон-lite. Сезон 3,15,11,0.733333
30822,31446,11.926581,6090,232,Дзержинский марафон,4,2,0.5
32288,32960,11.434113,6150,68,Чемпионат Санкт-Петербурга. Высшая лига,12,10,0.833333
31202,31874,11.087997,6103,1,Українська ліга. Етап 2,59,34,0.576271
33261,33933,10.891244,6255,96,ОВСЧ,352,281,0.798295
32354,33026,10.554851,6150,134,Чемпионат Санкт-Петербурга. Высшая лига,12,7,0.583333
32072,32744,10.480881,6149,32,Чемпионат Санкт-Петербурга. Первая лига,39,0,0.0
32950,33622,10.404775,6254,1,Школьная лига,193,46,0.238342
32932,33604,10.32146,6249,235,Школьный синхрон-lite. Сезон 3,15,4,0.266667


In [28]:
print('ТОП-10 легких вопросов')
rating_question.tail(5)

ТОП-10 легких вопросов


Unnamed: 0,question id,rating,tourney id,question number,tourney name,teamQty,answerQty,share of answers
32220,32892,-9.199869,6150,0,Чемпионат Санкт-Петербурга. Высшая лига,12,10,0.833333
32995,33667,-9.19988,6254,46,Школьная лига,193,47,0.243523
32174,32846,-9.200379,6149,134,Чемпионат Санкт-Петербурга. Первая лига,39,0,0.0
33361,34033,-9.200507,6255,196,ОВСЧ,352,231,0.65625
32623,33295,-9.200763,6173,58,Кубок Мэра Казани,16,4,0.25


# 5.2. ТОП игроков

In [24]:
rating_player = pd.DataFrame({'index' : range(num_of_players_col),
                              'rating' : model.coef_[:num_of_players_col]})
rating_player['player id'] = [index_player_id[i] for i in rating_player['index']]
rating_player['name'] = [players[i]['surname'] + ' ' + players[i]['name'] 
                         for i in rating_player['player id']]
rating_player['before_2019'] = [num_of_question[i] for i in rating_player['player id']]
rating_player['question_2019'] = [question_2019[i] for i in rating_player['player id']]
rating_player['team_answer_2019'] = [answer_2019[i] for i in rating_player['player id']]

rating_player = rating_player.sort_values('rating', ascending=False)

In [25]:
print('ТОП-10 лучших игроков')
rating_player.head(10)

ТОП-10 лучших игроков


Unnamed: 0,index,rating,player id,name,before_2019,question_2019,team_answer_2019
25594,25594,10.146083,22474,Немец Илья,407,75,45
10788,10788,9.654186,169939,Таратин Иван,252,72,39
35236,35236,8.005437,152889,Гольдштейн Анна,72,36,30
20933,20933,8.000879,73534,Смерецкий Георгий,369,72,43
31798,31798,7.885756,170977,Кан Давид,108,36,32
31799,31799,7.815218,171845,Завьялов Михаил,144,36,32
35707,35707,7.750458,222188,Гринко Арина,0,216,179
35158,35158,7.720173,216863,Гаврилов Глеб,0,252,207
13645,13645,7.643376,121433,Савенко София,711,36,33
12001,12001,7.577618,167987,Махова Ярослава,48,72,56


In [26]:
print('ТОП-10 игроков, которым надо приложить силы')
rating_player.tail(10)

ТОП-10 игроков, которым надо приложить силы


Unnamed: 0,index,rating,player id,name,before_2019,question_2019,team_answer_2019
17627,17627,-5.771917,89362,Салаев Кянан,561,41,0
24917,24917,-5.924415,102151,Ширай Ольга,5375,96,12
15818,15818,-5.998195,30231,Сотникова Елена,4150,48,6
16046,16046,-6.029259,26950,Рогожин Антон,2691,48,3
31950,31950,-6.081616,39911,Ахмедов Гамид,5379,36,4
20504,20504,-6.26691,69199,Княжевский Борис,6012,288,35
35482,35482,-6.330514,139407,Килин Алексей,3976,36,3
24902,24902,-6.384274,64613,Ааронян Арсен,7046,111,14
30250,30250,-6.962194,46069,Штерн Александра,2446,48,0
33148,33148,-6.970623,54642,Жалова Мария,8179,45,6
