In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import trange
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy import stats

In [2]:
!ls chgk

players.pkl  results.pkl  tournaments.pkl


In [3]:
with open("./chgk/players.pkl", "rb") as file:
    player_data = pickle.load(file)

with open("./chgk/results.pkl", "rb") as file:
    result_data = pickle.load(file)

with open("./chgk/tournaments.pkl", "rb") as file:
    tournament_data = pickle.load(file)

## 1. Анализ данных

In [4]:
print(player_data[1])

{'id': 1, 'name': 'Алексей', 'patronymic': None, 'surname': 'Абабилов'}


In [5]:
print(result_data[1][0])

{'team': {'id': 242, 'name': 'Команда Азимова', 'town': {'id': 21, 'name': 'Баку'}}, 'mask': None, 'current': {'name': 'Команда Азимова', 'town': {'id': 21, 'name': 'Баку'}}, 'questionsTotal': 0, 'synchRequest': None, 'position': 1, 'controversials': [], 'flags': [], 'teamMembers': [{'flag': None, 'usedRating': 0, 'rating': 0, 'player': {'id': 476, 'name': 'Анар', 'patronymic': 'Беюкага оглы', 'surname': 'Азимов'}}, {'flag': None, 'usedRating': 0, 'rating': 0, 'player': {'id': 878, 'name': 'Фариз', 'patronymic': 'Наим оглы', 'surname': 'Аликишибеков'}}, {'flag': None, 'usedRating': 0, 'rating': 0, 'player': {'id': 1872, 'name': 'Аднан', 'patronymic': 'Фариз оглы', 'surname': 'Ахундов'}}, {'flag': None, 'usedRating': 0, 'rating': 0, 'player': {'id': 13721, 'name': 'Балаш', 'patronymic': 'Алекпер оглы', 'surname': 'Касумов'}}, {'flag': None, 'usedRating': 0, 'rating': 0, 'player': {'id': 22132, 'name': 'Анар', 'patronymic': 'Джафар оглы', 'surname': 'Наджафли'}}, {'flag': None, 'usedRati

In [6]:
print(tournament_data[1])

{'id': 1, 'name': 'Чемпионат Южного Кавказа', 'dateStart': '2003-07-25T00:00:00+04:00', 'dateEnd': '2003-07-27T00:00:00+04:00', 'type': {'id': 2, 'name': 'Обычный'}, 'season': '/seasons/1', 'orgcommittee': [], 'synchData': None, 'questionQty': None}


In [7]:
def check_mask(inp):
    uniques = set(inp)
    return '1' in uniques and '0' in uniques and len(uniques) == 2

# filtering data
keys_to_train = []
keys_to_test = []
keys_to_delete = [] # care of memory
for key, value in tournament_data.items():
    if '2019-' in value['dateStart']:
        count_mask = 0
        for element in result_data[key]:
            if ('mask' in element and 'teamMembers' in element and element['mask'] 
                and element['teamMembers']):
                count_mask = 1
        if count_mask > 0:
            keys_to_train.append(key)
        

        
    elif '2020-' in value['dateStart']:
        count_mask = 0
        for element in result_data[key]:
            if ('mask' in element and 'teamMembers' in element and element['mask'] 
                and element['teamMembers'] and check_mask(element['mask'])):
                count_mask += 1
        if count_mask == len(result_data[key]) and count_mask > 0:
            keys_to_test.append(key)
    else:
        keys_to_delete.append(key)   

In [8]:
for key in keys_to_delete:
    del tournament_data[key]
    del result_data[key]

In [9]:
print(len(keys_to_train), len(keys_to_test))

675 133


In [10]:
X_train = []

y_train = []

team_ids = []

for key in keys_to_train:
    tourn_tmp = tournament_data[key]['id']
    for team in result_data[key]:
        answers = team['mask']
        team_members = team['teamMembers']
        if not answers:
            continue
        for index, answer in enumerate(answers):
            for team_member in team_members:
                X_train.append([tourn_tmp, index, team_member['player']['id']])
                y_train.append(answer)
                team_ids.append(team['team']['id'])

In [11]:
X_train_copy = X_train.copy()

In [12]:
player_to_norm_id = {}
answer_to_norm_id = {}

counter_player = 0
counter_answer = 0
for i in range(len(X_train)):
    data_element = X_train[i]

    if tuple(data_element[:2]) not in answer_to_norm_id:
        answer_to_norm_id[tuple(data_element[:2])] = counter_answer
        counter_answer += 1
    if data_element[2] not in player_to_norm_id:
        player_to_norm_id[data_element[2]] = counter_player
        counter_player += 1
    
    X_train[i] = [answer_to_norm_id[tuple(data_element[:2])], player_to_norm_id[data_element[2]]]

In [13]:
question_ids = []
for i in range(len(X_train)):
    data_element = X_train_copy[i]
    question_ids.append(answer_to_norm_id[tuple(data_element[:2])])

In [14]:
del X_train_copy

In [15]:
y_train = np.array(y_train)
X_train = np.array(X_train)
y_train[y_train=='X'] = '?'
indixes = (y_train != '?')
y_train = y_train[indixes]
X_train = X_train[indixes]

team_ids = np.array(team_ids)
question_ids = np.array(question_ids)

team_ids = team_ids[indixes]
question_ids = question_ids[indixes]

In [16]:
print(counter_answer, counter_player)

33385 59101


In [17]:
y_train = list(map(int, y_train))

In [18]:
# convert to one-hot encoding
X_train_one_hot = sparse.lil_matrix((len(X_train), counter_answer + counter_player), dtype=np.int32)

In [19]:
X_train_one_hot.shape

(20910740, 92486)

In [20]:
for i in range(len(X_train)):
    ind1, ind2 = X_train[i]
    X_train_one_hot[i, ind1]  = 1
    X_train_one_hot[i, counter_answer + ind2]  = 1

In [21]:
del X_train

## 2 Baseline model

данные на входе - объединение one-hot вектора игрока и one-hot вектора вопроса, по ним предсказываем смог ли игрок ответить на данный вопрос.

In [22]:
%%time
model = LogisticRegression(tol=1e-1, solver='saga')
model.fit(X_train_one_hot, y_train)

CPU times: user 55.7 s, sys: 252 ms, total: 55.9 s
Wall time: 55.9 s


LogisticRegression(solver='saga', tol=0.1)

In [23]:
accuracy_score(y_train, model.predict(X_train_one_hot))

0.7621513633663849

## 3 Оценка качества рейтинг системы

Считаем что команда отвечает на вопрос если хотя бы один ее член может ответить на вопрос т.е.:

$P(\textbf{team}) = [1 - p(\textbf{player1 failed}) \times p(\textbf{player2 failed})... \times p(\textbf{playerN failed})]$

Оцениваем по **0 вопросу** (Можно в среднем по вопросам, разницы нет).

Сравниваем с частотой ответа команды на вопросы - количество единичек в маске

Работаем обязательно на тест сете: **keys_to_test**

In [24]:
def calculate_labels_pred(model):
    labels_tournament = []
    tournaments_players = []
    id_to_except = []
    for key in keys_to_test:
        
        
        flag_to_except = False
        tournament_result = []
        
        tournament_player_team = []
        for team in result_data[key]:
            team_id = team['team']['id']
            
            target_score = sum(list(map(int, team['mask'])))
            team_score = 1
            for player in team['teamMembers']:
                if player['player']['id'] in player_to_norm_id:
                    tmp = sparse.lil_matrix((1, counter_answer + counter_player), dtype=int)
                    tmp[0, counter_answer + player_to_norm_id[player['player']['id']]] = 1
                    tmp[0, 0] = 1
                    team_score *= model.predict_proba(tmp)[0, 0]
                
                else:
                    flag_to_except = False
            
            tournament_player_team += [[team_id, 1 - team_score]]
            tournament_result += [[team_id, target_score]]
        if not flag_to_except:
            tournaments_players.append(tournament_player_team)
            labels_tournament.append(tournament_result)
    return labels_tournament, tournaments_players

def calculate_correlations(labels_tournament, tournaments_players):
    ans_spearman = []
    ans_kendall = []
    assert len(labels_tournament) == len(tournaments_players)
    for target, pred in zip(labels_tournament, tournaments_players):
        pred = np.array(pred)
        target = np.array(target)
        ans_spearman += [stats.spearmanr(pred[:, 1], target[:, 1]).correlation]
        ans_kendall += [stats.kendalltau(pred[:, 1], target[:, 1]).correlation]
    print('Spearman correlation: {} Kendall correlation: {}'.format(
        sum(ans_spearman) / len(ans_spearman), sum(ans_kendall) / len(ans_kendall)))

In [25]:
%%time
# считаем на тестовых данных корреляции Спирмана и Кэнделла
labels, preds = calculate_labels_pred(model)
calculate_correlations(labels, preds)

Spearman correlation: 0.7683993921664228 Kendall correlation: 0.6113482574828197
CPU times: user 11 s, sys: 36.8 ms, total: 11.1 s
Wall time: 11 s


## 4 Учет командной игры

Основная идея:

- В предыдущем пункте мы выучили вероятности того что конкретный игрок ответит на конкретный вопрос, без учета команды;

- В этом пункте учтем наличие команды как скрытой переменной: вектор $Z$ состоит из 4 вероятностей $p(\textbf{player} | \textbf{team})$, переменные тут бинарные;

- $p(\textbf{player}=0|\textbf{team}=0) = 1, p(\textbf{player}=1|\textbf{team}=0) = 0$, если команда не ответила, то ни один игрок не знал ответ;

- $p(\textbf{player}=1|\textbf{team}=1), p(\textbf{player}=0 | \textbf{team}=1)$ - нетривиальные вероятности, их и будем учить;

- В EM алгоритме на Е-шаге будем пересчитывать вероятности игрока ответить на вопрос с учетом правильного ответа команды: $\large p(\textbf{player}=1 | \textbf{team}=1) = \frac{p(\textbf{team}=1|\textbf{player}=1)\times p(\textbf{player}=1)}{p(\textbf{team}=1)} = \frac{p(\textbf{player}=1)}{p(\textbf{team}=1)}$;



- в M-шаге учим логистическую регрессию как в пункте **3 Оценка качества рейтинг системы** на новых вероятностях из E-шага.

In [26]:
# в EM схеме нужна логистическая регрессия с непрерывными значениями таргетов
class LogisticRegressionWithSoftTargets:
    def __init__(self, model):
        self.number_iterations = 100_000
        self.lr = 10
        self.batch_size = 1000
        self.tol = 0.0000001
        self.weights = np.hstack((model.intercept_, model.coef_[0]))

    def fit(self, X, y):
        X = sparse.hstack([np.ones(len(y)).reshape(-1, 1), X], format='csr')
        loss_list = []
        prev_loss_mean_val = np.inf
        for i in range(self.number_iterations):
            indixes = np.random.choice(X.shape[0], self.batch_size)
            X_batch = X[indixes, :]
            y_batch = y[indixes]
            predictions = 1 / (1 + np.exp(-X_batch.dot(self.weights)))
            
            loss = -np.mean(y_batch * np.log(predictions) + (1 - y_batch) * np.log(1 - predictions))
            loss_list.append(loss)
            
            if i> 0 and i % 1000 == 0:
                new_loss_mean_val = np.mean(loss_list)
                if prev_loss_mean_val - new_loss_mean_val < self.tol:
                    print("NLL mean: ", new_loss_mean_val)
                    break
                loss_list = []
                prev_loss_mean_val = new_loss_mean_val
            
            gradient = -X_batch.T.dot(y_batch - predictions) / len(y_batch)
            self.weights -= self.lr * gradient

    def predict_proba(self, X):
        X = sparse.hstack([np.ones(X.shape[0]).reshape(-1, 1), X])

        predictions = 1 / (1 + np.exp(-X.dot(self.weights)))
        return np.hstack(((1 - predictions).reshape(-1, 1), predictions.reshape(-1, 1)))        

In [27]:
estimator = LogisticRegressionWithSoftTargets(model)


labels, preds = calculate_labels_pred(model)
calculate_correlations(labels, preds)

for _ in range(10):
    preds = estimator.predict_proba(X_train_one_hot)
    
    # E step
    
    data_team = pd.DataFrame({'team': team_ids, 'question': question_ids, 
                        'fail_of_prediction': preds[:, 0]})
    dictionary_state = {}
    for team, question, fail_prediction in data_team.values:
        if (team, question) in dictionary_state:
            dictionary_state[(team, question)] *= fail_prediction
        else:
            dictionary_state[(team, question)] = fail_prediction
    
    new_columns_vals = []
    for team, question, fail_prediction in data_team.values:
        new_columns_vals += [1 - dictionary_state[(team, question)]]
    
    data_team['team_success'] = new_columns_vals
    del dictionary_state, new_columns_vals
    
    z = np.clip((1 - data_team['fail_of_prediction'].values) / data_team['team_success'].values, 0, 1)
    

    
    # обнуляем вероятности Z когда команда не ответила на вопрос
    z[np.array(y_train) == 0] = 0
    
    
    # M step
    estimator.fit(X_train_one_hot, z)
    
    
    
    # считаем метрики на тест сете (за 2020 год)
    labels, predictions = calculate_labels_pred(estimator)
    calculate_correlations(labels, predictions)

Spearman correlation: 0.7683993921664228 Kendall correlation: 0.6113482574828197
NLL mean:  0.3914276321493574
Spearman correlation: 0.7903297234741925 Kendall correlation: 0.6335501081591189
NLL mean:  0.35069600211470864
Spearman correlation: 0.801188679371039 Kendall correlation: 0.6451459501355014
NLL mean:  0.33096014189128714
Spearman correlation: 0.8066302792629125 Kendall correlation: 0.6510665191978618
NLL mean:  0.320372321999943
Spearman correlation: 0.8099437208466164 Kendall correlation: 0.6541825924760943
NLL mean:  0.3168957142929127
Spearman correlation: 0.8132352489666265 Kendall correlation: 0.6579823417330826
NLL mean:  0.3140378935207219
Spearman correlation: 0.8138624038733894 Kendall correlation: 0.6590524469390375
NLL mean:  0.3134119346680254
Spearman correlation: 0.8144222459004987 Kendall correlation: 0.6593868094184022
NLL mean:  0.3123075342586997
Spearman correlation: 0.8154591494330191 Kendall correlation: 0.6607789386677916
NLL mean:  0.31298554704517817


## 5 Рейтинг лист турниров

In [28]:
id_to_answer = {value:id_ for id_, value in answer_to_norm_id.items()}

In [29]:
answer_coefs = model.coef_[0][:counter_answer]

In [30]:
tournament_scores = {}
for id_ in id_to_answer:
    tournament, _  = id_to_answer[id_]
    if tournament in tournament_scores:
        tournament_scores[tournament] += [answer_coefs[id_]]
    else:
        tournament_scores[tournament] = [answer_coefs[id_]]

In [31]:
# считаем медианную сложность вопроса у каждого турнира
tournament_scores = {key: np.median(score) for key, score in tournament_scores.items()}

In [32]:
# tournament with low value is hard
tournament_scores = sorted(tournament_scores.items(), key=lambda x: x[1])

In [33]:
# сильнейшие турниры
for i in range(10):
    print(tournament_data[tournament_scores[i][0]]['name'])

Чемпионат Санкт-Петербурга. Первая лига
Первенство правого полушария
Угрюмый Ёрш
Чемпионат Минска. Лига А. Тур четвёртый
Воображаемый музей
Кубок городов
Синхрон высшей лиги Москвы
Записки охотника
Тихий Донец: омут первый
Чемпионат Таджикистана


In [34]:
# слабейшие турниры
for i in range(10):
    print(tournament_data[tournament_scores[-i][0]]['name'])

Чемпионат Санкт-Петербурга. Первая лига
Шестой киевский марафон. Асинхрон
Синхрон Лиги Разума
(а)Синхрон-lite. Лига старта. Эпизод V
(а)Синхрон-lite. Лига старта. Эпизод III
Студенческий чемпионат Калининградской области
(а)Синхрон-lite. Лига старта. Эпизод IX
(а)Синхрон-lite. Лига старта. Эпизод IV
Школьная лига
(а)Синхрон-lite. Лига старта. Эпизод VII


**Результаты соответствуют ожиданиям:**

Среди сильнейших турниров: чемпионаты стран и крупнейших городов России.

Среди слабейших турниров: Школьная лига + Лига старта