In [1]:
import random

import numpy as np
import pandas as pd

In [2]:
# Load the three raw tables. Roster rows are keyed by player id and
# tournaments by tournament id.
results = pd.read_csv('results.csv')
roster = pd.read_csv('roster.csv').set_index('idplayer')
tournaments = pd.read_csv('tournaments.csv').set_index('idtournament')

# Remove every 'X' character from the answer masks.
results['mask'] = results['mask'].str.replace('X', '')

# Keep only tournaments that declare a question count and also have results,
# and restrict the results to that same set of tournaments.
has_question_count = tournaments['questions_total'].notnull()
tournaments = tournaments[has_question_count]
tours_with_meta = set(tournaments.index)
tours_with_results = set(results.idtournament)
common_tours = list(tours_with_meta & tours_with_results)
results = results[results['idtournament'].isin(common_tours)]
tournaments = tournaments.loc[common_tours]

# Replace the declared `questions_total` with the length of the first
# available mask of each tournament; tournaments without any mask are dropped
# by the (inner) merge.
first_masks = results.groupby('idtournament')['mask'].first()
q_total = first_masks.str.len().dropna().to_frame('questions_total')
tournaments = tournaments.drop(columns=['questions_total']).merge(
    q_total, left_index=True, right_index=True)

In [3]:
# Peek at one tournament row to check the rebuilt `questions_total` column.
tournaments.head(1)

Unnamed: 0_level_0,name,town,long_name,date_start,date_end,tour_count,tour_questions,tour_ques_per_tour,type_name,main_payment_value,discounted_payment_value,discounted_payment_reason,date_requests_allowed_to,comment,site_url,main_payment_currency,discounted_payment_currency,questions_total
idtournament,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2,Летние зори,Саранск,,2003-08-09 00:00:00,2003-08-09 00:00:00,3.0,15.0,,Обычный,,,,,,,,,15.0


In [4]:
# Peek at the per-team results, including the per-question answer `mask`.
results.head(2)

Unnamed: 0,idteam,current_name,base_name,position,questions_total,mask,bonus_a,bonus_b,tech_rating,predicted_position,d_bonus_a,d_bonus_b,d_diff_bonus,included_in_rating,idtournament,diff_bonus
0,209,Команда Коваленко,Команда Коваленко,11.0,34,1111101000101111001100011000111101001111001110...,2230.0,-87.0,4509.0,7.0,2230,663,-87.0,1.0,3236,
1,270,Синоп,Синоп,14.5,31,1110110000101100001100101101101101010011010000...,2189.0,119.0,2567.0,20.0,2189,618,119.0,1.0,3236,


In [5]:
# Peek at the roster, which is indexed by player id.
roster.head(1)

Unnamed: 0_level_0,is_captain,is_base,is_foreign,idteam,idtournament
idplayer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
19298,1,1,0,209,3236


In [6]:
def skill_init():
    """Return an initial skill value drawn uniformly from the interval [0, 1]."""
    return random.uniform(0, 1)


# The model uses one skill per player, but the same player id can label
# several roster rows (the roster also carries idteam / idtournament).
# Draw a single value per unique player id and copy it to all of that
# player's rows, so a player starts with the same skill everywhere instead
# of an independent random value per row.
player_skills = {player_id: skill_init() for player_id in roster.index.unique()}
roster['skill'] = [player_skills[player_id] for player_id in roster.index]
roster.head(1)

Unnamed: 0_level_0,is_captain,is_base,is_foreign,idteam,idtournament,skill
idplayer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
19298,1,1,0,209,3236,0.236561


In [7]:
def difficulty_init():
    """Return a starting question difficulty sampled uniformly from [0, 1]."""
    return random.uniform(0, 1)


# One difficulty per (tournament, question index), keyed
# "<idtournament>_<question index>" with 0-based question indices.
question_difficulty = {}
for tourn, n_questions in tournaments['questions_total'].items():
    for ques in range(int(n_questions)):
        question_difficulty[f"{str(tourn)}_{str(ques)}"] = difficulty_init()

pd.Series(question_difficulty).to_frame().head()

Unnamed: 0,0
2_0,0.722533
2_1,0.11458
2_2,0.030757
2_3,0.343289
2_4,0.893202


In [8]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of a scalar or array.

    Uses NumPy directly: the `pd.np` alias is deprecated in pandas 1.0 and
    removed in pandas 2.0.
    """
    return 1 / (1 + np.exp(-x))

**Model**<br>
$p(y=1| S_{team}, \theta) = \sigma(S_{team} - \theta)$ <br>
$S_{team}$ - team skill<br>
$S_{team} = S_{player\_1} + S_{player\_2} + ... + S_{player\_N}$<br>
$\theta_j$ - difficulty of question j<br>
y - team answer, 1 for correct, 0 for incorrect<br>
$p(y_{hat}=y_{real}| S_{team}, \theta) = \sigma(S_{team} - \theta)^{y_{real}} * (1-\sigma(S_{team} - \theta))^{(1-y_{real})}$<br>
<br>
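$Likelihood = \prod_{i=1}^{N} p(y_{hat\_i}=y_{real\_i}| S_{team}, \theta)$<br>
$log(Likelihood) = LL = \sum_{i=1}^{N} log(p(y_{hat\_i}=y_{real\_i}| S_{team}, \theta))=
\sum_{i=1}^{N} \left[ y_{real\_i}*log(\sigma(S_{team} - \theta)) + (1-y_{real\_i})* log(1-\sigma(S_{team} - \theta)) \right]$<br>
The derivatives below are written for a single term of this sum (one team answering one question), which is what each stochastic gradient step uses.<br>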
<br>
$-log(Likelihood) \rightarrow min$ w.r.t. $S_{player\_i}$ and $\theta_j$

**Derivatives:**

$\sigma'(x) = \sigma(x)*(1-\sigma(x))$

$\frac{\partial LL}{\partial \theta} = \frac{y_{real}}{\sigma(S_{team} - \theta)} * \sigma'(S_{team} - \theta) * (-1) +
\frac{1-y_{real}}{1-\sigma(S_{team} - \theta)} * (-\sigma'(S_{team} - \theta)) * (-1) = 
-y_{real}*(1-\sigma(S_{team} - \theta)) + (1-y_{real})*\sigma(S_{team} - \theta)
$<br>

$\frac{\partial LL}{\partial S_i} = \frac{y_{real}}{\sigma(S_{team} - \theta)} * \sigma'(S_{team} - \theta) +
\frac{1-y_{real}}{1-\sigma(S_{team} - \theta)} * (-\sigma'(S_{team} - \theta)) = 
y_{real}*(1-\sigma(S_{team} - \theta)) - (1-y_{real})*\sigma(S_{team} - \theta)$<br>


**Short derivatives:**

$\frac{\partial -LL}{\partial \theta} = +y_{real}*(1-\sigma(S_{team} - \theta)) - (1-y_{real})*\sigma(S_{team} - \theta)
$<br>

$\frac{\partial -LL}{\partial S_i} = -y_{real}*(1-\sigma(S_{team} - \theta)) + (1-y_{real})*\sigma(S_{team} - \theta)$<br>


**Gradient update**<br>
$S_{i\_new} = S_{i\_old} - learning\_rate * \frac{\partial -LL}{\partial S_i}$<br>
$\theta_{new} = \theta_{old} - learning\_rate * \frac{\partial -LL}{\partial \theta}$<br>

In [9]:
from tqdm import tqdm_notebook

In [10]:
def gd(lr=0.01, epochs=10, steps_per_tour = 10, certain_tour=None, skill_lr_divisor=6):
    """Run stochastic gradient descent on player skills and question difficulties.

    Each epoch picks one tournament and performs `steps_per_tour` updates.
    Every update samples one team and one question of that tournament and
    takes a gradient step on the negative log-likelihood of that single answer.
    The module-level `question_difficulty` dict and the `skill` column of the
    module-level `roster` frame are modified in place.

    Args:
        lr: Learning rate.
        epochs: Number of tournaments to process (one tournament per epoch).
        steps_per_tour: Number of sampled (team, question) updates per epoch.
        certain_tour: If not None, use this tournament id in every epoch
            instead of sampling one from `tournaments.index`.
        skill_lr_divisor: The skill step is divided by this value (default 6,
            the constant previously hard-coded; presumably a nominal team
            size, to be confirmed).

    Returns:
        List of difficulty keys ("<tour>_<question>") that were missing from
        `question_difficulty`; those samples were skipped.

    Raises:
        ValueError: If the chosen tournament has no rows in `results`.
    """
    fails = 0
    fail_list = []
    for _ in tqdm_notebook(range(epochs)):
        # `is not None` so that a falsy tournament id (e.g. 0) is still honoured.
        if certain_tour is not None:
            tour = certain_tour
        else:
            tour = random.choice(tournaments.index.tolist())

        # Everything here depends only on the tournament, so compute it once
        # per epoch rather than once per step.
        t_subset = results.loc[results.idtournament == tour, ['idteam', 'mask']]
        team_ids = t_subset.idteam.tolist()
        if not team_ids:
            raise ValueError(f"No results found for tournament {tour}")
        n_questions = int(tournaments.loc[tour, 'questions_total'])
        in_tour = roster.idtournament == tour

        for _ in tqdm_notebook(range(steps_per_tour)):
            # Sample a random team and a random 0-based question index.
            team = random.choice(team_ids)
            quest = random.randrange(n_questions)

            # Look up the question difficulty; skip (and record) unknown keys.
            quest_dif_key = f"{str(tour)}_{str(quest)}"
            try:
                difficulty = question_difficulty[quest_dif_key]
            except KeyError:
                print('!', end='')
                fails += 1
                fail_list.append(quest_dif_key)
                continue

            # Team skill is the sum of the skills on this team's roster rows
            # for this tournament.
            team_rows = in_tour & (roster.idteam == team)
            players = roster.index[team_rows].tolist()
            team_skill = sum(roster.loc[team_rows, 'skill'].tolist())
            team_sigmoid = sigmoid(team_skill - difficulty)

            # Observed answer: the mask character at the question index.
            y_true = int(t_subset.loc[t_subset.idteam == team, 'mask'].values[0][quest])

            # d(-LL)/d(theta) = y*(1 - sigma) - (1 - y)*sigma = y - sigma,
            # and d(-LL)/d(S_i) is its negation.
            residual = y_true - team_sigmoid
            question_difficulty[quest_dif_key] -= lr * residual
            # Label-based update: it touches every roster row indexed by one
            # of these player ids, not only the rows of this team/tournament.
            roster.loc[players, 'skill'] += lr * residual / skill_lr_divisor
    print(f"Fails: {fails}")
    return fail_list

In [11]:
def score_one_answer(tour, team, quest):
    """Print the observed answer and the model's predicted probability.

    Args:
        tour: Tournament id.
        team: Team id.
        quest: 0-based question index within the tournament.
    """
    difficulty = question_difficulty[f"{str(tour)}_{str(quest)}"]

    # Roster rows of this team in this tournament; their skills add up to the
    # team skill.
    on_team = (roster.idteam == team) & (roster.idtournament == tour)
    team_skill = sum(roster.loc[on_team, 'skill'].tolist())
    team_sigmoid = sigmoid(team_skill - difficulty)

    # Observed answer: the mask character at the question index.
    team_masks = results.loc[(results.idtournament == tour) &
                             (results.idteam == team), 'mask']
    y_true = int(team_masks.values[0][quest])
    print(f"Y: {y_true}\nSigmoid: {team_sigmoid}")

In [40]:
# Train on tournament 3236 only: one epoch of 1000 sampled (team, question)
# updates. `fl` receives the difficulty keys that could not be looked up.
fl = gd(lr=0.2, epochs=1, steps_per_tour = 1000, certain_tour=3236)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Fails: 0


In [41]:
# Spot check: team 209, question index 0 of tournament 3236.
score_one_answer(3236, 209, 0)

Y: 1
Sigmoid: 0.7534925462571402


In [45]:
# Spot check: team 209, question index 7 of tournament 3236.
score_one_answer(3236, 209, 7)

Y: 0
Sigmoid: 0.3917700920375212


In [15]:
# Preview only the first rows instead of displaying the whole results frame.
results.head(10)

Unnamed: 0,idteam,current_name,base_name,position,questions_total,mask,bonus_a,bonus_b,tech_rating,predicted_position,d_bonus_a,d_bonus_b,d_diff_bonus,included_in_rating,idtournament,diff_bonus
0,209,Команда Коваленко,Команда Коваленко,11.0,34,1111101000101111001100011000111101001111001110...,2230.0,-87.0,4509.0,7.0,2230,663,-87.0,1.0,3236,
1,270,Синоп,Синоп,14.5,31,1110110000101100001100101101101101010011010000...,2189.0,119.0,2567.0,20.0,2189,618,119.0,1.0,3236,
2,358,43,43,20.0,26,1111110000011100000010011111100001000111001000...,2074.0,-100.0,3532.0,9.0,2074,501,-100.0,1.0,3236,
3,482,Нимлот,Нимлот,14.5,31,1101100000001110001011011101111111101111100100...,2189.0,-29.0,3376.0,10.0,2189,618,-29.0,1.0,3236,
4,1948,Doom-Doom,Doom-Doom,17.5,29,1111010010110110001000011001101100101001000000...,2145.0,38.0,2629.0,19.0,2145,557,38.0,1.0,3236,
5,2298,Макароны под плинтусом,Макароны под плинтусом,24.0,21,1100000010000100000100011101101001101101000000...,1721.0,-29.0,2086.0,23.0,1721,311,-29.0,1.0,3236,
6,3213,Дикие бозоны Хиггса,Дикие бозоны Хиггса,3.0,49,1110111011111111011110111101111111001111011000...,0.0,8.0,6161.0,3.0,2290,1197,8.0,1.0,3236,
7,3930,Версий.net,Версий.net,24.0,21,1110101000100000001110000101100111001001000000...,1721.0,20.0,1562.0,25.0,1721,311,20.0,1.0,3236,
8,3951,Eclipse,Eclipse,5.0,44,1110110111100111101010011111101110101101101010...,2280.0,-50.0,6019.0,4.0,2280,1012,-50.0,1.0,3236,
9,4822,Беспредел,Беспредел,26.5,20,1100001100001000000100000101000101101111010000...,1399.0,69.0,1105.0,28.0,1399,247,69.0,1.0,3236,


In [16]:
# Difficulty keys that gd() failed to look up; an empty list means none failed.
fl


[]

In [17]:
# Current difficulty value stored for question index 13 of tournament 450.
question_difficulty['450_13']

0.3050268334131976

In [18]:
# First mask character (answer to question index 0) of every team in tournament 450.
results.loc[results.idtournament==450, 'mask'].str[0]

193447    0
193448    0
193449    0
193450    0
193451    0
193452    0
193453    0
193454    0
193455    0
193456    0
193457    0
193458    0
193459    0
193460    0
193461    0
193462    0
193463    0
193464    0
193465    0
193466    0
193467    0
193468    0
193469    0
193470    0
193471    0
193472    0
193473    0
193474    0
193475    0
193476    0
Name: mask, dtype: object