In [21]:
import pandas as pd
import random

In [22]:
results = pd.read_csv('results.csv')
results.loc[:, 'mask'] = results.loc[:, 'mask'].str.replace('X', '')
roster = pd.read_csv('roster.csv')
tournaments = pd.read_csv('tournaments.csv').set_index('idtournament')
q_total = pd.DataFrame(results.groupby('idtournament').first()\
                       .loc[:, 'mask'].str.len()).dropna().rename(columns={'mask': 'questions_total'})
tournaments = pd.merge(tournaments.drop(columns=['questions_total']), q_total, left_index=True, right_index=True)

In [23]:
tournaments = tournaments.loc[~tournaments.questions_total.isnull(), :]
tournaments.head(1)

Unnamed: 0_level_0,name,town,long_name,date_start,date_end,tour_count,tour_questions,tour_ques_per_tour,type_name,main_payment_value,discounted_payment_value,discounted_payment_reason,date_requests_allowed_to,comment,site_url,main_payment_currency,discounted_payment_currency,questions_total
idtournament,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3236,Десятый блин,Великий Новгород,"Открытый Фестиваль интеллектуальных игр ""Десят...",2015-05-23 13:00:00,2015-05-24 16:00:00,6.0,12.0,,Обычный,2500.0,0.0,,,,,,,72.0


In [24]:
results.head(2)

Unnamed: 0,idteam,current_name,base_name,position,questions_total,mask,bonus_a,bonus_b,tech_rating,predicted_position,d_bonus_a,d_bonus_b,d_diff_bonus,included_in_rating,idtournament,diff_bonus
0,209,Команда Коваленко,Команда Коваленко,11.0,34,1111101000101111001100011000111101001111001110...,2230.0,-87.0,4509.0,7.0,2230,663,-87.0,1.0,3236,
1,270,Синоп,Синоп,14.5,31,1110110000101100001100101101101101010011010000...,2189.0,119.0,2567.0,20.0,2189,618,119.0,1.0,3236,


In [25]:
tour = random.sample(results.idtournament.unique().tolist(), 1)[0]
t_subset = results.loc[results.idtournament == tour, ['idteam', 'mask']]
team = random.sample(t_subset.idteam.tolist(), 1)[0]
quest = random.randint(0, int(tournaments.loc[tour, 'questions_total']))
quest_key = f"{str(tour)}_{str(quest)}"

In [29]:
def skill_init():
    return random.uniform(0, 10)
player_skills_dict = {id: skill_init() for id in roster.idplayer}
pd.DataFrame(player_skills_dict, index=[0]).T.head()

Unnamed: 0,0
19298,7.669761
7474,8.824228
15624,7.663667
32017,4.52626
13601,5.755676


In [27]:
def difficulty_init():
    return random.uniform(0, 10)
question_difficulty = {f"{str(tourn)}_{str(ques)}": difficulty_init() for tourn in tournaments.index 
                       for ques in range(int(tournaments.loc[tourn, 'questions_total']))}
pd.DataFrame(question_difficulty, index=[0]).T.head()

Unnamed: 0,0
3236_0,2.324919
3236_1,2.845083
3236_2,5.425041
3236_3,2.721184
3236_4,1.159253


In [28]:
def sigmoid(x):
    return 1 / (1 + pd.np.exp(-x))    

**Model**<br>
$p(y=1| S_{team}, \theta) = \sigma(S_{team} - \theta)$ <br>
$S_{team}$ - team skill<br>
$S_{team} = S_{player\_1} + S_{player\_2} + ... + S_{player\_N}$<br>
$\theta_j$ - difficulty of question j<br>
y - team answer, 1 for correct, 2 for incorrect<br>
$p(y_{hat}=y_{real}| S_{team}, \theta) = \sigma(S_{team} - \theta)^{y_{real}} * (1-\sigma(S_{team} - \theta))^{(1-y_{real})}$<br>
<br>
$LikelyHood = \frac{1}{N}\prod_{i=1}^{N} p(y_{hat\_i}=y_{real\_i}| S_{team}, \theta)$<br>
$log(LikelyHood) = LL = \frac{1}{N}\sum_{i=1}^{N} log(p(y_{hat\_i}=y_{real\_i}| S_{team}, \theta))=
y_{real}*log(\sigma(S_{team} - \theta)) + (1-y_{real})* log(1-\sigma(S_{team} - \theta))$<br>
<br>
$-log(LikelyHood) \rightarrow min$ w.r.t. $S_{player\_i}$ and $\theta_j$

**Derivatives:**

$\sigma'(x) = \sigma(x)*(1-\sigma(x))$

$\frac{\partial LL}{\partial \theta} = \frac{y_{real}}{\sigma(S_{team} - \theta)} * \sigma'(S_{team} - \theta) * (-1) +
\frac{1-y_{real}}{1-\sigma(S_{team} - \theta)} * (-\sigma'(S_{team} - \theta)) * (-1) = 
-y_{real}*(1-\sigma(S_{team} - \theta)) + (1-y_{real})*\sigma(S_{team} - \theta)
$<br>

$\frac{\partial LL}{\partial S_i} = \frac{y_{real}}{\sigma(S_{team} - \theta)} * \sigma'(S_{team} - \theta) +
\frac{1-y_{real}}{1-\sigma(S_{team} - \theta)} * (-\sigma'(S_{team} - \theta)) = 
y_{real}*(1-\sigma(S_{team} - \theta)) - (1-y_{real})*\sigma(S_{team} - \theta)$<br>


**Short derivatives:**

$\frac{\partial -LL}{\partial \theta} = +y_{real}*(1-\sigma(S_{team} - \theta)) - (1-y_{real})*\sigma(S_{team} - \theta)
$<br>

$\frac{\partial -LL}{\partial S_i} = -y_{real}*(1-\sigma(S_{team} - \theta)) + (1-y_{real})*\sigma(S_{team} - \theta)$<br>


**Gradient update**<br>
$S_{i\_new} = S_{i\_old} - learning\_rate * \frac{\partial LL}{\partial S_i}$<br>
$\theta_{new} = \theta_{old} - learning\_rate * \frac{\partial LL}{\partial \theta}$<br>

In [36]:
def gd(lr=0.01, epochs=10, steps_per_tour = 1):
    # get random tournament, team and question
    tour = random.sample(results.idtournament.unique().tolist(), 1)[0]
    t_subset = results.loc[results.idtournament == tour, ['idteam', 'mask']]
    team = random.sample(t_subset.idteam.tolist(), 1)[0]
    players = roster.loc[(roster.idteam == team) & (roster.idtournament == tour), 'idplayer'].tolist()
    quest = random.randint(0, int(tournaments.loc[tour, 'questions_total']))
    
    # get question difficulty
    quest_dif_key = f"{str(tour)}_{str(quest)}"
    difficulty = question_difficulty[quest_dif_key]
    
    # calculate team skill and sigmoid
    skills = [player_skills_dict[player] for player in players]
    team_skill = sum(skills)
    team_sigmoid = sigmoid(team_skill - difficulty)
    
    # get y (1 for correct answer, 0 for incorrect)
    y_true = int(t_subset.loc[t_subset.idteam == team, 'mask'].values[0][quest])
    
    # weights update
    question_difficulty[quest_dif_key] -= lr * (y_true * (1-team_sigmoid) - (1-y)*team_sigmoid)
    for player in players:
        player_skills_dict[player] -= lr * (-y_true * (1-team_sigmoid) + (1-y)*team_sigmoid)

SyntaxError: invalid syntax (<ipython-input-36-3ec95872f80e>, line 7)

In [71]:
t_subset.loc[:, 'mask'].str.replace('X', '')

90986    0000000110000010010010111010010000000000000010...
90987    0011011111110111111100111111100111011011011110...
90988    0111111101110111011101110000010110110011101110...
90989    0101011111110010011110110110010111111111101111...
90990    0101011111111100011001111011100110001011111110...
90991    0111110111111111111001100110100110111111111110...
90992    0111000111000110011000110011000000110000001000...
90993    0011100101100101011100111011010111100011111100...
90994    0101010101010110011001110010000110011110101000...
90995    0101000100100000011000000000000000100000001000...
90996    0110100100010010011000011000000000100000011010...
90997    0001000101100011011001010001010000100000001001...
90998    0101110111101111011001110110100111011011101010...
90999    0000000100000000010001000000000010100000000011...
91000    0010000010000010001000010000000000000000000000...
91001    0111110001010000011000110010100110000000000000...
91002    0101100111001011011010110011000100100000101001.

In [67]:
t_subset

Unnamed: 0,idteam,mask
90986,53,00000001100000100XXXXXXX100101110100100000XXXX...
90987,55,00110111111101111XXXXXXX111001111111001110XXXX...
90988,67,01111111011101110XXXXXXX111011100000101101XXXX...
90989,185,01010111111100100XXXXXXX111101101100101111XXXX...
90990,264,01010111111111000XXXXXXX110011110111001100XXXX...
90991,313,01111101111111111XXXXXXX110011001101001101XXXX...
90992,683,01110001110001100XXXXXXX110001100110000001XXXX...
90993,791,00111001011001010XXXXXXX111001110110101111XXXX...
90994,1021,01010101010101100XXXXXXX110011100100001100XXXX...
90995,1093,01010001001000000XXXXXXX110000000000000001XXXX...


In [63]:
len(results.loc[results.idtournament == tour, 'mask'].values[0])

72