In [52]:
import numpy as np
import random


N = 10
M = 20
R = 3  # Dimension of the parameters
ALPHA = 0.4
BETA = 0.1


diffs = np.abs(np.random.rand(M, R))

def proba(theta, diff):
    return 1 / (1 + exp(-theta.dot(diff)))

def reg_relu(x):
    return np.log(np.exp(x) + 1)


# Data 1: Données (user_id, question_id, outcome) (en fonction du timestamp)
# Déterminer Learner's alpha et bêta

# Asker fait son planning
# Et cherche à poser des questions à un Learner
# De manière à optimiser son reward
# Mieux que le ferait greedy qui pose les questions qui maximisent la trace de la Fisher information (pof)

# Learner connaît thêta
# Asker le connaît pas
# À la fin du jeu : on apprend le reward


class Learner:
    def __init__(self):
        self.initial_theta = np.random.uniform(-1, 1, size=R)
        self.theta = np.copy(self.initial_theta)
        print('L true theta:', self.theta)

    def reply(self, question_id):
        diff = diffs[question_id]
        p = proba(self.theta, diff)
        print('L proba:', p)
        outcome = random.random() < p
        if outcome:
            # Wait que se passe-t-il si la question a des paramètres négatifs ; faudrait pas baisser le niveau de l'apprenant
            # Est-ce qu'on n'augmente que les composantes qui sont plus grandes ?
            self.theta += ALPHA * reg_relu(diff - self.theta)
            print('OK! ++ new theta:', self.theta)
            print('progress', self.theta - self.initial_theta)
        else:
            # Si ça se trouve, tu apprends moins quand tu réussis que quand tu ne réussis pas ; mais on verra
            self.theta += BETA * reg_relu(diff - self.theta)
            print('NOK. + new theta:', self.theta)
            print('progress', self.theta - self.initial_theta)
        return outcome


class Asker:
    def __init__(self, learner):
        self.learner = learner
        self.est_theta = np.random.uniform(-1, 1, size=R)
        self.history = []
        self.reward = 0
    
    def update(self):
        self.est_theta.append(estimate(self.history))  # Not defined yet

    def ask(self, question_id):
        diff = diffs[question_id]
        print('A ask diff:', diff)
        outcome = self.learner.reply(question_id)
        self.history.append((question_id, outcome))
        if outcome:
            self.est_theta += ALPHA * reg_relu(diff - self.est_theta)
            self.reward += ALPHA * (diff - self.est_theta)
        else:
            self.est_theta += BETA * (diff - self.est_theta)
            self.reward += BETA * (diff - self.est_theta)
        print('A est_theta', self.est_theta)
        print('A reward', self.reward)


def scenario():
    learner = Learner()
    asker = Asker(learner)
    for question_id in range(5):
        asker.ask(question_id)

In [53]:
scenario()

L true theta: [-0.13070304 -0.26411359  0.20363993]
A ask diff: [ 0.8669205   0.79360646  0.85838177]
L proba: 0.4630398939494834
NOK. + new theta: [ 0.00044946 -0.12853529  0.31095726]
progress [ 0.13115249  0.1355783   0.10731733]
A est_theta [-0.41337849  0.90749926 -0.51003601]
A reward [ 0.1280299  -0.01138928  0.13684178]
A ask diff: [ 0.47208788  0.58392655  0.53579816]
L proba: 0.5229257499595928
OK! ++ new theta: [ 0.38305657  0.31607675  0.63570668]
progress [ 0.51375961  0.58019034  0.43206675]
A est_theta [ 0.07895869  1.12525588  0.02875368]
A reward [ 0.28528157 -0.22792101  0.33965957]
A ask diff: [ 0.76009355  0.11796853  0.56032348]
L proba: 0.6647751395832818
NOK. + new theta: [ 0.47298967  0.37597584  0.70132326]
progress [ 0.60369271  0.64008943  0.49768332]
A est_theta [ 0.14707218  1.02452714  0.08191066]
A reward [ 0.34658371 -0.31857687  0.38750086]
A ask diff: [ 0.67568513  0.24376345  0.23831484]
L proba: 0.6406938671750343
OK! ++ new theta: [ 0.7928384   0.62