In [None]:
# Half-Life Regression (HLR) as implemented by Duolingo

In [None]:
import math
import os
from collections import defaultdict

import numpy as np
import pandas as pd

In [None]:
# Constants (half-lives are expressed in days)
MIN_HALF_LIFE = 15.0 / (60 * 24)  # shortest allowed half-life: 15 minutes
MAX_HALF_LIFE = 274.0             # longest allowed half-life: about 9 months
LN2 = math.log(2.0)               # natural logarithm of 2

# Utility functions
def pclip(p):
    """Bound a recall probability to the closed interval [0.0001, 0.9999].

    Values below the lower bound are raised to it, values above the upper
    bound are lowered to it, and everything in between is returned unchanged.
    """
    lower, upper = 0.0001, 0.9999
    if p < lower:
        return lower
    if p > upper:
        return upper
    return p

def hclip(h):
    """Bound a half-life to the interval [MIN_HALF_LIFE, MAX_HALF_LIFE].

    Values inside the interval are returned unchanged.
    """
    if h < MIN_HALF_LIFE:
        return MIN_HALF_LIFE
    if h > MAX_HALF_LIFE:
        return MAX_HALF_LIFE
    return h

def compute_half_life(p_recall, delta_t):
    """Compute the target half-life implied by a recall probability.

    Solves ``p = 2 ** (-delta_t / h)`` for ``h``, i.e.
    ``h = -delta_t / log2(p)``, and clips the result with ``hclip``.

    Args:
        p_recall: Observed recall probability. It is clipped with ``pclip``
            before the logarithm is taken, so 0.0 and 1.0 are accepted.
        delta_t: Time difference, in the same unit as the half-life.

    Returns:
        The half-life, limited to the range enforced by ``hclip``.
    """
    # log2(1.0) == 0 would cause a division by zero and log2(0.0) is a math
    # domain error, so keep the probability strictly inside (0, 1) first.
    p = pclip(p_recall)
    return hclip(-delta_t / math.log2(p))


In [None]:
# Resolve data locations relative to the notebook's working directory.
current_dir = os.getcwd()
filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))

# Read the processed data in chunks, discarding repeated rows and rows with
# missing values chunk by chunk to keep the intermediate frames small.
chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    chunks.append(chunk.drop_duplicates().dropna())

# Per-chunk de-duplication only sees rows inside a single chunk; a final pass
# over the concatenated frame removes duplicates that fall in different chunks.
df = pd.concat(chunks, ignore_index=True).drop_duplicates(ignore_index=True)

# Feature tables; the word-complexity file is tab-separated.
# NOTE(review): 'users_behaviur.csv' looks like a misspelling of "behaviour" —
# confirm it matches the actual file name on disk before renaming anything.
df_users = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'users_behaviur.csv')))
df_words = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'word_complexity_features.csv')), sep='\t')

In [None]:
# Attach the word-level table first, then the user-level table. Both joins are
# inner joins, so rows without a match in a feature table are dropped.
df_1 = df.merge(right=df_words, how='inner', on='lexeme_id')
df_2 = df_1.merge(
    right=df_users,
    how='inner',
    on=['user_id', 'lang_combination'],
)

In [None]:
# Remove the listed columns from the merged frame; every other column is kept.
dff = df_2.drop(
    columns=[
        'timestamp',
        'lexeme_id',
        'word',
        'user_id',
        'POS',
        'person',
        'number',
        'gender',
        'tense',
        'def',
    ]
)

In [None]:
# Define the HLR Model
class HalfLifeRegression:
    """Half-life regression (HLR) model trained with per-instance gradient steps.

    The predicted half-life is ``h = 2 ** (w . x)`` (clipped with ``hclip``)
    and the predicted recall probability after ``delta_t`` is
    ``p = 2 ** (-delta_t / h)`` (clipped with ``pclip``). A feature vector
    ``fv`` is an iterable of ``(feature_key, value)`` pairs.
    """

    def __init__(self, learning_rate=0.001, hlwt=0.01, l2wt=0.1):
        """Create an untrained model.

        Args:
            learning_rate: Base learning rate, decayed per feature.
            hlwt: Weight of the half-life loss term in the update.
            l2wt: Weight of the L2 regularization term in the update.
        """
        self.weights = defaultdict(float)  # Feature weights
        self.fcounts = defaultdict(int)    # Feature counts for adaptive learning rates
        self.learning_rate = learning_rate # Base learning rate
        self.hlwt = hlwt                   # Weight for half-life loss
        self.l2wt = l2wt                   # L2 regularization weight

    def halflife(self, fv):
        """Compute the predicted half-life for a feature vector.

        Args:
            fv: Iterable of ``(feature_key, value)`` pairs.

        Returns:
            ``2 ** (w . x)`` clipped with ``hclip``.
        """
        dp = sum(self.weights[k] * x_k for k, x_k in fv)
        try:
            return hclip(2 ** dp)
        except OverflowError:
            # 2 ** dp does not fit in a float for large dp. The true value lies
            # above any upper bound, so return whatever hclip caps infinity to.
            return hclip(math.inf)

    def predict(self, fv, delta_t):
        """Predict recall probability and half-life.

        Args:
            fv: Iterable of ``(feature_key, value)`` pairs.
            delta_t: Time difference, in the same unit as the half-life.

        Returns:
            Tuple ``(p, h)``: the clipped recall probability and the clipped
            half-life.
        """
        h = self.halflife(fv)
        p = 2 ** (-delta_t / h)
        return pclip(p), h

    def train_instance(self, p_true, delta_t, fv):
        """Update the weights in place using one training instance.

        Args:
            p_true: Observed recall probability; clipped with ``pclip`` so that
                exact 0.0 or 1.0 observations have a finite target half-life.
            delta_t: Time difference, in the same unit as the half-life.
            fv: Iterable of ``(feature_key, value)`` pairs.
        """
        # Without clipping, p_true == 1.0 makes log2(p) zero (division by
        # zero) and p_true == 0.0 is outside the logarithm's domain.
        p_true = pclip(p_true)
        h_true = compute_half_life(p_true, delta_t)
        p_pred, h_pred = self.predict(fv, delta_t)

        # Gradients of the two squared losses with respect to the dot product
        # w . x; multiplying by x_k below gives the gradient for weight k.
        dlp_dw = 2 * (p_pred - p_true) * (LN2 ** 2) * p_pred * (delta_t / h_pred)
        dlh_dw = 2 * (h_pred - h_true) * LN2 * h_pred

        # Update weights
        for k, x_k in fv:
            # Features seen more often take smaller steps.
            rate = self.learning_rate / math.sqrt(1 + self.fcounts[k])
            self.weights[k] -= rate * dlp_dw * x_k  # Update for recall probability loss
            self.weights[k] -= rate * self.hlwt * dlh_dw * x_k  # Update for half-life loss
            self.weights[k] -= rate * self.l2wt * self.weights[k]  # L2 regularization
            self.fcounts[k] += 1

    def train(self, dataset):
        """Train the model with one pass over the dataset.

        Args:
            dataset: Iterable of mappings with keys ``'p'``, ``'delta_t'`` and
                ``'fv'``.
        """
        for instance in dataset:
            self.train_instance(instance['p'], instance['delta_t'], instance['fv'])

    def evaluate(self, dataset):
        """Print summed squared losses for recall probability and half-life.

        Args:
            dataset: Iterable of mappings with keys ``'p'``, ``'delta_t'`` and
                ``'fv'``.
        """
        total_slp, total_slh = 0, 0
        for instance in dataset:
            # Clip the observation the same way train_instance does so that
            # p == 0.0 or 1.0 yields a finite target half-life.
            p_true = pclip(instance['p'])
            p_pred, h_pred = self.predict(instance['fv'], instance['delta_t'])
            slp = (p_true - p_pred) ** 2
            slh = (compute_half_life(p_true, instance['delta_t']) - h_pred) ** 2
            total_slp += slp
            total_slh += slh
        print(f"SLP Loss: {total_slp}, SLH Loss: {total_slh}")

In [None]:
# NOTE(review): `dataset` is not created in any of the cells shown here —
# confirm where it is built (train/evaluate expect an iterable of mappings
# with 'p', 'delta_t' and 'fv' keys).
# The same data is used for training and evaluation, so the printed losses
# are in-sample.
model = HalfLifeRegression(learning_rate=0.001, hlwt=0.01, l2wt=0.1)
model.train(dataset)
model.evaluate(dataset)