In [None]:
# Half-Life as implemented by Duolingo 

In [1]:
import math
import numpy as np
from collections import defaultdict, namedtuple
from sys import intern
import pandas as pd
import os

In [2]:
current_dir = os.getcwd()

filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))

chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    chunk.drop_duplicates(inplace=True)
    chunk.dropna(inplace=True)
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)
df_users = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'users_behaviur.csv')))
df_words = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'word_complexity_features.csv')), sep='\t')

df_1 = df.merge(df_words, on = 'lexeme_id', how='inner')
df_1['lang_combination'] = df_1['ui_language']+ '-' + df_1['learning_language']
df_2 = df_1.merge(df_users, on = ['user_id', 'lang_combination'], how='inner')
dff = df_2.drop(columns=['timestamp', 'lexeme_id', 'word', 'user_id', 'POS', 'person', 'number', 'gender', 'tense', 'def', 'session_seen', 'session_correct'])

In [138]:
# Constants
min_half_life = 15.0 / (24 * 60)  # 15 minutes in days
max_half_life = 274.0            # 9 months
LN2 = math.log(2)

# Utility functions
def pclip(p):
    """Clip recall probability to avoid numerical issues."""
    return min(max(p, 0.0001), 0.9999)

def hclip(h):
    """Clip half-life to a reasonable range."""
    return min(max(h, min_half_life), max_half_life)

def mae(l1, l2):
    # mean average error
    return mean([abs(l1[i] - l2[i]) for i in range(len(l1))])

def mean(lst):
    # the average of a list
    return float(sum(lst))/len(lst)

274.0


In [4]:
dff.columns

Index(['p_recall', 'delta', 'learning_language', 'ui_language', 'history_seen',
       'history_correct', 'h_recall', 'lang_combination', 'word_len',
       'tags_list', 'SUBTLEX', 'avg_user_p_recall', 'avg_delta', 'std_delta',
       'avg_h_recall'],
      dtype='object')

In [8]:
# Changes to dataset before fitting 
dff['p_recall'] = pclip(dff['p_recall'])
dff['delta'] = dff['delta']/(60*60*24) # convert time delta to days
dff['avg_delta'] = dff['avg_delta']/(60*60*24) 
dff['std_delta'] = dff['std_delta']/(60*60*24) 
dff['half_life'] = hclip(-dff['delta']/np.log2(dff['p_recall']))

tag_counts = dff['tags_list'].value_counts()
rare_threshold = 1000
dff['tags_list'] = dff['tags_list'].apply(lambda x: x if tag_counts[x] > rare_threshold else 'rare')


dff_final = dff.drop(columns=['learning_language_y', 'ui_language_y', 'learning_language_x', 'ui_language_x', 'avg_user_p_recall'], errors='ignore')
dff_final.dropna(inplace=True)

In [9]:
dff_final.head()

Unnamed: 0,p_recall,delta,learning_language,ui_language,history_seen,history_correct,h_recall,lang_combination,word_len,tags_list,SUBTLEX,avg_delta,std_delta,avg_h_recall,half_life
0,0.9999,5.1436,es,en,3,3,1.0,en-es,4,['pr'],111241.0,2475405.0,2879771.0,0.954897,274.0
1,0.9999,0.069016,de,en,8,6,0.75,en-de,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,3104.417,2977.079,0.890225,274.0
2,0.75,0.069016,de,en,6,5,0.833333,en-de,3,"['det', 'def', 'f', 'sg']",2484854.0,3104.417,2977.079,0.890225,0.166289
3,0.888889,0.069016,de,en,6,5,0.833333,en-de,4,"['n', 'm', 'sg']",222707.0,3104.417,2977.079,0.890225,0.406157
4,0.8,0.069016,de,en,8,6,0.75,en-de,4,"['n', 'f', 'sg']",143725.0,3104.417,2977.079,0.890225,0.214384


In [146]:
Instance = namedtuple('Instance', 'p_recall delta fv half_life'.split())

def create_instances_from_dataframe(df):
    instances = []
    for _, row in df.iterrows():
        # Build the feature vector for this row
        fv = []
        # fv.append((intern('history_seen'), np.sqrt(1 + row['history_seen'])))
        # fv.append((intern('history_correct'), np.sqrt(1 + row['history_correct'])))
        fv.append((intern('h_recall'), row['h_recall']))
        # fv.append((intern('word_len'), row['word_len']))
        # fv.append((intern('lang_comb:' + row['lang_combination']), 1.0))
        # fv.append((intern('avg_delta'), row['avg_delta']))
        # fv.append((intern('SUBTLEX'), row['SUBTLEX']))
        # fv.append((intern('std_delta'), row['std_delta']))
        # fv.append((intern('avg_h_recall'), row['avg_h_recall']))
        # fv.append((intern('tags_list:' + row['tags_list']), 1.0))

        instance = Instance(
            p_recall=row['p_recall'],
            delta=row['delta'],
            fv=fv,
            half_life=row['half_life']
        )

        instances.append(instance)
        
    splitpoint = int(0.8 * len(instances))
    return instances[:splitpoint], instances[splitpoint:]

In [147]:
from collections import defaultdict
import random
import math

class HalfLifeRegression:
    def __init__(self, learning_rate=0.001, hlwt=0.01, l2wt=0.1, sigma=1., initial_weights=None):
        self.weights = defaultdict(float)  # Feature weights
        self.fcounts = defaultdict(int)    # Feature counts for adaptive learning rates
        self.learning_rate = learning_rate # Base learning rate
        self.hlwt = hlwt                   # Weight for half-life loss
        self.l2wt = l2wt                   # L2 regularization weight
        self.sigma = sigma                 # Sigma value for L2 regularization
        if initial_weights is not None:
            self.weights.update(initial_weights)

    def halflife(self, inst):
        """Compute predicted half-life based on feature vector."""
        try:
            dp = sum([self.weights[k] * x_k for (k, x_k) in inst.fv])  # where inst.fv is the feature vector
            # dp = np.clip(dp, -50, 50)
            return hclip(2 ** dp)  
        except Exception as e:
            return max_half_life  # Return a default max value if an error occurs

    def predict(self, inst):
        """Predict recall probability and half-life."""
        h_pred = self.halflife(inst)
        p_pred = 2 ** (-inst.delta / h_pred)  # Calculate recall probability
        return pclip(p_pred), h_pred  # Clip probabilities within bounds

    
    def train_update(self, inst):
        """Update weights using one training instance."""
        p_pred, h_pred = self.predict(inst)

        # Compute gradients
        dlp_dw = 2 * (p_pred - inst.p_recall) * (LN2 ** 2) * p_pred * (inst.delta / h_pred)
        dlh_dw = 2 * (h_pred - inst.half_life) * LN2 * h_pred

        # Update weights
        for (k, x_k) in inst.fv:
            rate = (1. / (1 + inst.p_recall)) * self.learning_rate / math.sqrt(1 + self.fcounts[k])
            self.weights[k] -= rate * dlp_dw * x_k  # Update for recall probability loss
            self.weights[k] -= rate * self.hlwt * dlh_dw * x_k  # Update for half-life loss
            self.weights[k] -= rate * self.l2wt * self.weights[k] / self.sigma**2  # L2 regularization
            self.fcounts[k] += 1


    def train(self, trainset):
        random.shuffle(trainset)  # Shuffle the training set
        for inst in trainset:
            self.train_update(inst)

    def losses(self, inst):
        p_pred, h_pred = self.predict(inst)
        slp = (inst.p_recall - p_pred)**2
        slh = (inst.half_life - h_pred)**2
        return slp, slh, p_pred, h_pred

    def evaluate(self, testset):
        """Evaluate the model on a test dataset."""
        results = {'p': [], 'h': [], 'pp': [], 'hh': [], 'slp': [], 'slh': []}
        for inst in testset:
            slp, slh, p_pred, h_pred = self.losses(inst)

            results['p'].append(inst.p_recall)
            results['h'].append(inst.half_life)
            results['pp'].append(p_pred)
            results['hh'].append(h_pred)
            results['slp'].append(slp)
            results['slh'].append(slh)

        mae_p = mae(results['p'], results['pp'])
        mae_h = mae(results['h'], results['hh'])
        total_slp = sum(results['slp'])
        total_slh = sum(results['slh'])
        total_l2 = sum([x ** 2 for x in self.weights.values()])
        total_loss = total_slp + self.hlwt * total_slh + self.l2wt * total_l2

        print(f"SLP Loss: {total_slp}, SLH Loss: {total_slh}, MAE_P: {mae_p}, MAE_H: {mae_h}, Total Loss: {total_loss}")
    


In [149]:
trainset, testset = create_instances_from_dataframe(dff)
model = HalfLifeRegression()
model.train(trainset)
model.evaluate(testset)

SLP Loss: 701946.193645011, SLH Loss: 97000960736.84932, MAE_P: 0.3644486788817714, MAE_H: 154.11040641940403, Total Loss: 970711553.599706
