In [73]:
%load_ext autoreload

%autoreload 2

import tools as t
import readdata as rm
import predictive_model as m
import evaluate as e

from lentil import models
from lentil import evaluate
from lentil import datatools

import pandas as pd
import numpy as np
import pickle
import math
import constants

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Read raw data and generate features into per-dataset files ('spanish_processed.txt', 'radical_processed.txt', 'chinese_processed.txt')

In [None]:
# Generate features for the Spanish dataset: input 'raw_data/spanish_data.csv',
# output name 'spanish_processed.txt'.
# NOTE(review): the four integers (0, 2, 1, 3) are presumably column indices in
# the raw CSV — confirm against readdata.getTrainingInstances.
rm.getTrainingInstances('raw_data/spanish_data.csv','spanish_processed.txt', 0, 2, 1, 3)

In [None]:
# Generate features for the radical-transfer dataset into 'radical_processed.txt'.
# NOTE(review): the integers (0, 3, 2, 4) differ from the Spanish call
# (0, 2, 1, 3); presumably the raw column layout differs — confirm.
rm.getTrainingInstances('raw_data/radical_transfer_2007.csv','radical_processed.txt', 0, 3, 2, 4)

In [None]:
# Generate features for the Chinese dataset into 'chinese_processed.txt'.
# NOTE(review): this is the only call that passes a seventh argument
# ('pickles/history.pkl'); its role is not visible from this notebook — confirm
# in readdata.getTrainingInstances.
rm.getTrainingInstances('raw_data/chinese-hundredthousand.csv','chinese_processed.txt', 0, 3, 2, 4, 'pickles/history.pkl')

Convert processed data into Interaction History object

In [None]:
# Parse 'spanish_processed.txt' into an interaction-history object, passing the
# names of the timestamp, student, module and outcome columns.
# NOTE(review): the last argument is the string 'True', not the boolean True —
# confirm this is what t.textToInteractionHistory expects.
spanish_history = t.textToInteractionHistory('spanish_processed.txt', 'timestamp', 'student_id','module_id','outcome', 'True')

# Cache the parsed history so later sessions can skip the parsing step.
t.savePickle(spanish_history, 'datasets/spanish.pkl')

In [None]:
# Parse 'radical_processed.txt' into an interaction-history object.
# NOTE(review): the last argument is the string 'True', not the boolean True.
radical_history = t.textToInteractionHistory('radical_processed.txt', 'timestamp', 'student_id','module_id','outcome', 'True')

# Unlike the other datasets, the radical data goes through t.filterHistory and
# it is the *filtered* result that gets pickled. filterHistory receives the
# `.data` attribute rather than the history object itself.
# NOTE(review): the return type of filterHistory is not visible here; later
# cells use the loaded pickle like the other history objects — confirm.
filtered_radical = t.filterHistory(radical_history.data)
t.savePickle(filtered_radical, 'datasets/radical.pkl')

In [None]:
# Parse 'chinese_processed.txt' into an interaction-history object.
# NOTE(review): the last argument is the string 'True', not the boolean True.
chinese_history = t.textToInteractionHistory('chinese_processed.txt', 'timestamp', 'student_id','module_id','outcome', 'True')

# Cache the parsed history as a pickle for the loading cells further down.
t.savePickle(chinese_history, 'datasets/chinese.pkl')

Load ready-to-go datasets from pickles

In [4]:
# Load the cached Spanish history ('datasets/spanish.pkl' is the path the
# conversion cell above saves to).
spanish_history = t.loadPickle('datasets/spanish.pkl')

In [5]:
# Load the cached radical history. As written by the conversion cell above,
# this pickle holds the output of t.filterHistory, not the unfiltered history.
radical_history = t.loadPickle('datasets/radical.pkl')

In [6]:
# Load the cached Chinese history (labelled 'CHINESE 2006' in the properties
# cell below).
chinese_history = t.loadPickle('datasets/chinese.pkl')

In [7]:
# Load the Chinese 2007 history.
# NOTE(review): no cell in the visible part of this notebook writes
# 'datasets/chinese_2007.pkl'; it must already exist on disk.
chinese_2007_history = t.loadPickle('datasets/chinese_2007.pkl')

In [8]:
# Load the Mnemosyne history.
# NOTE(review): no visible cell produces 'datasets/mnemosyne.pkl'; it is a
# prerequisite file.
mnemosyne_history = t.loadPickle('datasets/mnemosyne.pkl')

In [9]:
# Load the 'chinese_spring' dataset from its pickle.
# NOTE(review): `chinese_spring` is not referenced again in the visible part of
# the notebook, and no visible cell writes this pickle.
chinese_spring = t.loadPickle('datasets/chinese_spring.pkl')

In [10]:
# Load the French history.
# NOTE(review): 'datasets/french.pkl' is not generated by any visible cell.
french_history = t.loadPickle('datasets/french.pkl')

Check properties of data

In [None]:
# Inspect the Spanish history; 'SPANISH' is presumably the label used in the
# report — confirm in tools.getPropertiesOfData.
t.getPropertiesOfData(spanish_history, 'SPANISH')

In [None]:
# Inspect the radical history (the filtered version, if it came from
# 'datasets/radical.pkl'); 'RADICAL' is presumably the report label.
t.getPropertiesOfData(radical_history, 'RADICAL')

In [None]:
# Inspect the Chinese history loaded from 'datasets/chinese.pkl', reported
# under the label 'CHINESE 2006'.
t.getPropertiesOfData(chinese_history, 'CHINESE 2006')

In [None]:
# Inspect the Chinese 2007 history, reported under the label 'CHINESE 2007'.
t.getPropertiesOfData(chinese_2007_history, 'CHINESE 2007')

In [None]:
# Inspect the Mnemosyne history, reported under the label 'MNEMOSYNE'.
t.getPropertiesOfData(mnemosyne_history, 'MNEMOSYNE')

In [None]:
# Inspect the French history, reported under the label 'FRENCH'.
t.getPropertiesOfData(french_history, 'FRENCH')

Online Prediction Accuracy: Spanish, Radical and Chinese Datasets

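Training/testing split: every split below calls `t.splitHistory(..., 70)`, presumably 70% training / 30% testing. The earlier "Training population: 20%; Testing population: 80%" description does not match the code; confirm the meaning of the argument in `tools.splitHistory`.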

In [11]:
# Split the Chinese history into a training part and a testing part.
# NOTE(review): 70 is presumably the training percentage — confirm against
# tools.splitHistory. Whether the split is randomised is not visible here.
train_chinese, test_chinese = t.splitHistory(chinese_history, 70)

In [12]:
# Train/test split of the Spanish history, using the same split argument (70)
# as the other datasets.
train_spanish, test_spanish = t.splitHistory(spanish_history, 70)

In [13]:
# Train/test split of the radical history (split argument 70).
# NOTE(review): `radical_history` loaded from the pickle is the output of
# t.filterHistory — confirm t.splitHistory accepts that type.
train_radical, test_radical = t.splitHistory(radical_history, 70)

In [14]:
# Train/test split of the Chinese 2007 history (split argument 70). This is the
# pair selected as `train` / `test` in the "Set train and test datasets" cell.
train_chinese_2007, test_chinese_2007 = t.splitHistory(chinese_2007_history, 70)

In [15]:
# Train/test split of the Mnemosyne history (split argument 70).
train_mnemosyne, test_mnemosyne = t.splitHistory(mnemosyne_history, 70)

In [16]:
# Train/test split of the French history (split argument 70).
train_french, test_french = t.splitHistory(french_history, 70)

Set train and test datasets

In [17]:
# Choose the dataset every downstream cell works on. `total`, `train` and
# `test` are the only names the training/evaluation cells read, so switching
# dataset means editing this one line. The results filename
# ('results/chinese_2007_5.pkl') and the plot title ('Chinese 2007') further
# down are hard-coded separately and must be changed to match.
total, train, test = chinese_2007_history, train_chinese_2007, test_chinese_2007

Train models

In [18]:
# EFC variant 1: strength_var='numreviews', using_delay=True, fitted on the
# training frame only. The recorded run emitted a LAPACK 'gelsd' -> 'gelss'
# fallback warning during this fit (see the output below).
efc_model_1 = e.meta_train_efc(train.data, using_delay=True, strength_var = 'numreviews')


internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.



In [19]:
# EFC variant 2: strength_var='ml', fitted on the training frame.
# NOTE(review): this is the only EFC variant trained with using_delay=False —
# confirm that is intentional rather than a leftover from an experiment.
efc_model_2 = e.meta_train_efc(train.data, using_delay=False, strength_var = 'ml')

In [20]:
# EFC variant 3: strength_var='expo', using_delay=True, fitted on the training
# frame.
efc_model_3 = e.meta_train_efc(train.data, using_delay=True, strength_var = 'expo')

In [21]:
# EFC variant 4: strength_var='correct', using_delay=True, fitted on the
# training frame.
efc_model_4 = e.meta_train_efc(train.data, using_delay=True, strength_var = 'correct')

In [34]:
# 1PL model. Unlike every other model in this section, it is fitted on
# `total.data` (passed for both arguments) rather than `train.data`.
# NOTE(review): if `test` is a subset of `total`, the later getMetrics call on
# test.data scores interactions the model has already seen. This may be
# deliberate (e.g. to give every student a parameter) — confirm, and say so
# next to the reported number.
onepl_model = e.train_onepl(total.data, total.data)

In [None]:
# Random baseline from e.train_random. The training frame is passed for both
# positional arguments; the role of the second one is not visible here.
random_model = e.train_random(train.data, train.data)

In [28]:
# Percentage baseline from e.train_percentage, given the training frame as both
# positional arguments.
percentage_model = e.train_percentage(train.data, train.data)

In [29]:
# Logistic-regression model with using_time=False, fitted on the training frame
# (passed for both positional arguments).
logistic_model_notime = e.train_logistic(train.data, train.data, using_time=False)

In [30]:
# Logistic-regression model with using_time=True; otherwise identical to the
# no-time variant's training call.
logistic_model_time = e.train_logistic(train.data, train.data, using_time=True)

Evaluate EFC Models

In [None]:
# AUC of EFC variant 1. `evaluate` here is lentil's module (the project's own
# evaluate module is aliased to `e`). Despite the helper's name, the held-out
# `test` history is what is passed.
# NOTE(review): the third argument is presumably a plot-ROC flag — confirm.
evaluate.training_auc(efc_model_1, test, True)

In [None]:
# AUC of EFC variant 2 (strength_var='ml') on the `test` history, via lentil's
# evaluate.training_auc.
evaluate.training_auc(efc_model_2, test, True)

In [None]:
# AUC of EFC variant 3 (strength_var='expo') on the `test` history, via
# lentil's evaluate.training_auc.
evaluate.training_auc(efc_model_3, test, True)

In [None]:
# AUC of EFC variant 4 (strength_var='correct') on the `test` history, via
# lentil's evaluate.training_auc.
evaluate.training_auc(efc_model_4, test, True)

In [None]:
# AUC of both logistic-regression variants on the `test` history, with the
# third argument set to False (the EFC cells pass True).
# print is called with parentheses around a single argument: the output is the
# same under Python 2, and the cell is no longer a SyntaxError under Python 3.
print(evaluate.training_auc(logistic_model_time, test, False))
print(evaluate.training_auc(logistic_model_notime, test, False))

Evaluate Other Models

In [None]:
# AUC of the logistic-regression baseline on the `test` history.
# The original call referenced `logistic_model`, which no cell defines (the
# training cells create only `logistic_model_notime` and `logistic_model_time`),
# so it raised NameError on a fresh kernel. The no-time variant is used here;
# it is presumably the one the results plot labels plain 'LR' (next to
# 'LR TIME') — confirm.
evaluate.training_auc(logistic_model_notime, test)

In [None]:
# AUC of the two baselines on the `test` history.
# NOTE(review): `random_model` comes from a training cell with no execution
# count, so it must be run first; `perc_auc` and `random_auc` are not read
# again in the visible part of the notebook.
perc_auc = evaluate.training_auc(percentage_model, test, True)
random_auc = evaluate.training_auc(random_model, test, True)

In [38]:
# Metrics for the 1PL model on the test frame; the recorded output is a 2x2
# matrix followed by a single score.
# NOTE(review): onepl_model was fitted on total.data, so if `test` is a subset
# of `total` this is not a held-out score.
e.getMetrics(onepl_model, test.data)

[[ 2040  2992]
 [  653 20556]]
0.687308294668


In [48]:
# Metrics for EFC variant 1 (strength_var='numreviews') on the test frame; the
# recorded output is a 2x2 matrix followed by a single score.
e.getMetrics(efc_model_1, test.data)

[[  792  4240]
 [ 1842 19367]]
0.535271382301


In [49]:
# Metrics for EFC variant 2 (strength_var='ml', using_delay=False) on the test
# frame.
e.getMetrics(efc_model_2, test.data)

[[ 1312  3720]
 [  729 20480]]
0.613179559537


In [50]:
# Metrics for EFC variant 3 (strength_var='expo') on the test frame.
e.getMetrics(efc_model_3, test.data)

[[ 3192  1840]
 [ 5297 15912]]
0.692293879499


In [45]:
# Metrics for the logistic model trained with using_time=False, on the test
# frame.
e.getMetrics(logistic_model_notime, test.data)

[[ 1575  3457]
 [  416 20793]]
0.646691252836


In [46]:
# Metrics for the logistic model trained with using_time=True, on the test
# frame.
# NOTE(review): in the recorded output the first column of the 2x2 matrix is
# all zeros and the score is exactly 0.5, i.e. every test interaction received
# the same prediction. The model looks degenerate — investigate the time
# feature (scaling / parsing) before reporting this number.
e.getMetrics(logistic_model_time, test.data)

[[    0  5032]
 [    0 21209]]
0.5


In [127]:
# Labels: each recorded outcome rounded to the nearest integer.
Y = [int(round(outcome)) for outcome in total.data['outcome']]

# Feature frame restricted to the eight hand-picked feature columns.
x = total.data[['time_elapsed', 'history_correct', 'history_wrong', 'exponential', 'right_streak','wrong_streak', 'average_outcome', 'average_time']]


def _parse_feature_column(raw_values):
    """Parse one feature column of numeric strings into a list of numbers.

    The whole column becomes floats if ANY entry contains a decimal point,
    otherwise ints. The previous code looked only at the entry at index 1,
    which raised IndexError for columns with fewer than two entries and
    ValueError when that entry was integral but another one was not.
    """
    raw_values = list(raw_values)
    parse = float if any('.' in value for value in raw_values) else int
    return [parse(value) for value in raw_values]


# One parsed list per name in constants.FEATURE_NAMES, in that order.
# NOTE(review): every name in constants.FEATURE_NAMES must be one of the
# columns selected into `x` above, otherwise the lookup raises KeyError.
fv = [_parse_feature_column(x[feature_name]) for feature_name in constants.FEATURE_NAMES]

# Sanity check: number of rows in the sixth feature column.
print(len(fv[5]))

81199


Generate Results

In [None]:
# Run the full evaluation on the selected dataset (`total`, not the fixed
# train/test split above).
# NOTE(review): 5 is presumably the number of folds or repetitions (it also
# appears in the results filename) and True an option flag — confirm in the
# project's evaluate.getResults.
results = e.getResults(total, 5, True)

In [None]:
# Persist the results. The filename is hard-coded for the Chinese 2007 dataset,
# so update it whenever `total` is pointed at a different dataset.
t.savePickle(results, 'results/chinese_2007_5.pkl')

Read Results

In [None]:
# Reload previously saved results so the summary below can be produced without
# re-running the evaluation. This rebinds `results`.
results = t.loadPickle('results/chinese_2007_5.pkl')

In [None]:
# Summarise the results for the eight listed models under the 'AUC' metric,
# titled 'Chinese 2007'.
# NOTE(review): the label list presumably has to match the keys/order produced
# by e.getResults, and the meaning of the trailing False is not visible here —
# confirm in the project's evaluate.overallAccuracy.
e.overallAccuracy(['EFC ML', 'EFC REVIEWS', 'EFC CORRECT', 'EFC EXPO','LR TIME','LR','PERC','IRT'], results, 'AUC', 'Chinese 2007', False)