## Combining dataframes and model fitting


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import csv files
train = pd.read_csv(
    'train.csv', 
    low_memory=False, 
    nrows=10**6, 
    dtype={
        'row_id': 'int64', 
        'timestamp': 'int64', 
        'user_id': 'int32', 
        'content_id': 'int16', 
        'content_type_id': 'int8',
        'task_container_id': 'int16', 
        'user_answer': 'int8', 
        'answered_correctly': 'int8', 
        'prior_question_elapsed_time': 'float32', 
        'prior_question_had_explanation': 'boolean'
    }
)
question = pd.read_csv('questions.csv')
lecture = pd.read_csv('lectures.csv')
sample_test = pd.read_csv('example_test.csv')

In [21]:
train = train[train.content_type_id ==0]
train

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False
...,...,...,...,...,...,...,...,...,...,...
999995,999995,26482248,20949024,8803,0,29,1,1,14000.0,True
999996,999996,26516686,20949024,4664,0,30,3,1,17000.0,True
999997,999997,26537967,20949024,4108,0,31,1,0,18000.0,True
999998,999998,26590240,20949024,5014,0,32,3,0,6000.0,True


In [37]:
# get aggregated data for each user_id
std_accu = train[train.answered_correctly != -1].groupby('user_id')['answered_correctly'].mean().reset_index()
time_total = train[train.answered_correctly != -1].groupby('user_id')['prior_question_elapsed_time'].max().reset_index()
action_total = train.groupby('user_id')['content_id'].count().reset_index()

In [45]:
std_accu.columns = ('user_id', 'std_accu')
time_total.columns = ('user_id', 'time_total')
action_total.columns = ('user_id', 'action_total')
time_total

Unnamed: 0,user_id,time_total
0,115,55000.0
1,124,33333.0
2,2746,28000.0
3,5382,201000.0
4,8623,95000.0
...,...,...
3819,20913319,154000.0
3820,20913864,40000.0
3821,20938253,243000.0
3822,20948951,47000.0


In [39]:
std_accu

Unnamed: 0,user_id,std_accu
0,115,0.695652
1,124,0.233333
2,2746,0.578947
3,5382,0.672000
4,8623,0.642202
...,...,...
3819,20913319,0.630653
3820,20913864,0.333333
3821,20938253,0.608779
3822,20948951,0.607843


In [2]:
question.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [19]:
# get accuracy rate for each question
train_q = train[train.content_type_id == 0]
# train_q['answered_correctly'].min()
accu_q = train_q.groupby('content_id').answered_correctly.mean().reset_index()
accu_q

Unnamed: 0,content_id,answered_correctly
0,0,0.863014
1,1,0.927273
2,2,0.560811
3,3,0.798995
4,4,0.602606
...,...,...
13071,13518,0.750000
13072,13519,0.555556
13073,13520,0.700000
13074,13521,0.857143


In [29]:
train_exp = train[['user_id', 'timestamp',
                  'content_id', 'answered_correctly', 
                  'prior_question_elapsed_time', 'prior_question_elapsed_time']]


In [46]:
# join on user_id

train_exp = train_exp.join(std_accu.set_index('user_id'), on= 'user_id')

In [47]:
train_exp = train_exp.join(time_total.set_index('user_id'), on= 'user_id')

In [48]:
train_exp = train_exp.join(action_total.set_index('user_id'), on='user_id')
train_exp

Unnamed: 0,user_id,timestamp,content_id,answered_correctly,prior_question_elapsed_time,prior_question_elapsed_time.1,std_accu,time_total,action_total
0,115,0,5692,1,,,0.695652,55000.0,46
1,115,56943,5716,1,37000.0,37000.0,0.695652,55000.0,46
2,115,118363,128,1,55000.0,55000.0,0.695652,55000.0,46
3,115,131167,7860,1,19000.0,19000.0,0.695652,55000.0,46
4,115,137965,7922,1,11000.0,11000.0,0.695652,55000.0,46
...,...,...,...,...,...,...,...,...,...
999995,20949024,26482248,8803,1,14000.0,14000.0,0.312500,94000.0,48
999996,20949024,26516686,4664,1,17000.0,17000.0,0.312500,94000.0,48
999997,20949024,26537967,4108,0,18000.0,18000.0,0.312500,94000.0,48
999998,20949024,26590240,5014,0,6000.0,6000.0,0.312500,94000.0,48


In [50]:
train_sample = train_exp.sample(n=50000, random_state=1)
train_sample

Unnamed: 0,user_id,timestamp,content_id,answered_correctly,prior_question_elapsed_time,prior_question_elapsed_time.1,std_accu,time_total,action_total
543389,11038066,14199666914,1253,1,18000.0,18000.0,0.702160,142000.0,2592
55899,1283420,6287796324,10655,1,20000.0,20000.0,0.828785,246000.0,7476
105089,2223671,491002068,6594,1,31750.0,31750.0,0.736301,79000.0,292
679880,13832598,1559002610,6004,0,31000.0,31000.0,0.550000,125000.0,380
782731,15956793,18808024091,258,1,19000.0,19000.0,0.672868,300000.0,645
...,...,...,...,...,...,...,...,...,...
553486,11412125,426685315,9704,1,39000.0,39000.0,0.663617,279000.0,3826
350138,6812883,9996944595,4927,1,37000.0,37000.0,0.489549,296000.0,2727
698978,14442037,6545038401,383,1,18000.0,18000.0,0.551036,199000.0,1352
486011,9418512,14831215922,7449,1,40666.0,40666.0,0.608869,300000.0,7261


In [53]:
# export sampled data
train_sample.to_csv('train_sample.csv', index = False)