In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from collections import defaultdict
import dill
# import datatable as dt
import lightgbm as lgb
import os
import riiideducation
from catboost import CatBoostClassifier, Pool
from tqdm.notebook import tqdm
from bitarray import bitarray
import gc

_ = np.seterr(divide='ignore', invalid='ignore')

import psutil, time
start_time = time.time()
GB = pow(1024, 3)
mem_total = psutil.virtual_memory().total / GB
def show_mem_usage():
  percent_used = psutil.virtual_memory().percent
  mem_avail = psutil.virtual_memory().available / GB
  mem_used = mem_total - mem_avail
  cur_time = time.time()
  print("Elapsed time {:.2f} seconds".format(cur_time - start_time))
  print("Mem used: {:.2f}, percent used: {} mem avail {:.2f}".format(mem_used, percent_used, mem_avail))

DATA_PATH = '../input/riiid-submit'
target = 'answered_correctly'

# I. Son's model

## 1. Load dicts

In [2]:
questions_df = pd.read_parquet(DATA_PATH + '/questions_df.parquet')
ts_quantile = pickle.load(open(DATA_PATH + '/' + 'ts_quantile.pkl', 'rb'))
task_quantile = pickle.load(open(DATA_PATH + '/' + 'task_quantile.pkl', 'rb'))



In [3]:
loop_feat = pickle.load(open(DATA_PATH+'/loop_feat.pickle', "rb"))

user_sum_dict = loop_feat['user_sum_dict']
user_count_dict = loop_feat['user_count_dict']
content_sum_dict = loop_feat['content_sum_dict']
content_count_dict = loop_feat['content_count_dict']
user_part_sum_dict = loop_feat['user_part_sum_dict']
user_part_count_dict = loop_feat['user_part_count_dict']
user_lecture_part_count_dict = loop_feat['user_lecture_part_count_dict']
user_solving_count_dict = loop_feat['user_solving_count_dict']
user_concept_count_dict = loop_feat['user_concept_count_dict']
user_lecture_count_dict = loop_feat['user_lecture_count_dict']
user_rolling_sum_dict = loop_feat['user_rolling_sum_dict']
user_explanation_sum_dict = loop_feat['user_explanation_sum_dict']
user_explanation_count_dict = loop_feat['user_explanation_count_dict']

In [4]:
metadata = pickle.load(open(DATA_PATH+'/metadata.pickle', "rb"))

questions_part_dict = metadata['questions_part_dict']
lectures_part_dict = metadata['lectures_part_dict']
lectures_type_dict = metadata['lectures_type_dict']

questions_df = pd.read_parquet(DATA_PATH+'/questions_df.parquet')
content_agg = pd.read_parquet(DATA_PATH+'/content_agg.parquet')

In [5]:
time_lag_max_dict = pickle.load(open(DATA_PATH+'/time_lag_max_dict.pickle', "rb"))

In [6]:
user_explanation_dict = pickle.load(open(DATA_PATH+'/user_explanation_dict.pickle', "rb"))

explanation_agg_sum_dict = user_explanation_dict['explanation_agg_sum_dict']
explanation_agg_count_dict = user_explanation_dict['explanation_agg_count_dict']

In [7]:
elapsed_time_dict = pickle.load(open(DATA_PATH+'/elapsed_time_dict.pickle', "rb"))

last_bundle_dict = elapsed_time_dict['last_bundle_dict']
last_result_dict = elapsed_time_dict['last_result_dict']
elapsed_time_user_sum_dict = elapsed_time_dict['elapsed_time_user_sum_dict']
elapsed_time_user_count_dict = elapsed_time_dict['elapsed_time_user_count_dict']

In [8]:
question_time = pd.read_parquet(DATA_PATH+'/'+'question_time.parquet')
part_time = pd.read_parquet(DATA_PATH+'/'+'part_time.parquet')

In [9]:
content_part_bin_dict = pickle.load(open(DATA_PATH+'/content_part_bin_dict.pickle', "rb"))

content_bin_mean_dict = content_part_bin_dict['content_bin_mean_dict']
content_bin_sum_dict = content_part_bin_dict['content_bin_sum_dict']
content_bin_count_dict = content_part_bin_dict['content_bin_count_dict']
content_bin_std_dict = content_part_bin_dict['content_bin_std_dict']

part_bin_mean_dict = content_part_bin_dict['part_bin_mean_dict']
part_bin_sum_dict = content_part_bin_dict['part_bin_sum_dict']
part_bin_count_dict = content_part_bin_dict['part_bin_count_dict']
part_bin_std_dict = content_part_bin_dict['part_bin_std_dict']

In [10]:
users_dict = pickle.load(open(DATA_PATH+'/users_dict.pickle', "rb"))

In [11]:
user_question_part_attempt_dict = pickle.load(open(DATA_PATH+'/user_question_part_attempt_dict.pickle', "rb"))

user_attempt_sum_dict = user_question_part_attempt_dict['user_attempt_sum_dict']
user_attempt_count_dict = user_question_part_attempt_dict['user_attempt_count_dict']
content_attempt_sum_dict = user_question_part_attempt_dict['content_attempt_sum_dict']
content_attempt_count_dict = user_question_part_attempt_dict['content_attempt_count_dict']
part_attempt_sum_dict = user_question_part_attempt_dict['part_attempt_sum_dict']
part_attempt_count_dict = user_question_part_attempt_dict['part_attempt_count_dict']

In [12]:
woe_dict = pickle.load(open(DATA_PATH+'/woe_dict.pickle', "rb"))

woe_content_dict = woe_dict['woe_content_dict']
woe_part_dict = woe_dict['woe_part_dict']

In [13]:
df = pd.DataFrame(pd.cut(list(range(10000)),[0,30] + list(range(500,10001,500)),labels = list(range(0,21)),include_lowest=True), columns=['bin'])
task_bin_dict = df['bin'].astype('int8').to_dict(defaultdict(int))

In [14]:
# prior_question_elapsed_time_mean = train_df.prior_question_elapsed_time.dropna().values.mean()
# prior_question_elapsed_time_mean = 25526.652
prior_question_elapsed_time_mean = 25462.191

In [15]:
timestamp_user_dict = pickle.load(open(DATA_PATH+'/timestamp_user_dict.pickle', "rb"))

In [16]:
last_task_dict = pickle.load(open(DATA_PATH+'/last_task_dict.pickle', "rb"))

In [17]:
is_bundle_dict = pickle.load(open(DATA_PATH+'/is_bundle_dict.pickle', "rb"))

In [18]:
question_timelag_dict = pickle.load(open(DATA_PATH+'/question_timelag_dict.pickle', "rb"))
bundle_count = pickle.load(open(DATA_PATH+'/bundle_count.pickle', "rb"))
question_bundle = pickle.load(open(DATA_PATH+'/question_bundle.pickle', "rb"))

In [19]:
user_bundle_count_dict = pickle.load(open(DATA_PATH+'/user_bundle_count_dict.pickle', "rb"))
user_last_iteraction = pickle.load(open(DATA_PATH+'/user_last_iteraction.pickle', "rb"))

In [20]:
user_part_lag_dict = pickle.load(open(DATA_PATH+'/user_part_lag_dict.pickle', "rb"))

In [21]:
timestamp_part_user_dict = pickle.load(open(DATA_PATH+'/timestamp_part_user_dict.pickle', "rb"))

In [22]:
last_session_dict = pickle.load(open(DATA_PATH+'/last_session_dict.pickle', "rb"))
user_time_accumulate_dict = pickle.load(open(DATA_PATH+'/user_time_accumulate_dict.pickle', "rb"))

# last_session_sum_dict = pickle.load(open(DATA_PATH+'kaggle_submit/last_session_sum_dict.pickle', "rb"))
# last_session_count_dict = pickle.load(open(DATA_PATH+'kaggle_submit/last_session_count_dict.pickle', "rb"))
# last_2_session_sum_dict = pickle.load(open(DATA_PATH+'kaggle_submit/last_2_session_sum_dict.pickle', "rb"))
# last_2_session_count_dict = pickle.load(open(DATA_PATH+'kaggle_submit/last_2_session_count_dict.pickle', "rb"))

## 2. Load model

In [23]:
features = ['ts_bins',
 'task_bins',
 'prior_question_elapsed_time',
 'part',
 'prior_question_had_explanation',
 'WOE_content_id',
 'WOE_part',
 'content_id',
 'part_correct',
 'correct_answer_mean',
 'content_count',
 'content_std',
 'attempt_binary',
 'timelag_update',
 'tags_0',
 'tags_1',
 'tags_2',
 'lag_1',
 'lag_2',
 'lag_3',
 'user_correctness',
 'user_count',
 'user_sum',
 'user_part_correctness',
 'user_part_count',
 'user_lecture_part_count',
 'user_solving_count',
 'user_concept_count',
 'user_roll15_correctness',
 'user_roll15_std',
 'user_roll15_count',
 'user_roll30_correctness',
 'user_roll30_std',
 'user_roll30_count',
 'correct_ircorrect_roll15_ratio',
 'correct_ircorrect_roll30_ratio',
 'hmean_user_content_accuracy',
 'skip_task',
 'timelag_2',
 'timelag_3',
 'timelag_4',
 'timelag_5',
 'timelag_6',
 'is_bundle',
 'content_bin_mean',
 'content_bin_count',
 'content_bin_std',
 'part_bin_mean',
 'part_bin_count',
 'part_bin_std',
 'elapsed_time_user_mean',
 'elapsed_time_user_diff',
 'question_timelag_diff',
 'question_timelag_median',
 'explanation_sum',
 'explanation_count',
 'user_explanation_correctness',
 'question_time_mean_false_diff',
 'question_time_mean_true_diff',
 'question_time_median_false_diff',
 'question_time_median_true_diff',
 'part_time_mean_false_diff',
 'part_time_mean_true_diff',
 'part_time_median_false_diff',
 'part_time_median_true_diff',

 'user_attempt_sum',
 'user_attempt_count',
 'content_attempt_sum',
 'content_attempt_count',
 'part_attempt_sum',
 'part_attempt_count',
 'user_attempt_mean',
 'content_attempt_mean',
 'part_attempt_mean',

 'n_tags',
 'user_lecture_count',
 'user_explanation_mean',
 'tags_0_pct',
 'tags_1_pct',
 'tags_2_pct',
 'tags_mean',
 'tags_min',
 'tags_max',
 
 'timelag_part_1',
 'timelag_part_2',
 'timelag_part_3',
 'lag_part_1',
 'lag_part_2',
 'lag_part_3',
 'lag_part_mean',
 'user_bundle_count',
 'user_bundle_question_ratio',
 
 "user_session",
 "user_time_accumulate"]

In [24]:
len(features)

94

In [25]:
# model = CatBoostClassifier().load_model(fname='../input/riiid-model/catboost_tr0.7844256_va0.7801372')
model_Son = lgb.Booster(model_file='../input/riiid-model/lgbm-25-90m_v4.txt')

# II. Linh's model

## 1. Load dicts & features

### Dicts

In [26]:
def update_attempts_dict(df):
    for (user_id, content_id, content_type_id) in df[['user_id', 'content_id', 'content_type_id']].values:
        if content_type_id == 0:
            # update dict
            attempts_dict[user_id][content_id] += 1
        else:
            pass

In [27]:
%%time
train_df = pd.read_pickle('../input/riiid-cross-validation-files/cv1_train.pickle')[['user_id', 'content_id', 'content_type_id']]
valid_df = pd.read_pickle('../input/riiid-cross-validation-files/cv1_valid.pickle')[['user_id', 'content_id', 'content_type_id']]

# train_df = train_df[-1000000:]
# valid_df = valid_df[:10000]

attempts_dict = defaultdict(lambda: defaultdict(int))

update_attempts_dict(train_df)
del train_df
update_attempts_dict(valid_df)
del valid_df
gc.collect()

CPU times: user 10min 48s, sys: 16.1 s, total: 11min 4s
Wall time: 12min 3s


11

In [28]:
%%time
answered_user_count_dict = dill.load(open('../input/riiid-dicts/answered_user_count_dict_file', 'rb'))
answered_correctly_user_sum_dict = dill.load(open('../input/riiid-dicts/answered_correctly_user_sum_dict_file', 'rb'))

CPU times: user 699 ms, sys: 136 ms, total: 836 ms
Wall time: 1.25 s


In [29]:
%%time
lag1_dict = dill.load(open('../input/riiid-dicts/lag1_dict_file', 'rb'))
window_dict = dill.load(open('../input/riiid-dicts/window_dict_file', 'rb'))
wrong_answered_user_count_dict = dill.load(open('../input/riiid-dicts/wrong_answered_user_count_dict_file', 'rb'))

CPU times: user 2.04 s, sys: 367 ms, total: 2.41 s
Wall time: 3.83 s


In [30]:
%%time
answered_user_part_count_dict = dill.load(open('../input/riiid-dicts/answered_user_part_count_dict_file', 'rb'))
answered_correctly_user_part_sum_dict = dill.load(open('../input/riiid-dicts/answered_correctly_user_part_sum_dict_file', 'rb'))

CPU times: user 2.5 s, sys: 394 ms, total: 2.89 s
Wall time: 3.95 s


In [31]:
%%time
bundle_user_count_dict = dill.load(open('../input/riiid-dicts/bundle_user_count_dict_file', 'rb'))

CPU times: user 294 ms, sys: 28.8 ms, total: 322 ms
Wall time: 456 ms


In [32]:
%%time
lecture_user_count_dict = dill.load(open('../input/riiid-dicts/lecture_user_count_dict_file', 'rb'))

CPU times: user 274 ms, sys: 15 ms, total: 290 ms
Wall time: 415 ms


In [33]:
%%time
lecture_user_part_count_dict = dill.load(open('../input/riiid-dicts/lecture_user_part_count_dict_file', 'rb'))

CPU times: user 790 ms, sys: 25.8 ms, total: 816 ms
Wall time: 1.1 s


In [34]:
%%time
lecture_user_type_count_dict = dill.load(open('../input/riiid-dicts/lecture_user_type_count_dict_file', 'rb'))

CPU times: user 8.78 s, sys: 106 ms, total: 8.89 s
Wall time: 9.08 s


In [35]:
%%time
prior_elapsed_time_sum_dict = dill.load(open('../input/riiid-dicts/prior_elapsed_time_sum_dict_file', 'rb'))
prior_elapsed_time_dict = dill.load(open('../input/riiid-dicts/prior_elapsed_time_dict_file', 'rb'))

CPU times: user 894 ms, sys: 113 ms, total: 1.01 s
Wall time: 1.55 s


In [36]:
%%time
prior_explanation_sum_dict = dill.load(open('../input/riiid-dicts/prior_explanation_sum_dict_file', 'rb'))

CPU times: user 451 ms, sys: 70.3 ms, total: 521 ms
Wall time: 787 ms


In [37]:
%%time
task_dict = dill.load(open('../input/riiid-dicts/task_dict_file', 'rb'))
task_count_arr_dict = dill.load(open('../input/riiid-dicts/task_count_arr_dict_file', 'rb'))
task_content_number_dict = dill.load(open('../input/riiid-dicts/task_content_number_dict_file', 'rb'))

CPU times: user 980 ms, sys: 152 ms, total: 1.13 s
Wall time: 1.62 s


In [38]:
%%time
timestamp_lag1_dict = dill.load(open('../input/riiid-dicts/timestamp_lag1_dict_file', 'rb'))
timestamp_window_dict = dill.load(open('../input/riiid-dicts/timestamp_window_dict_file', 'rb'))

CPU times: user 1.74 s, sys: 253 ms, total: 2 s
Wall time: 2.74 s


### Basic features

In [39]:
prior_question_elapsed_time_mean_Linh = 25439.41

In [40]:
content_part_dict = dill.load(open('../input/riiid-dicts/content_part_dict_file', 'rb'))

In [41]:
lecture_part_dict = dill.load(open('../input/riiid-dicts/lecture_part_dict_file', 'rb'))
lecture_type_dict = dill.load(open('../input/riiid-dicts/lecture_type_dict_file', 'rb'))

### General stats features

In [42]:
%%time
questions_df_Linh = pd.read_pickle('../input/riiid-basic-general-stats-features/questions_df.pkl')
content_df = pd.read_pickle('../input/riiid-basic-general-stats-features/content_df.pkl')
content_part_df = pd.read_pickle('../input/riiid-basic-general-stats-features/content_part_df.pkl')
content_bundle_bins_df = pd.read_pickle('../input/riiid-basic-general-stats-features/content_bundle_bins_df.pkl')
content_tags1_df = pd.read_pickle('../input/riiid-basic-general-stats-features/content_tags1_df.pkl')
content_prior_explanation_df = pd.read_pickle('../input/riiid-basic-general-stats-features/content_prior_explanation_df.pkl')
content_elapsed_time_df = pd.read_pickle('../input/riiid-basic-general-stats-features/content_elapsed_time_df.pkl')

CPU times: user 10.7 ms, sys: 3.07 ms, total: 13.8 ms
Wall time: 137 ms


In [43]:
questions_df_Linh

Unnamed: 0_level_0,bundle_id,part,tags_number,tags1,tags2,tags3,tags4,community
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,1,4,51,131,162,38,2
1,1,1,3,131,36,81,0,2
2,2,1,4,131,101,162,92,2
3,3,1,4,131,149,162,29,2
4,4,1,4,131,5,162,38,2
...,...,...,...,...,...,...,...,...
13518,13518,5,1,14,0,0,0,0
13519,13519,5,1,8,0,0,0,1
13520,13520,5,1,73,0,0,0,1
13521,13521,5,1,125,0,0,0,0


In [44]:
content_tags1_df

Unnamed: 0_level_0,answered_correctly_content_tags1_mean,answered_correctly_content_tags1_std
tags1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.000000,
1,0.608464,0.488094
2,0.690732,0.462192
4,0.641899,0.479442
7,0.623096,0.484611
...,...,...
179,0.640975,0.479715
180,0.661048,0.473354
181,0.630708,0.482613
182,0.697525,0.459331


### Loop features

In [45]:
def add_user_features_without_update(df):
    answered_user_count_arr = np.zeros(len(df), dtype=np.int32)
    answered_correctly_user_sum_arr = np.zeros(len(df), dtype=np.int32)
    answered_correctly_user_mean_arr = np.zeros(len(df), dtype=np.float32)

    lag1_arr = np.zeros(len(df), dtype=np.int8)
    lag2_arr = np.zeros(len(df), dtype=np.int8)
    lag3_arr = np.zeros(len(df), dtype=np.int8)
    lag4_arr = np.zeros(len(df), dtype=np.int8)
    lag5_arr = np.zeros(len(df), dtype=np.int8)
    rolling5_mean_arr = np.zeros(len(df), dtype=np.float32)
    rolling5_std_arr = np.zeros(len(df), dtype=np.float32)
    rolling15_mean_arr = np.zeros(len(df), dtype=np.float32)
    rolling15_std_arr = np.zeros(len(df), dtype=np.float32)
    rolling30_mean_arr = np.zeros(len(df), dtype=np.float32)
    rolling30_std_arr = np.zeros(len(df), dtype=np.float32)
    wrong_answered_user_count_arr = np.zeros(len(df), dtype=np.int8)

    answered_user_part1_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part2_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part3_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part4_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part5_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part6_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part7_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part1_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part2_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part3_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part4_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part5_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part6_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part7_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part_mean_arr = np.zeros(len(df), dtype=np.float32)

    bundle_user_count_arr = np.zeros(len(df), dtype=np.int32)
    bundle_user_count_bins_arr = np.zeros(len(df), dtype=np.int8)
    
    attempts_arr = np.zeros(len(df), dtype=np.int8)

    lecture_user_count_arr = np.zeros(len(df), dtype=np.int16)

    lecture_user_part1_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part2_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part3_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part4_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part5_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part6_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part7_count_arr = np.zeros(len(df), dtype=np.int8)

    lecture_user_concept_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_solving_question_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_intention_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_starter_count_arr = np.zeros(len(df), dtype=np.int8)

    prior_elapsed_time_sum_arr = np.zeros(len(df), dtype=np.float32)
    prior_elapsed_time_diff_arr = np.zeros(len(df), dtype=np.float32)

    df['prior_question_had_explanation'] = df['prior_question_had_explanation'].fillna(False).astype(np.int8)
    prior_explanation_sum_arr = np.zeros(len(df), dtype=np.int32)

    task_count_arr = np.zeros(len(df), dtype=np.int32)
    task_content_number_arr = np.zeros(len(df), dtype=np.int8)

    timestamp_lag1_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_lag2_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_lag3_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_lag4_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_lag5_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_diff_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_diff2_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_diff3_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_diff4_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_diff5_arr = np.zeros(len(df), dtype=np.float32)

    for idx, (timestamp, user_id, content_id, content_type_id, task_container_id, prior_question_elapsed_time) in enumerate(df[['timestamp', 'user_id', 'content_id', 'content_type_id', 'task_container_id', 'prior_question_elapsed_time']].values):
        # add feature
        lecture_user_count_arr[idx] = lecture_user_count_dict[user_id]

        # add feature
        lecture_user_part1_count_arr[idx] = lecture_user_part_count_dict[user_id][1]
        lecture_user_part2_count_arr[idx] = lecture_user_part_count_dict[user_id][2]
        lecture_user_part3_count_arr[idx] = lecture_user_part_count_dict[user_id][3]
        lecture_user_part4_count_arr[idx] = lecture_user_part_count_dict[user_id][4]
        lecture_user_part5_count_arr[idx] = lecture_user_part_count_dict[user_id][5]
        lecture_user_part6_count_arr[idx] = lecture_user_part_count_dict[user_id][6]
        lecture_user_part7_count_arr[idx] = lecture_user_part_count_dict[user_id][7]

        # add feature
        lecture_user_concept_count_arr[idx] = lecture_user_type_count_dict[user_id]['concept']
        lecture_user_solving_question_count_arr[idx] = lecture_user_type_count_dict[user_id]['solving_question']
        lecture_user_intention_count_arr[idx] = lecture_user_type_count_dict[user_id]['intention']
        lecture_user_starter_count_arr[idx] = lecture_user_type_count_dict[user_id]['starter']
        
        if content_type_id == 0:
            # add feature
            answered_user_count_arr[idx] = answered_user_count_dict[user_id]
            answered_correctly_user_sum_arr[idx] = answered_correctly_user_sum_dict[user_id]
            if answered_user_count_arr[idx] != 0:
                answered_correctly_user_mean_arr[idx] = answered_correctly_user_sum_arr[idx] / answered_user_count_arr[idx]

            # add feature
            lag1_arr[idx] = lag1_dict[user_id]
            window_id = np.append(window_dict[user_id], lag1_dict[user_id])
            window_id = window_id[-30:]
            if len(window_id) >= 2:
                lag2_arr[idx] = window_id[-2]
            if len(window_id) >= 3:
                lag3_arr[idx] = window_id[-3]
            if len(window_id) >= 4:
                lag4_arr[idx] = window_id[-4]
            if len(window_id) >= 5:
                lag5_arr[idx] = window_id[-5]
            rolling5_mean_arr[idx] = np.mean(window_id[-5:])
            rolling5_std_arr[idx] = np.std(window_id[-5:])
            rolling15_mean_arr[idx] = np.mean(window_id[-15:])
            rolling15_std_arr[idx] = np.std(window_id[-15:])
            rolling30_mean_arr[idx] = np.mean(window_id[-30:])
            rolling30_std_arr[idx] = np.std(window_id[-30:])
            
            if lag1_dict[user_id] == 0:
                # add feature
                wrong_answered_user_count_arr[idx] = wrong_answered_user_count_dict[user_id]
            else:
                pass

            part_id = content_part_dict[content_id]
            # add feature
            answered_user_part1_count_arr[idx] = answered_user_part_count_dict[user_id][1]
            answered_user_part2_count_arr[idx] = answered_user_part_count_dict[user_id][2]
            answered_user_part3_count_arr[idx] = answered_user_part_count_dict[user_id][3]
            answered_user_part4_count_arr[idx] = answered_user_part_count_dict[user_id][4]
            answered_user_part5_count_arr[idx] = answered_user_part_count_dict[user_id][5]
            answered_user_part6_count_arr[idx] = answered_user_part_count_dict[user_id][6]
            answered_user_part7_count_arr[idx] = answered_user_part_count_dict[user_id][7]
            answered_correctly_user_part1_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][1]
            answered_correctly_user_part2_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][2]
            answered_correctly_user_part3_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][3]
            answered_correctly_user_part4_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][4]
            answered_correctly_user_part5_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][5]
            answered_correctly_user_part6_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][6]
            answered_correctly_user_part7_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][7]
            if answered_user_part_count_dict[user_id][part_id] != 0:
                answered_correctly_user_part_mean_arr[idx] = answered_correctly_user_part_sum_dict[user_id][part_id] / answered_user_part_count_dict[user_id][part_id]
            
            # add feature
            bundle_user_count_arr[idx] = bundle_user_count_dict[user_id]
            if bundle_user_count_arr[idx] > 30 and bundle_user_count_arr[idx] <= 125:
                bundle_user_count_bins_arr[idx] = 1
            if bundle_user_count_arr[idx] > 125 and bundle_user_count_arr[idx] <= 487:
                bundle_user_count_bins_arr[idx] = 2
            if bundle_user_count_arr[idx] > 487 and bundle_user_count_arr[idx] <= 1451:
                bundle_user_count_bins_arr[idx] = 3
            if bundle_user_count_arr[idx] > 1451:
                bundle_user_count_bins_arr[idx] = 4
            
            # add feature
            attempts_arr[idx] = attempts_dict[user_id][content_id]

            if prior_question_elapsed_time != prior_question_elapsed_time: # check nan
                # add feature
                prior_elapsed_time_sum_arr[idx] = prior_elapsed_time_sum_dict[user_id]
                prior_elapsed_time_diff_arr[idx] = prior_elapsed_time_dict[user_id]
            else:
                # add feature
                prior_elapsed_time_sum_arr[idx] = prior_question_elapsed_time/(1000*60) + prior_elapsed_time_sum_dict[user_id]
                prior_elapsed_time_diff_arr[idx] = prior_question_elapsed_time/(1000*60) - prior_elapsed_time_dict[user_id]
            
            # add feature
            prior_explanation_sum_arr[idx] = prior_explanation_sum_dict[user_id]
        else:
            pass

        if task_container_id == task_dict[user_id]:
            if task_count_arr_dict[user_id] > 0:
                # add feature
                task_count_arr[idx] = task_count_arr_dict[user_id] - 1
                task_content_number_arr[idx] = task_content_number_dict[user_id] + 1
        else:
            # add feature
            task_count_arr[idx] = task_count_arr_dict[user_id]
            task_content_number_arr[idx] = 1
            
        # add feature
        timestamp_lag1_arr[idx] = timestamp_lag1_dict[user_id]
        timestamp_window_id = np.append(timestamp_window_dict[user_id], timestamp_lag1_dict[user_id])
        timestamp_window_id = timestamp_window_id[-5:]
        
        if len(timestamp_window_id) > 1:
            timestamp_diff_arr[idx] = timestamp/(1000*60) - timestamp_lag1_arr[idx]
        if len(timestamp_window_id) > 2:
            timestamp_lag2_arr[idx] = timestamp_window_id[-2]
            timestamp_diff2_arr[idx] = timestamp/(1000*60) - timestamp_lag2_arr[idx]
        if len(timestamp_window_id) > 3:
            timestamp_lag3_arr[idx] = timestamp_window_id[-3]
            timestamp_diff3_arr[idx] = timestamp/(1000*60) - timestamp_lag3_arr[idx]
        if len(timestamp_window_id) > 4:
            timestamp_lag4_arr[idx] = timestamp_window_id[-4]
            timestamp_diff4_arr[idx] = timestamp/(1000*60) - timestamp_lag4_arr[idx]
            if timestamp_window_id[-5] != 0:
                timestamp_lag5_arr[idx] = timestamp_window_id[-5]
                timestamp_diff5_arr[idx] = timestamp/(1000*60) - timestamp_lag5_arr[idx]

    user_feats_df = pd.DataFrame({
        'answered_user_count': answered_user_count_arr,
        'answered_correctly_user_sum': answered_correctly_user_sum_arr,
        'answered_correctly_user_mean': answered_correctly_user_mean_arr,

        'lag1': lag1_arr,
        'lag2': lag2_arr,
        'lag3': lag3_arr,
        'lag4': lag4_arr,
        'lag5': lag5_arr,
        'rolling5_mean': rolling5_mean_arr,
        'rolling5_std': rolling5_std_arr,
        'rolling15_mean': rolling15_mean_arr,
        'rolling15_std': rolling15_std_arr,
        'rolling30_mean': rolling30_mean_arr,
        'rolling30_std': rolling30_std_arr,
        'wrong_answered_user_count': wrong_answered_user_count_arr,

        'answered_user_part1_count': answered_user_part1_count_arr,
        'answered_user_part2_count': answered_user_part2_count_arr,
        'answered_user_part3_count': answered_user_part3_count_arr,
        'answered_user_part4_count': answered_user_part4_count_arr,
        'answered_user_part5_count': answered_user_part5_count_arr,
        'answered_user_part6_count': answered_user_part6_count_arr,
        'answered_user_part7_count': answered_user_part7_count_arr,
        'answered_correctly_user_part1_sum': answered_correctly_user_part1_sum_arr,
        'answered_correctly_user_part2_sum': answered_correctly_user_part2_sum_arr,
        'answered_correctly_user_part3_sum': answered_correctly_user_part3_sum_arr,
        'answered_correctly_user_part4_sum': answered_correctly_user_part4_sum_arr,
        'answered_correctly_user_part5_sum': answered_correctly_user_part5_sum_arr,
        'answered_correctly_user_part6_sum': answered_correctly_user_part6_sum_arr,
        'answered_correctly_user_part7_sum': answered_correctly_user_part7_sum_arr,
        'answered_correctly_user_part_mean': answered_correctly_user_part_mean_arr,

        # 'bundle_user_count': bundle_user_count_arr,
        'bundle_user_count_bins': bundle_user_count_bins_arr,

        'attempts': attempts_arr,

        'lecture_user_count': lecture_user_count_arr,

        'lecture_user_part1_count': lecture_user_part1_count_arr,
        'lecture_user_part2_count': lecture_user_part2_count_arr,
        'lecture_user_part3_count': lecture_user_part3_count_arr,
        'lecture_user_part4_count': lecture_user_part4_count_arr,
        'lecture_user_part5_count': lecture_user_part5_count_arr,
        'lecture_user_part6_count': lecture_user_part6_count_arr,
        'lecture_user_part7_count': lecture_user_part7_count_arr,

        'lecture_user_concept_count': lecture_user_concept_count_arr,
        'lecture_user_solving_question_count': lecture_user_solving_question_count_arr,
        'lecture_user_intention_count': lecture_user_intention_count_arr,
        # 'lecture_user_starter_count': lecture_user_starter_count_arr,

        'prior_elapsed_time_sum': prior_elapsed_time_sum_arr,
        'prior_elapsed_time_diff': prior_elapsed_time_diff_arr,

        'prior_explanation_sum': prior_explanation_sum_arr,

        'task_count': task_count_arr,
        'task_content_number': task_content_number_arr,

        'timestamp_lag1': timestamp_lag1_arr,
        'timestamp_lag2': timestamp_lag2_arr,
        'timestamp_lag3': timestamp_lag3_arr,
        'timestamp_lag4': timestamp_lag4_arr,
        'timestamp_lag5': timestamp_lag5_arr,
        'timestamp_diff': timestamp_diff_arr,
        'timestamp_diff2': timestamp_diff2_arr,
        'timestamp_diff3': timestamp_diff3_arr,
        'timestamp_diff4': timestamp_diff4_arr,
        'timestamp_diff5': timestamp_diff5_arr
    })

    df = df.reset_index(drop=True)
    df = pd.concat([df, user_feats_df], axis=1)

    return df

In [46]:
def update_user_features(df):
    df['prior_question_had_explanation'] = df['prior_question_had_explanation'].fillna(False).astype(np.int8)
    
    for (timestamp, user_id, content_id, content_type_id, answered_correctly, task_container_id, prior_question_elapsed_time, prior_question_had_explanation) in df[['timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly', 'task_container_id', 'prior_question_elapsed_time', 'prior_question_had_explanation']].values:
        if content_type_id == 0:
            if lag1_dict[user_id] == 0:
                # update dict
                wrong_answered_user_count_dict[user_id] += 1
            else:
                # update dict
                wrong_answered_user_count_dict[user_id] = 1
            
            window_id = np.append(window_dict[user_id], lag1_dict[user_id])
            window_id = window_id[-30:]
            # update dict
            window_dict[user_id] = window_id

            # update dict
            lag1_dict[user_id] = answered_correctly
            answered_user_count_dict[user_id] += 1
            answered_correctly_user_sum_dict[user_id] += answered_correctly

            part_id = content_part_dict[content_id]
            # update dict
            answered_user_part_count_dict[user_id][part_id] += 1
            answered_correctly_user_part_sum_dict[user_id][part_id] += answered_correctly

            # update dict
            bundle_user_count_dict[user_id] += 1
            
            # update dict
            attempts_dict[user_id][content_id] += 1

            if prior_question_elapsed_time != prior_question_elapsed_time: # check nan
                pass
            else:
                # update dict
                prior_elapsed_time_sum_dict[user_id] += prior_question_elapsed_time/(1000*60)
                prior_elapsed_time_dict[user_id] = prior_question_elapsed_time/(1000*60)
            
            # update dict
            prior_explanation_sum_dict[user_id] += prior_question_had_explanation
        else:
            # update dict
            lecture_user_count_dict[user_id] += 1

            lect_part_id = lecture_part_dict[content_id]
            # update dict
            lecture_user_part_count_dict[user_id][lect_part_id] += 1

            type_id = lecture_type_dict[content_id]
            # update dict
            lecture_user_type_count_dict[user_id][type_id] += 1
        
        if task_container_id == task_dict[user_id]:
            if task_count_arr_dict[user_id] > 0:
                # update dict
                task_content_number_dict[user_id] += 1
        else:
            # update dict
            task_count_arr_dict[user_id] += 1
            task_content_number_dict[user_id] = 1
        # update dict
        task_dict[user_id] = task_container_id

        timestamp_window_id = np.append(timestamp_window_dict[user_id], timestamp_lag1_dict[user_id])
        timestamp_window_id = timestamp_window_id[-5:]
        # update dict
        timestamp_window_dict[user_id] = timestamp_window_id
        timestamp_lag1_dict[user_id] = timestamp/(1000*60)

In [47]:
def add_other_features(df):
    df = df[df['content_type_id'] == 0].reset_index(drop=True)
    
    df = pd.concat(
        [df, questions_df_Linh.reindex(df['content_id']).reset_index(drop=True)],
        axis=1
    )
    df = pd.concat(
        [df, content_df.reindex(df['content_id']).reset_index(drop=True)],
        axis=1
    )
    df = pd.concat(
        [df, content_part_df.reindex(df['part']).reset_index(drop=True)],
        axis=1
    )
    df = pd.concat(
        [df, content_bundle_bins_df.reindex(df['bundle_user_count_bins']).reset_index(drop=True)],
        axis=1
    )
    df = pd.concat(
        [df, content_tags1_df.reindex(df['tags1']).reset_index(drop=True)],
        axis=1
    )
    # df = pd.merge(
    #     df,
    #     content_tags1_tags2_df,
    #     how='left',
    #     left_on=['tags1', 'tags2'],
    #     right_index=True
    # )
    df = pd.concat(
        [df, content_prior_explanation_df.reindex(df['content_id']).reset_index(drop=True)],
        axis=1
    )
    df = pd.concat(
        [df, content_elapsed_time_df.reindex(df['content_id']).reset_index(drop=True)],
        axis=1
    )
    # df = pd.concat(
    #     [df, content_community_df.reindex(df['community']).reset_index(drop=True)],
    #     axis=1
    # )
    
    df['timestamp_day'] = np.round(df['timestamp']/(1000*60*60*24))
    df['timestamp'] = df['timestamp']/(1000*60) 
    df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean_Linh, inplace=True)
    df['prior_question_elapsed_time'] = df['prior_question_elapsed_time']/(1000*60)
    df['prior_question_had_explanation'] = df['prior_question_had_explanation'].fillna(False).astype(np.int8)
    
    df['answered_correctly_user_content_mean_hmean'] = (2 * df['answered_correctly_user_mean'] * df['answered_correctly_content_mean']) / (df['answered_correctly_user_mean'] + df['answered_correctly_content_mean'])
    df['answered_correctly_user_std'] = np.sqrt((df['lag1'] - df['answered_correctly_user_mean'])**2)
    df['answered_correctly_user_content_std_hmean'] = (2 * df['answered_correctly_user_std'] * df['answered_correctly_content_std']) / (df['answered_correctly_user_std'] + df['answered_correctly_content_std'])
    
    df['answered_correctly_user_part_mean_hmean'] = (2 * df['answered_correctly_user_part_mean'] * df['answered_correctly_content_part_mean']) / (df['answered_correctly_user_part_mean'] + df['answered_correctly_content_part_mean'])
    # df['answered_correctly_user_part_std_hmean'] = (2 * df['answered_correctly_user_part_std'] * df['answered_correctly_content_part_std']) / (df['answered_correctly_user_part_std'] + df['answered_correctly_content_part_std'])
    # df['answered_correctly_user_part_mean_hmean2'] = (2 * df['answered_correctly_user_mean'] * df['answered_correctly_content_part_mean']) / (df['answered_correctly_user_mean'] + df['answered_correctly_content_part_mean'])
    # df['answered_correctly_user_part_std_hmean2'] = (2 * df['answered_correctly_user_std'] * df['answered_correctly_content_part_std']) / (df['answered_correctly_user_std'] + df['answered_correctly_content_part_std'])
    
    df['answered_correctly_user_bundle_mean_hmean'] = (2 * df['answered_correctly_user_mean'] * df['answered_correctly_content_bundle_count_mean']) / (df['answered_correctly_user_mean'] + df['answered_correctly_content_bundle_count_mean'])
    df['answered_correctly_user_bundle_std_hmean'] = (2 * df['answered_correctly_user_std'] * df['answered_correctly_content_bundle_count_std']) / (df['answered_correctly_user_std'] + df['answered_correctly_content_bundle_count_std'])

    df['lecture_user_percent'] = df['lecture_user_count'] / (df['answered_user_count'] + df['lecture_user_count'])
    df['lecture_user_percent'].fillna(0, inplace=True)

    df['prior_elapsed_time_mean'] = df['prior_elapsed_time_sum'] / df['answered_user_count']
    df['prior_elapsed_time_mean'].fillna(0, inplace=True)

    df['prior_explanation_mean'] = df['prior_explanation_sum'] / df['answered_user_count']
    df['prior_explanation_mean'].fillna(0, inplace=True)

    df['answered_correctly_rolling5_content_mean_hmean'] = (2 * df['rolling5_mean'] * df['answered_correctly_content_mean']) / (df['rolling5_mean'] + df['answered_correctly_content_mean'])
    df['answered_correctly_rolling5_content_std_hmean'] = (2 * df['rolling5_std'] * df['answered_correctly_content_std']) / (df['rolling5_std'] + df['answered_correctly_content_std'])
    df['answered_correctly_rolling15_content_mean_hmean'] = (2 * df['rolling15_mean'] * df['answered_correctly_content_mean']) / (df['rolling15_mean'] + df['answered_correctly_content_mean'])
    df['answered_correctly_rolling15_content_std_hmean'] = (2 * df['rolling15_std'] * df['answered_correctly_content_std']) / (df['rolling15_std'] + df['answered_correctly_content_std'])
    df['answered_correctly_rolling30_content_mean_hmean'] = (2 * df['rolling30_mean'] * df['answered_correctly_content_mean']) / (df['rolling30_mean'] + df['answered_correctly_content_mean'])
    df['answered_correctly_rolling30_content_std_hmean'] = (2 * df['rolling30_std'] * df['answered_correctly_content_std']) / (df['rolling30_std'] + df['answered_correctly_content_std'])
    
    df['answered_correctly_user_tags1_mean_hmean'] = (2 * df['answered_correctly_user_mean'] * df['answered_correctly_content_tags1_mean']) / (df['answered_correctly_user_mean'] + df['answered_correctly_content_tags1_mean'])
    df['answered_correctly_user_tags1_std_hmean'] = (2 * df['answered_correctly_user_std'] * df['answered_correctly_content_tags1_std']) / (df['answered_correctly_user_std'] + df['answered_correctly_content_tags1_std'])
    # df['answered_correctly_user_tags1_tags2_mean_hmean'] = (2 * df['answered_correctly_user_mean'] * df['answered_correctly_content_tags1_tags2_mean']) / (df['answered_correctly_user_mean'] + df['answered_correctly_content_tags1_tags2_mean'])
    # df['answered_correctly_user_tags1_tags2_std_hmean'] = (2 * df['answered_correctly_user_std'] * df['answered_correctly_content_tags1_tags2_std']) / (df['answered_correctly_user_std'] + df['answered_correctly_content_tags1_tags2_std'])
    
    df['content_prior_explanation_wrong_mean_diff1'] = df['answered_correctly_user_mean'] - df['content_prior_explanation_wrong_mean']
    df['content_prior_explanation_correct_mean_diff1'] = df['answered_correctly_user_mean'] - df['content_prior_explanation_correct_mean']
    # df['content_prior_explanation_wrong_mean_diff2'] = df['rolling5_mean'] - df['content_prior_explanation_wrong_mean']
    # df['content_prior_explanation_correct_mean_diff2'] = df['rolling5_mean'] - df['content_prior_explanation_correct_mean']
    df['content_prior_explanation_wrong_mean_diff3'] = df['rolling30_mean'] - df['content_prior_explanation_wrong_mean']
    df['content_prior_explanation_correct_mean_diff3'] = df['rolling30_mean'] - df['content_prior_explanation_correct_mean']
    
    df['prior_question_elapsed_time_wrong_mean_diff1'] = df['prior_elapsed_time_mean'] - df['prior_question_elapsed_time_wrong_mean']
    df['prior_question_elapsed_time_correct_mean_diff1'] = df['prior_elapsed_time_mean'] - df['prior_question_elapsed_time_correct_mean']
    df['question_elapsed_time_wrong_mean_diff1'] = df['prior_elapsed_time_mean'] - df['question_elapsed_time_wrong_mean']
    df['question_elapsed_time_correct_mean_diff1'] = df['prior_elapsed_time_mean'] - df['question_elapsed_time_correct_mean']
    df['prior_question_elapsed_time_wrong_mean_diff2'] = df['timestamp_diff'] - df['prior_question_elapsed_time_wrong_mean']
    df['prior_question_elapsed_time_correct_mean_diff2'] = df['timestamp_diff'] - df['prior_question_elapsed_time_correct_mean']
    df['question_elapsed_time_wrong_mean_diff2'] = df['timestamp_diff'] - df['question_elapsed_time_wrong_mean']
    df['question_elapsed_time_correct_mean_diff2'] = df['timestamp_diff'] - df['question_elapsed_time_correct_mean']
    
    # df['answered_correctly_user_community_mean_hmean'] = (2 * df['answered_correctly_user_mean'] * df['answered_correctly_content_community_mean']) / (df['answered_correctly_user_mean'] + df['answered_correctly_content_community_mean'])
    # df['answered_correctly_user_community_std_hmean'] = (2 * df['answered_correctly_user_std'] * df['answered_correctly_content_community_std']) / (df['answered_correctly_user_std'] + df['answered_correctly_content_community_std'])

    return df

## Load model

In [48]:
features_Linh = [
    'timestamp',
     'user_id',
     'content_id',
     'task_container_id',
     'prior_question_elapsed_time',
     'prior_question_had_explanation',
     'answered_user_count',
     'answered_correctly_user_sum',
     'answered_correctly_user_mean',
     'lag1',
     'lag2',
     'lag3',
     'lag4',
     'lag5',
     'rolling5_mean',
     'rolling5_std',
     'rolling15_mean',
     'rolling15_std',
     'rolling30_mean',
     'rolling30_std',
     'wrong_answered_user_count',
     'answered_user_part1_count',
     'answered_user_part2_count',
     'answered_user_part3_count',
     'answered_user_part4_count',
     'answered_user_part5_count',
     'answered_user_part6_count',
     'answered_user_part7_count',
     'answered_correctly_user_part1_sum',
     'answered_correctly_user_part2_sum',
     'answered_correctly_user_part3_sum',
     'answered_correctly_user_part4_sum',
     'answered_correctly_user_part5_sum',
     'answered_correctly_user_part6_sum',
     'answered_correctly_user_part7_sum',
     'answered_correctly_user_part_mean',
     'bundle_user_count_bins',
     'attempts',
     'lecture_user_count',
     'lecture_user_part1_count',
     'lecture_user_part2_count',
     'lecture_user_part3_count',
     'lecture_user_part4_count',
     'lecture_user_part5_count',
     'lecture_user_part6_count',
     'lecture_user_part7_count',
     'lecture_user_concept_count',
     'lecture_user_solving_question_count',
     'lecture_user_intention_count',
     'prior_elapsed_time_sum',
     'prior_elapsed_time_diff',
     'prior_explanation_sum',
     'task_count',
     'task_content_number',
     'timestamp_lag1',
     'timestamp_lag2',
     'timestamp_lag3',
     'timestamp_lag4',
     'timestamp_lag5',
     'timestamp_diff',
     'timestamp_diff2',
     'timestamp_diff3',
     'timestamp_diff4',
     'timestamp_diff5',
     'bundle_id',
     'part',
     'tags_number',
     'tags1',
     'tags2',
     'tags3',
     'tags4',
     'community',
     'answered_correctly_content_mean',
     'answered_correctly_content_std',
     'answered_correctly_content_part_mean',
     'answered_correctly_content_part_std',
     'answered_correctly_content_bundle_count_mean',
     'answered_correctly_content_bundle_count_std',
     'answered_correctly_content_tags1_mean',
     'answered_correctly_content_tags1_std',
     'content_prior_explanation_wrong_mean',
     'content_prior_explanation_correct_mean',
     'prior_question_elapsed_time_wrong_mean',
     'prior_question_elapsed_time_correct_mean',
     'question_elapsed_time_wrong_mean',
     'question_elapsed_time_correct_mean',
     'timestamp_day',
     'answered_correctly_user_content_mean_hmean',
     'answered_correctly_user_std',
     'answered_correctly_user_content_std_hmean',
     'answered_correctly_user_part_mean_hmean',
     'answered_correctly_user_bundle_mean_hmean',
     'answered_correctly_user_bundle_std_hmean',
     'lecture_user_percent',
     'prior_elapsed_time_mean',
     'prior_explanation_mean',
     'answered_correctly_rolling5_content_mean_hmean',
     'answered_correctly_rolling5_content_std_hmean',
     'answered_correctly_rolling15_content_mean_hmean',
     'answered_correctly_rolling15_content_std_hmean',
     'answered_correctly_rolling30_content_mean_hmean',
     'answered_correctly_rolling30_content_std_hmean',
     'answered_correctly_user_tags1_mean_hmean',
     'answered_correctly_user_tags1_std_hmean',
     'content_prior_explanation_wrong_mean_diff1',
     'content_prior_explanation_correct_mean_diff1',
     'content_prior_explanation_wrong_mean_diff3',
     'content_prior_explanation_correct_mean_diff3',
     'prior_question_elapsed_time_wrong_mean_diff1',
     'prior_question_elapsed_time_correct_mean_diff1',
     'question_elapsed_time_wrong_mean_diff1',
     'question_elapsed_time_correct_mean_diff1',
     'prior_question_elapsed_time_wrong_mean_diff2',
     'prior_question_elapsed_time_correct_mean_diff2',
     'question_elapsed_time_wrong_mean_diff2',
     'question_elapsed_time_correct_mean_diff2'
]

target = 'answered_correctly'

In [49]:
len(features_Linh)

116

In [50]:
model_Linh = lgb.Booster(model_file='../input/riiid-models/20201228_model_90M.txt')

# III. Inference

In [51]:
env = riiideducation.make_env()
iter_test = env.iter_test()
prior_test_df = None
previous_test_df = None

In [52]:
%%time
for (test_df, sample_prediction_df) in iter_test:

    # SON'S UPDATE
    # update du lieu sau khi co label moi
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        
        prior_test_df['prior_question_had_explanation'] = prior_test_df['prior_question_had_explanation'].fillna(False).astype('bool')
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
        timestamps = prior_test_df['timestamp'].values
        explanations = prior_test_df['prior_question_had_explanation'].values
        elapsed_times = prior_test_df['prior_question_elapsed_time'].values
        task_containers = prior_test_df['task_container_id'].values
        
        for user_id, content_id, answered_correctly,timestamp,explanation,elapsed_time,task_container_id in zip(user_ids, content_ids, targets,timestamps,explanations,elapsed_times, task_containers):
            # neu la lecture thi update cac dict lecture
            if answered_correctly == -1:

                part_id = lectures_part_dict[content_id]
                
                # user_lecture_part_count
                user_lecture_part_count_dict[(user_id,part_id)] += 1

                # user_lecture_count
                user_lecture_count_dict[user_id] += 1
                
                # user_solving_count', 'user_concept_count'
                if lectures_type_dict[content_id] == 'solving question':
                    user_solving_count_dict[user_id] += 1
                elif lectures_type_dict[content_id] == 'concept':
                    user_concept_count_dict[user_id] += 1

                # timelag_update
                time_lag_max_dict[user_id] = timestamp

                # timelag_2 -> timelag_6 dict
                if user_id in timestamp_user_dict:
                    if timestamp not in timestamp_user_dict[user_id]:
                        timestamp_user_dict[user_id].append(timestamp)
                else:
                    timestamp_user_dict[user_id].append(timestamp)
                
                if len(timestamp_user_dict[user_id]) > 6:
                    timestamp_user_dict[user_id].pop(0)
            
            # neu ko fai lecture thi update cac loop dict
            else:
                part_id = questions_part_dict[content_id]
                
                # 'user_correctness', 'user_count','user_sum'
                user_sum_dict[user_id] += answered_correctly
                user_count_dict[user_id] += 1
                
                # 'content_correctness', 'content_count'
                # 'content_std' ko update
                content_sum_dict[content_id] += answered_correctly
                content_count_dict[content_id] += 1
            
                # 'user_part_correctness', 'user_part_count'
                user_part_sum_dict[(user_id,part_id)] += answered_correctly
                user_part_count_dict[(user_id,part_id)] += 1
                
                # rolling features
                user_rolling_sum_dict[user_id].append(answered_correctly)
                if len(user_rolling_sum_dict[user_id]) > 30:
                    user_rolling_sum_dict[user_id].pop(0)
                    
                # 'explanation_sum', 'explanation_count', 'user_explanation_correctness'    
                explanation_agg_sum_dict[(user_id,explanation)] += answered_correctly
                explanation_agg_count_dict[(user_id,explanation)] += 1
                
                # "user_explanation_sum", "user_explanation_count", user_explanation_mean",
                user_explanation_sum_dict[user_id] += explanation
                user_explanation_count_dict[user_id] += 1
                
                # update user_question_part by attempt dict:
                if user_id in users_dict:
                    attempt_binary_id = users_dict[user_id][content_id]
                else:
                    attempt_binary_id = 0
                
                user_attempt_sum_dict[(user_id,attempt_binary_id)] += answered_correctly
                user_attempt_count_dict[(user_id,attempt_binary_id)] += 1

                content_attempt_sum_dict[(content_id,attempt_binary_id)] += answered_correctly
                content_attempt_count_dict[(content_id,attempt_binary_id)] += 1

                part_attempt_sum_dict[(part_id,attempt_binary_id)]+= answered_correctly
                part_attempt_count_dict[(part_id,attempt_binary_id)]+=1
                
                # attempt_binary
                # step nay fai sau step "update user_question_part by attempt dict"
                # neu ko attempt_binary_id o step "update user_question_part by attempt dict" se sai
                if user_id in users_dict:
                    users_dict[user_id][content_id] = 1
                else:
                    a = bitarray(13550, endian='little')
                    a.setall(False)
                    a[content_id] = 1
                    users_dict[user_id] = a
                
                # skip_task
                last_task_dict[user_id] = task_container_id

                # timelag_update
                time_lag_max_dict[user_id] = timestamp

                # timelag_2 -> timelag_6 dict
                if user_id in timestamp_user_dict:
                    if timestamp not in timestamp_user_dict[user_id]:
                        timestamp_user_dict[user_id].append(timestamp)
                else:
                    timestamp_user_dict[user_id].append(timestamp)
                
                if len(timestamp_user_dict[user_id]) > 6:
                    timestamp_user_dict[user_id].pop(0)

                # elapsed_time_user_mean, elapsed_time_user_diff
                curr_bundle = question_bundle[content_id]
                if user_id in last_bundle_dict:
                    last_bundle = last_bundle_dict[user_id]

                    if curr_bundle == last_bundle:
                        if bundle_count[curr_bundle] == 1:
                            last_result_dict[user_id] = [answered_correctly]
                        else:
                            last_result_dict[user_id].append(answered_correctly)
                    else:
                        last_bundle_dict[user_id] = curr_bundle
                        last_result_dict[user_id] = [answered_correctly]
                else:
                    last_bundle_dict[user_id] = curr_bundle
                    last_result_dict[user_id] = [answered_correctly]

                # timelag_part
                if timestamp not in timestamp_part_user_dict[(user_id,part_id)]:
                    timestamp_part_user_dict[(user_id,part_id)].append(timestamp)
                
                if len(timestamp_part_user_dict[(user_id,part_id)]) > 3:
                    timestamp_part_user_dict[(user_id,part_id)].pop(0)

                # lag_part
                user_part_lag_dict[(user_id,part_id)].append(answered_correctly)
                if len(user_part_lag_dict[(user_id,part_id)]) > 3:
                    user_part_lag_dict[(user_id,part_id)].pop(0)

                # user_bundle_count & user_bundle_question_ratio
                curr_bundle_id = question_bundle[content_id]
                if user_id in user_last_iteraction:
                    last_timestamp = user_last_iteraction[user_id][0]
                    last_task_container = user_last_iteraction[user_id][1]
                    last_bundle_id = user_last_iteraction[user_id][2]
                    
                    if timestamp == last_timestamp and task_container_id == last_task_container and last_bundle_id == curr_bundle_id:
                        continue 
                    else:
                        user_bundle_count_dict[user_id] += 1
                        user_last_iteraction[user_id] = [timestamp,task_container_id,curr_bundle_id]
                else:
                    user_bundle_count_dict[user_id] += 1
                    user_last_iteraction[user_id] = [timestamp,task_container_id,curr_bundle_id]
                
    # save data vao prior truoc khi process
    prior_test_df = test_df.copy()
    
    
    # LINH'S UPDATE
    if previous_test_df is not None:
        previous_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        previous_test_df['user_answer'] = eval(test_df['prior_group_responses'].iloc[0])

        # your feature extraction and model training code here
        update_user_features(previous_test_df)

    previous_test_df = test_df.copy()
    test2_df = test_df.copy()
    
    # SON'S ADDING FEATURES
    # count so lan xuat hien cua user trong test_df
    # dung de tinh timelag, vi timestamp la thoi diem user hoan thanh action
    # trong truong hop 1 task gom nhieu cau, thi timelag = (curr_time - last_time)/count_task
    user_task_container_count_dict = defaultdict(int)
    for i, user_id in enumerate(test_df['user_id'].values):
        user_task_container_count_dict[user_id] += 1
    
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    
    # general feature, cac feature nay k thay doi
    test_df['ts_bins'] = ts_quantile.transform(test_df[['timestamp']])
    test_df['task_bins'] = task_quantile.transform(test_df[['task_container_id']])

    # merge vs questions_df agg
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_index = True, how='left')
    test_df['question_id'] = test_df['content_id']
    test_df['content_std'] = test_df['question_id'].map(content_agg['std'])
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')
    
    # general stats, ko update
    test_df['question_time_mean_false'] = test_df['question_id'].map(question_time['question_time_mean_false'])
    test_df['question_time_mean_true'] = test_df['question_id'].map(question_time['question_time_mean_true'])
    test_df['question_time_median_false'] = test_df['question_id'].map(question_time['question_time_median_false'])
    test_df['question_time_median_true'] = test_df['question_id'].map(question_time['question_time_median_true'])

    test_df['part_time_mean_false'] = test_df['part'].map(part_time['part_time_mean_false'])
    test_df['part_time_mean_true'] = test_df['part'].map(part_time['part_time_mean_true'])
    test_df['part_time_median_false'] = test_df['part'].map(part_time['part_time_median_false'])
    test_df['part_time_median_true'] = test_df['part'].map(part_time['part_time_median_true'])
    
    # update elapsed_time_user_sum_dict  & elapsed_time_user_count_dict trước
    user_elapsed_updated = []
    for i, row in enumerate(test_df[['user_id','prior_question_elapsed_time']].values):
        if row[0] not in user_elapsed_updated:
            if row[0] in last_bundle_dict:
                last_result = last_result_dict[row[0]]
                for _ in last_result:
                    elapsed_time_user_sum_dict[(row[0],_)] += row[1]
                    elapsed_time_user_count_dict[(row[0],_)] += 1
                user_elapsed_updated.append(row[0])

    # khai bao cac loop array
    user_sum = np.zeros(len(test_df), dtype=np.int32)
    user_count = np.zeros(len(test_df), dtype=np.int32)
    
    content_sum = np.zeros(len(test_df), dtype=np.int32)
    content_count = np.zeros(len(test_df), dtype=np.int32)
    
    user_part_sum = np.zeros(len(test_df), dtype=np.int32)
    user_part_count = np.zeros(len(test_df), dtype=np.int32)
    
    user_lecture_part_count_array = np.zeros(len(test_df), dtype=np.int32)
    user_lecture_count_array = np.zeros(len(test_df), dtype=np.int32)
    user_solving_count_array = np.zeros(len(test_df), dtype=np.int32)
    user_concept_count_array = np.zeros(len(test_df), dtype=np.int32)
    
    user_roll15_mean_array = np.zeros(len(test_df), dtype=np.float32)
    user_roll30_mean_array = np.zeros(len(test_df), dtype=np.float32)
    
    user_roll15_std_array = np.zeros(len(test_df), dtype=np.float32)
    user_roll30_std_array = np.zeros(len(test_df), dtype=np.float32)

    user_roll15_count_array = np.zeros(len(test_df), dtype=np.int32)
    user_roll30_count_array = np.zeros(len(test_df), dtype=np.int32)
    
    lag_1 = np.zeros(len(test_df), dtype=np.float32)
    lag_2 = np.zeros(len(test_df), dtype=np.float32)
    lag_3 = np.zeros(len(test_df), dtype=np.float32)
            
    # attempt
    attempt_array = np.zeros(len(test_df), dtype=np.int8)
    
    # woe
    woe_content_array = np.zeros(len(test_df), dtype=np.float32)
    woe_part_array = np.zeros(len(test_df), dtype=np.float32)

    # skip_task
    skip_task_array = np.zeros(len(test_df), dtype=np.int8)

    # timelag_update
    timestamp_array = np.zeros(len(test_df), dtype=np.float64)
    # timelag_2 -> timelag_6
    timelag_2 = np.zeros(len(test_df), dtype = np.float32)
    timelag_3 = np.zeros(len(test_df), dtype = np.float32)
    timelag_4 = np.zeros(len(test_df), dtype = np.float32)
    timelag_5 = np.zeros(len(test_df), dtype = np.float32)
    timelag_6 = np.zeros(len(test_df), dtype = np.float32)

    # is_bundle
    is_bundle_array = np.zeros(len(test_df), dtype = np.int8)

    # content_part - bin
    content_bin_mean_array = np.zeros(len(test_df), dtype=np.float32)
    content_bin_count_array = np.zeros(len(test_df), dtype=np.int32)
    content_bin_std_array = np.zeros(len(test_df), dtype=np.float32)

    part_bin_mean_array = np.zeros(len(test_df), dtype=np.float32)
    part_bin_count_array = np.zeros(len(test_df), dtype=np.int64)
    part_bin_std_array = np.zeros(len(test_df), dtype=np.float32)

    # elapsed time
    elapsed_time_user_sum_true_array = np.zeros(len(test_df), dtype=np.float64)
    elapsed_time_user_count_true_array = np.zeros(len(test_df), dtype=np.float32)
    elapsed_time_user_sum_false_array = np.zeros(len(test_df), dtype=np.float64)
    elapsed_time_user_count_false_array = np.zeros(len(test_df), dtype=np.float32)

    question_timelag_median_array = np.zeros(len(test_df), dtype=np.float64)

    explanation_sum_array = np.zeros(len(test_df), dtype=np.float32)
    explanation_count_array = np.zeros(len(test_df), dtype=np.float32)
    
    # user_content_part - attempt    
    user_attempt_sum_array = np.zeros(len(test_df), dtype=np.int32)
    user_attempt_count_array = np.zeros(len(test_df), dtype=np.int32)

    content_attempt_sum_array = np.zeros(len(test_df), dtype=np.int64)
    content_attempt_count_array = np.zeros(len(test_df), dtype=np.int64)

    part_attempt_sum_array = np.zeros(len(test_df), dtype=np.int64)
    part_attempt_count_array = np.zeros(len(test_df), dtype=np.int64)

    user_explanation_sum_array = np.zeros(len(test_df), dtype=np.float32)
    user_explanation_count_array = np.zeros(len(test_df), dtype=np.float32)

    # timelag_part
    timelag_part_1 = np.zeros(len(test_df), dtype = np.float32)
    timelag_part_2 = np.zeros(len(test_df), dtype = np.float32)
    timelag_part_3 = np.zeros(len(test_df), dtype = np.float32)

    # lag_part
    lag_part_1 = np.zeros(len(test_df), dtype=np.float32)
    lag_part_2 = np.zeros(len(test_df), dtype=np.float32)
    lag_part_3 = np.zeros(len(test_df), dtype=np.float32)

    lag_part_mean = np.zeros(len(test_df), dtype=np.float32)

    # user_bundle_count & user_bundle_question_ratio
    user_bundle_count = np.zeros(len(test_df), dtype=np.int32)

    # session feat
    user_session = np.zeros(len(test_df), dtype=np.int32)
    user_time_accumulate = np.zeros(len(test_df), dtype=np.float32)
    
    for i, (user_id, content_id,explanation,task_container_id,elapsed_time,timestamp) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values, \
                                                                                test_df['prior_question_had_explanation'].values, test_df['task_container_id'].values, \
                                                                                test_df['prior_question_elapsed_time'].values, \
                                                                                test_df['timestamp'].values)):
        part_id = questions_part_dict[content_id]

        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]

        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        
        user_part_sum[i] = user_part_sum_dict[(user_id,part_id)]
        user_part_count[i] = user_part_count_dict[(user_id,part_id)]
        
        user_lecture_part_count_array[i] = user_lecture_part_count_dict[(user_id,part_id)]
        user_solving_count_array[i] = user_solving_count_dict[user_id]
        user_concept_count_array[i] = user_concept_count_dict[user_id]
        user_lecture_count_array[i] = user_lecture_count_dict[user_id]
        
        user_roll15_mean_array[i] = np.mean(user_rolling_sum_dict[user_id][-15:])
        user_roll30_mean_array[i] = np.mean(user_rolling_sum_dict[user_id][-30:])
        user_roll15_std_array[i] = np.std(user_rolling_sum_dict[user_id][-15:])
        user_roll30_std_array[i] = np.std(user_rolling_sum_dict[user_id][-30:])
        user_roll15_count_array[i] = len(user_rolling_sum_dict[user_id][-15:])
        user_roll30_count_array[i] = len(user_rolling_sum_dict[user_id][-30:])
        
        if user_id in user_rolling_sum_dict:
            if len(user_rolling_sum_dict[user_id]) >= 3:
                lag_1[i] = user_rolling_sum_dict[user_id][-1]
                lag_2[i] = user_rolling_sum_dict[user_id][-2]
                lag_3[i] = user_rolling_sum_dict[user_id][-3]
            elif len(user_rolling_sum_dict[user_id]) == 2:
                lag_1[i] = user_rolling_sum_dict[user_id][-1]
                lag_2[i] = user_rolling_sum_dict[user_id][-2]
                lag_3[i] = np.nan
            elif len(user_rolling_sum_dict[user_id]) == 1:
                lag_1[i] = user_rolling_sum_dict[user_id][-1]
                lag_2[i] = np.nan
                lag_3[i] = np.nan
            elif len(user_rolling_sum_dict[user_id]) == 0:
                lag_1[i] = np.nan
                lag_2[i] = np.nan
                lag_3[i] = np.nan
        else:
            lag_1[i] = np.nan
            lag_2[i] = np.nan
            lag_3[i] = np.nan
        
        if user_id in users_dict:
            if users_dict[user_id][content_id] == 1:
                attempt_array[i] = 1
            else:
                attempt_array[i] = 0
        else:
            attempt_array[i] = 0
        
        # woe
        woe_content_array[i] = woe_content_dict[content_id]
        woe_part_array[i] = woe_part_dict[part_id]

        # skip_task
        skip_task_array[i] = last_task_dict[user_id] > task_container_id

        # timelag_update
        timestamp_array[i] = (timestamp - time_lag_max_dict[user_id])/user_task_container_count_dict[user_id]
        # timelag_2 -> timelag_6    
        if user_id in timestamp_user_dict:
            if len(timestamp_user_dict[user_id]) >= 6:
                timelag_2[i] = (timestamp - timestamp_user_dict[user_id][4])/user_task_container_count_dict[user_id]
                timelag_3[i] = (timestamp - timestamp_user_dict[user_id][3])/user_task_container_count_dict[user_id]
                timelag_4[i] = (timestamp - timestamp_user_dict[user_id][2])/user_task_container_count_dict[user_id]
                timelag_5[i] = (timestamp - timestamp_user_dict[user_id][1])/user_task_container_count_dict[user_id]
                timelag_6[i] = (timestamp - timestamp_user_dict[user_id][0])/user_task_container_count_dict[user_id]
            elif len(timestamp_user_dict[user_id]) == 5:
                timelag_2[i] = (timestamp - timestamp_user_dict[user_id][3])/user_task_container_count_dict[user_id]
                timelag_3[i] = (timestamp - timestamp_user_dict[user_id][2])/user_task_container_count_dict[user_id]
                timelag_4[i] = (timestamp - timestamp_user_dict[user_id][1])/user_task_container_count_dict[user_id]
                timelag_5[i] = (timestamp - timestamp_user_dict[user_id][0])/user_task_container_count_dict[user_id]
                timelag_6[i] = np.nan
            elif len(timestamp_user_dict[user_id]) == 4:
                timelag_2[i] = (timestamp - timestamp_user_dict[user_id][2])/user_task_container_count_dict[user_id]
                timelag_3[i] = (timestamp - timestamp_user_dict[user_id][1])/user_task_container_count_dict[user_id]
                timelag_4[i] = (timestamp - timestamp_user_dict[user_id][0])/user_task_container_count_dict[user_id]
                timelag_5[i] = np.nan
                timelag_6[i] = np.nan
            elif len(timestamp_user_dict[user_id]) == 3:
                timelag_2[i] = (timestamp - timestamp_user_dict[user_id][1])/user_task_container_count_dict[user_id]
                timelag_3[i] = (timestamp - timestamp_user_dict[user_id][0])/user_task_container_count_dict[user_id]
                timelag_4[i] = np.nan
                timelag_5[i] = np.nan
                timelag_6[i] = np.nan
            elif len(timestamp_user_dict[user_id]) == 2:
                timelag_2[i] = (timestamp - timestamp_user_dict[user_id][0])/user_task_container_count_dict[user_id]
                timelag_3[i] = np.nan
                timelag_4[i] = np.nan
                timelag_5[i] = np.nan
                timelag_6[i] = np.nan

            elif len(timestamp_user_dict[user_id]) <= 1:
                timelag_2[i] = np.nan
                timelag_3[i] = np.nan
                timelag_4[i] = np.nan
                timelag_5[i] = np.nan
                timelag_6[i] = np.nan
        else:
            timelag_2[i] = np.nan
            timelag_3[i] = np.nan
            timelag_4[i] = np.nan
            timelag_5[i] = np.nan
            timelag_6[i] = np.nan

        # is_bundle
        is_bundle_array[i] = is_bundle_dict[content_id]

        # convert task_container to cut
        # convert_task_bin = task_bin_dict[task_container_id]
        convert_task_bin = int(task_container_id <= 30)
        content_bin_mean_array[i] = content_bin_mean_dict[(content_id,convert_task_bin)]
        content_bin_count_array[i] = content_bin_count_dict[(content_id,convert_task_bin)]
        content_bin_std_array[i] = content_bin_std_dict[(content_id,convert_task_bin)]

        part_bin_mean_array[i] = part_bin_mean_dict[(part_id,convert_task_bin)]
        part_bin_count_array[i] = part_bin_count_dict[(part_id,convert_task_bin)]
        part_bin_std_array[i] = part_bin_std_dict[(part_id,convert_task_bin)]

        # elapsed_time_user
        elapsed_time_user_sum_true_array[i] = elapsed_time_user_sum_dict[(user_id,1)]
        elapsed_time_user_count_true_array[i] = elapsed_time_user_count_dict[(user_id,1)]
        elapsed_time_user_sum_false_array[i] = elapsed_time_user_sum_dict[(user_id,0)]
        elapsed_time_user_count_false_array[i] = elapsed_time_user_count_dict[(user_id,0)]

        question_timelag_median_array[i] = question_timelag_dict[content_id]

        explanation_sum_array[i] = explanation_agg_sum_dict[(user_id,explanation)]
        explanation_count_array[i] = explanation_agg_count_dict[(user_id,explanation)]

        # user_content_part attempt
        attempt_binary_id = attempt_array[i]
        user_attempt_sum_array[i] = user_attempt_sum_dict[(user_id,attempt_binary_id)]
        user_attempt_count_array[i] = user_attempt_count_dict[(user_id,attempt_binary_id)]

        content_attempt_sum_array[i] = content_attempt_sum_dict[(content_id,attempt_binary_id)]
        content_attempt_count_array[i] = content_attempt_count_dict[(content_id,attempt_binary_id)]

        part_attempt_sum_array[i] = part_attempt_sum_dict[(part_id,attempt_binary_id)]
        part_attempt_count_array[i] = part_attempt_count_dict[(part_id,attempt_binary_id)]

        user_explanation_sum_array[i] = user_explanation_sum_dict[user_id]
        user_explanation_count_array[i] = user_explanation_count_dict[user_id]

        # timelag_part
        timelag_part = timestamp_part_user_dict[(user_id, part_id)]
        if len(timelag_part) >= 3:
            timelag_part_1[i] = (timestamp - timelag_part[-1])/user_task_container_count_dict[user_id]
            timelag_part_2[i] = (timestamp - timelag_part[-2])/user_task_container_count_dict[user_id]
            timelag_part_3[i] = (timestamp - timelag_part[-3])/user_task_container_count_dict[user_id]
        elif len(timelag_part) == 2:
            timelag_part_1[i] = (timestamp - timelag_part[-1])/user_task_container_count_dict[user_id]
            timelag_part_2[i] = (timestamp - timelag_part[-2])/user_task_container_count_dict[user_id]
            timelag_part_3[i] = np.nan
        elif len(timelag_part) == 1:
            timelag_part_1[i] = (timestamp - timelag_part[-1])/user_task_container_count_dict[user_id]
            timelag_part_2[i] = np.nan
            timelag_part_3[i] = np.nan
        elif len(timelag_part) == 0:
            timelag_part_1[i] = np.nan
            timelag_part_2[i] = np.nan
            timelag_part_3[i] = np.nan

        # lag_part
        lag_part = user_part_lag_dict[(user_id,part_id)]
        if len(lag_part) >= 3:
            lag_part_1[i] = lag_part[-1]
            lag_part_2[i] = lag_part[-2]
            lag_part_3[i] = lag_part[-3]
        elif len(lag_part) == 2:
            lag_part_1[i] = lag_part[-1]
            lag_part_2[i] = lag_part[-2]
            lag_part_3[i] = np.nan
        elif len(lag_part) == 1:
            lag_part_1[i] = lag_part[-1]
            lag_part_2[i] = np.nan
            lag_part_3[i] = np.nan
        elif len(lag_part) == 0:
            lag_part_1[i] = np.nan
            lag_part_2[i] = np.nan
            lag_part_3[i] = np.nan
        
        lag_part_mean[i] = np.mean(lag_part)
        # user_bundle_count & user_bundle_question_ratio
        user_bundle_count[i] = user_bundle_count_dict[user_id]

        # user_session, user_time_accumulate
        if timestamp_array[i]/1000 > 60*60:
            last_session_dict[user_id] += 1
            user_session[i] = last_session_dict[user_id]
        else:
            user_session[i] = last_session_dict[user_id]

        if timestamp_array[i]/1000 > 5*60: # 1 cau hoi toi da 5 mins
            user_time_accumulate_dict[user_id] += 5*60
        elif timestamp_array[i]/1000 >= 0:
            user_time_accumulate_dict[user_id] += timestamp_array[i]/1000
        else:
            user_time_accumulate_dict[user_id] = 0
        user_time_accumulate[i] = user_time_accumulate_dict[user_id]
    
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    test_df['user_correctness'] = user_sum / user_count
    test_df['user_part_correctness'] = user_part_sum / user_part_count
    
    test_df['user_count'] = user_count
    test_df['user_sum'] = user_sum
    test_df['user_part_count'] = user_part_count
    test_df['user_part_sum'] = user_part_sum

    test_df['user_lecture_part_count'] = user_lecture_part_count_array
    test_df['user_solving_count'] = user_solving_count_array
    test_df['user_concept_count'] = user_concept_count_array
    test_df['user_lecture_count'] = user_lecture_count_array
    
    # rolling
    test_df['user_roll15_correctness'] = user_roll15_mean_array
    test_df['user_roll15_std'] =user_roll15_std_array
    test_df['user_roll15_count'] =user_roll15_count_array

    test_df['user_roll30_correctness'] =user_roll30_mean_array
    test_df['user_roll30_std'] =user_roll30_std_array
    test_df['user_roll30_count'] =user_roll30_count_array

    test_df['user_roll15_ircorrect'] = 1 - test_df['user_roll15_correctness']
    test_df['user_roll30_ircorrect'] = 1 - test_df['user_roll30_correctness']
    test_df['correct_ircorrect_roll15_ratio'] = test_df['user_roll15_correctness'] / test_df['user_roll15_ircorrect']
    test_df['correct_ircorrect_roll30_ratio'] = test_df['user_roll30_correctness'] / test_df['user_roll30_ircorrect']
    
    test_df['lag_1'] = lag_1
    test_df['lag_2'] = lag_2
    test_df['lag_3'] = lag_3

    # attempt
    test_df['attempt_binary'] = attempt_array
    
    # woe
    test_df['WOE_content_id'] = woe_content_array
    test_df['WOE_part'] = woe_part_array

    # skip_task
    test_df['skip_task'] = skip_task_array

    # timelag_update timelag_2 -> timelag_6
    test_df['timelag_update'] = timestamp_array
    test_df['timelag_2'] = timelag_2
    test_df['timelag_3'] = timelag_3
    test_df['timelag_4'] = timelag_4
    test_df['timelag_5'] = timelag_5
    test_df['timelag_6'] = timelag_6

    # is_bundle
    test_df['is_bundle'] = is_bundle_array

    # content_part - bin general
    test_df['content_bin_mean'] = content_bin_mean_array
    test_df['content_bin_count'] = content_bin_count_array
    test_df['content_bin_std'] = content_bin_std_array
    test_df['part_bin_mean'] = part_bin_mean_array
    test_df['part_bin_count'] = part_bin_count_array
    test_df['part_bin_std'] = part_bin_std_array
    
    # elapsed_time user loop
    elapsed_time_user_mean_true_array = elapsed_time_user_sum_true_array / elapsed_time_user_count_true_array
    elapsed_time_user_mean_false_array = elapsed_time_user_sum_false_array / elapsed_time_user_count_false_array
    elapsed_time_user_mean_array = (elapsed_time_user_sum_true_array+elapsed_time_user_sum_false_array) / (elapsed_time_user_count_true_array+elapsed_time_user_count_false_array)

    test_df['elapsed_time_user_mean_true'] = elapsed_time_user_mean_true_array
    test_df['elapsed_time_user_mean_false'] = elapsed_time_user_mean_false_array
    test_df['elapsed_time_user_mean'] = elapsed_time_user_mean_array
    test_df['elapsed_time_user_diff_true'] = test_df['timelag_update'] - test_df['elapsed_time_user_mean_true']
    test_df['elapsed_time_user_diff_false'] = test_df['timelag_update'] - test_df['elapsed_time_user_mean_false']
    test_df['elapsed_time_user_diff'] = test_df['timelag_update'] - test_df['elapsed_time_user_mean']

    # question_timelag_median, question_timelag_diff
    test_df['question_timelag_median'] = question_timelag_median_array
    test_df['question_timelag_diff'] = test_df['timelag_update'] - test_df['question_timelag_median']

    test_df['explanation_sum'] = explanation_sum_array
    test_df['explanation_count'] = explanation_count_array
    test_df['user_explanation_correctness'] = test_df['explanation_sum'] / test_df['explanation_count']

#     test_df['elapsed_time_user_mean'] = elapsed_time_user_sum_array / elapsed_time_user_count_array
#     test_df['elapsed_time_question_mean'] = elapsed_time_question_sum_array / elapsed_time_question_count_array
#     test_df['elapsed_time_user_diff'] = test_df['prior_question_elapsed_time'] - test_df['elapsed_time_user_mean']
#     test_df['elapsed_time_question_diff'] = test_df['prior_question_elapsed_time'] - test_df['elapsed_time_question_mean']

    test_df['question_time_mean_false_diff'] = test_df['timelag_update'] - test_df['question_time_mean_false']
    test_df['question_time_mean_true_diff'] = test_df['timelag_update'] - test_df['question_time_mean_true']
    test_df['question_time_median_false_diff'] = test_df['timelag_update'] - test_df['question_time_median_false']
    test_df['question_time_median_true_diff'] = test_df['timelag_update'] - test_df['question_time_median_true']

    test_df['part_time_mean_false_diff'] = test_df['timelag_update'] - test_df['part_time_mean_false']
    test_df['part_time_mean_true_diff'] = test_df['timelag_update'] - test_df['part_time_mean_true']
    test_df['part_time_median_false_diff'] = test_df['timelag_update'] - test_df['part_time_median_false']
    test_df['part_time_median_true_diff'] = test_df['timelag_update'] - test_df['part_time_median_true']
    
    # user_content_part - attempt 
    test_df['user_attempt_sum']=user_attempt_sum_array
    test_df['user_attempt_count']=user_attempt_count_array
    test_df['content_attempt_sum']=content_attempt_sum_array
    test_df['content_attempt_count']=content_attempt_count_array
    test_df['part_attempt_sum']=part_attempt_sum_array
    test_df['part_attempt_count']=part_attempt_count_array
    test_df['user_attempt_mean'] = user_attempt_sum_array / user_attempt_count_array
    test_df['content_attempt_mean'] = content_attempt_sum_array / content_attempt_count_array
    test_df['part_attempt_mean'] = part_attempt_sum_array / part_attempt_count_array
    
    '''test_df['user_explanation_sum'] = user_explanation_sum_array
    test_df['user_explanation_count'] = user_explanation_count_array'''
    test_df['user_explanation_mean'] = user_explanation_sum_array / user_explanation_count_array

    # timelag_part
    test_df['timelag_part_1'] = timelag_part_1
    test_df['timelag_part_2'] = timelag_part_2
    test_df['timelag_part_3'] = timelag_part_3

    # lag_part
    test_df['lag_part_1'] = lag_part_1
    test_df['lag_part_2'] = lag_part_2
    test_df['lag_part_3'] = lag_part_3
    test_df['lag_part_mean'] = lag_part_mean

    # user_bundle_count & user_bundle_question_ratio
    test_df['user_bundle_count'] = user_bundle_count
    test_df['user_bundle_question_ratio'] = test_df['user_bundle_count'] / test_df['user_count']

    # session feats
    test_df['user_session'] = user_session
    test_df['user_time_accumulate'] = user_time_accumulate

    # FILLNA
    test_df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean, inplace = True)
    test_df['user_correctness'].fillna(0.65, inplace = True)
    test_df.timelag_update.fillna(-999, inplace = True)
    test_df['question_time_mean_false_diff'].fillna(-999, inplace = True)
    test_df['question_time_mean_true_diff'].fillna(-999, inplace = True)
    test_df['question_time_median_false_diff'].fillna(-999, inplace = True)
    test_df['question_time_median_true_diff'].fillna(-999, inplace = True)

    test_df['part_time_mean_false_diff'].fillna(-999, inplace = True)
    test_df['part_time_mean_true_diff'].fillna(-999, inplace = True)
    test_df['part_time_median_false_diff'].fillna(-999, inplace = True)
    test_df['part_time_median_true_diff'].fillna(-999, inplace = True)

    test_df['elapsed_time_user_diff_true'].fillna(-999, inplace = True)
    test_df['elapsed_time_user_diff_false'].fillna(-999, inplace = True)
    
    test_df.lag_1.fillna(-999, inplace = True)
    test_df.lag_2.fillna(-999, inplace = True)
    test_df.lag_3.fillna(-999, inplace = True)

    # HMEAN
    test_df['hmean_user_content_accuracy'] = 2 * (
        (test_df['user_correctness'] * test_df['content_id']) /
        (test_df['user_correctness'] + test_df['content_id'])
    )
    
    test_df['hmean_user_part_accuracy'] = 2 * (
        (test_df['user_correctness'] * test_df['part_correct']) /
        (test_df['user_correctness'] + test_df['part_correct'])
    )
    
    test_df['hmean_user_content_accuracy_roll15'] = 2 * (
        (test_df['user_roll15_correctness'] * test_df['content_id']) /
        (test_df['user_roll15_correctness'] + test_df['content_id'])
    )
    
#     test_df['hmean_user_content_accuracy_roll30'] = 2 * (
#         (test_df['user_roll15_correctness'] * test_df['content_id']) /
#         (test_df['user_roll15_correctness'] + test_df['content_id'])
#     )
    
    test_df['hmean_user_content_bin_accuracy'] = 2 * (
        (test_df['user_correctness'] * test_df['content_bin_mean']) /
        (test_df['user_correctness'] + test_df['content_bin_mean'])
    )
    
    test_df['hmean_user_part_bin_accuracy'] = 2 * (
        (test_df['user_correctness'] * test_df['part_bin_mean']) /
        (test_df['user_correctness'] + test_df['part_bin_mean'])
    )
    
    test_df['hmean_user_content_bin_accuracy_roll15'] = 2 * (
        (test_df['user_roll15_correctness'] * test_df['content_bin_mean']) /
        (test_df['user_roll15_correctness'] + test_df['content_bin_mean'])
    )
    
    
    # LINH'S ADDING FEATURES
    test2_df = add_user_features_without_update(test2_df)
    test2_df = add_other_features(test2_df)
    
    
    # PREDICT
    test_df[target] = 0.6*model_Son.predict(test_df[features]) + 0.4*model_Linh.predict(test2_df[features_Linh])
    # kaggle
    env.predict(test_df[['row_id', target]])

#     # emulator
#     set_predict(test_df.loc[:,['row_id', 'answered_correctly']])
#     pbar.update(len(test_df))

  out=out, **kwargs)
  keepdims=keepdims)


CPU times: user 4.13 s, sys: 80.8 ms, total: 4.21 s
Wall time: 1.41 s
