# Inference notebook

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
from matplotlib import cm
import gc 
from collections import defaultdict
from tqdm.notebook import tqdm
import pickle

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [2]:
%%time


# lectures = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')
# questions = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')

def read_data(feature_engineering, nbr_row=14):
    """Load the cv1 train/valid splits, keep question rows and impute missing values.

    Parameters
    ----------
    feature_engineering : bool
        When truthy, only the last ``nbr_row`` million rows of the training
        pickle are kept (before lecture rows are filtered out).
    nbr_row : int, float or numeric str, default 14
        Size of the training tail, expressed in millions of rows. Fractional
        values are accepted (``0.5`` means 500,000 rows). ``0`` keeps every row.

    Returns
    -------
    (df, df_valid, prior_question_elapsed_time_mean)
        The two frames restricted to ``content_type_id == 0`` with a fresh
        index, and the mean of ``prior_question_elapsed_time`` computed on the
        (possibly truncated) training frame. That mean fills missing values in
        both frames; missing ``prior_question_had_explanation`` becomes 0.
    """
    needed_columns = ['timestamp',
                      'user_id',
                      'answered_correctly',
                      'content_id',
                      'content_type_id',
                      'prior_question_elapsed_time',
                      'prior_question_had_explanation',
#                     'task_container_id'
                      ]

    df = pd.read_pickle('../input/riiid-cross-validation-files/cv1_train.pickle')[needed_columns]
    df_valid = pd.read_pickle('../input/riiid-cross-validation-files/cv1_valid.pickle')[needed_columns]

    #--------------------------
    # Take part of the dataset

    if feature_engineering:
        # Plain arithmetic instead of gluing '000000' onto str(nbr_row): the
        # string trick raised ValueError for fractional inputs such as 0.5.
        row_limit = int(round(float(nbr_row) * 1000000))
        df = df.iloc[-row_limit:]

    # Only question rows (content_type_id == 0) are kept.
    df = df[df.content_type_id == 0].reset_index(drop=True)
    df_valid = df_valid[df_valid.content_type_id == 0].reset_index(drop=True)

    # Fill prior question elapsed time with the training mean (mean() already skips NaN).
    prior_question_elapsed_time_mean = df['prior_question_elapsed_time'].mean()

    for frame in (df, df_valid):
        frame['prior_question_elapsed_time'] = (
            frame['prior_question_elapsed_time']
            .fillna(value=prior_question_elapsed_time_mean)
            .astype('int32')
        )
        frame['prior_question_had_explanation'] = (
            frame['prior_question_had_explanation']
            .fillna(value=0)
            .astype('int8')
        )

    return df, df_valid, prior_question_elapsed_time_mean


# Load the last 30 million training rows plus the validation split, keeping the
# training mean of prior_question_elapsed_time (it is pickled in the next cell).
train, valid, prior_question_elapsed_time_mean= read_data(True, 30)


# Run a garbage-collection pass after loading.
gc.collect()

CPU times: user 4.05 s, sys: 8.82 s, total: 12.9 s
Wall time: 43.9 s


0

In [3]:
# Persist the training mean so it survives the %reset below; the context manager
# guarantees the file is flushed and closed before the namespace is cleared.
with open('prior_question_elapsed_time_mean.p', 'wb') as mean_file:
    pickle.dump(prior_question_elapsed_time_mean, mean_file)

In [4]:
# Clear every name in the interactive namespace without prompting (-f), including
# the imported libraries, so everything needed afterwards must be imported again.
%reset -f

# Submit 

In [5]:
import time
# NOTE(review): presumably a pause to let memory settle after the %reset above — TODO confirm the 10 s wait is still needed.
time.sleep(10)


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
from matplotlib import cm
import gc 
from collections import defaultdict
from tqdm.notebook import tqdm
import pickle
import lightgbm as lgb
from sklearn.metrics import roc_auc_score


# Training mean used to impute missing prior_question_elapsed_time values.
# Context managers close the handles that pickle.load(open(...)) used to leak.
with open("prior_question_elapsed_time_mean.p", "rb") as mean_file:
    prior_question_elapsed_time_mean = pickle.load(mean_file)

# Column names handed to model.predict in the inference loop.
with open("../input/riiid-loop-lgbm-main-dish-preparation-of-the-meal/features.p", "rb") as features_file:
    features = pickle.load(features_file)

# 'correct_prob' is dropped from the feature list unless `include` is switched on.
include = False
if not include:
    features.remove('correct_prob')

target = 'answered_correctly'

In [6]:
from collections import defaultdict
from tqdm.notebook import tqdm

def add_user_feats_without_update(df, answered_correctly_u_sum_dict, u_count_dict, answered_correctly_q_sum_dict, q_count_dict,elapse_time_u_sum, 
                                  had_explanation_u_sum, timestamp_u_dict, timestamp_u_incorrect_dict, 
                                  elapse_time_q_sum_dict, part_answered_correctly_dict, part_count_dict, bundle_answered_correctly_dict, 
                                  bundle_count_dict, timestamp_u_correct_dict, 
                                 answered_correctly_uq_attempt_dict, user_part_sum_dict, user_part_count_dict): 
    """Compute history features for every row of ``df`` from the running totals.

    The current rows are NOT added to the totals (``update_features`` does that
    once the answers are known). Lookups use plain ``dict[key]`` indexing, so
    the dictionaries must supply defaults for unseen keys (e.g. ``defaultdict``);
    such lookups may create empty entries for keys seen for the first time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``content_id``, ``prior_question_elapsed_time``,
        ``prior_question_had_explanation``, ``timestamp``, ``part`` and
        ``bundle_id``. The feature columns are concatenated positionally, so the
        index is expected to be ``0..len(df)-1``.
    *_dict / *_sum arguments
        Running sums and counts keyed by user, question, part or bundle;
        ``timestamp_u_*`` map a user to a list of past timestamps (oldest first);
        the last three arguments are nested ``user -> key -> value`` mappings.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the feature columns. Every ratio whose count is zero, every
        missing recency, every +/-inf and every other NaN in the frame is
        returned as 0.
    """
    n_rows = len(df)

    def _empty_feature():
        # NaN marks "no history"; the final fillna turns it into 0.
        return np.full(n_rows, np.nan, dtype=np.float32)

    # User feats
    answered_correctly_u_avg = _empty_feature()
    elapse_time_u_avg = _empty_feature()
    had_explanation_u_avg = _empty_feature()
    # attempts (always written inside the loop)
    answered_correctly_uq_attempt = np.zeros(n_rows, dtype=np.float32)

    # Questions feats
    answered_correctly_q_avg = _empty_feature()
    elapse_time_q_avg = _empty_feature()

    # timestamp features
    timestamp_u_recency_1 = _empty_feature()
    timestamp_u_recency_2 = _empty_feature()
    timestamp_u_recency_3 = _empty_feature()
    timestamp_u_incorrect_recency = _empty_feature()
    timestamp_u_correct_recency = _empty_feature()
    # recency_k is the time elapsed since the k-th most recent interaction.
    recency_slots = (timestamp_u_recency_1, timestamp_u_recency_2, timestamp_u_recency_3)

    #part
    part_avg = _empty_feature()
    bundle_avg = _empty_feature()
    user_part_avg = _empty_feature()

    #---------------------------------------------------

    # The two prior_question_* columns are not read below, but stay in the
    # selection so the required-column contract of the function is unchanged.
    rows = df[['user_id', 'content_id', 'prior_question_elapsed_time', 
               'prior_question_had_explanation', 'timestamp', 'part', 'bundle_id']].values

    for cnt, row in enumerate(tqdm(rows)):
        user, content, now, part, bundle = row[0], row[1], row[4], row[5], row[6]

        # User calculation
        user_count = u_count_dict[user]
        if user_count != 0:
            answered_correctly_u_avg[cnt] = answered_correctly_u_sum_dict[user] / user_count
            elapse_time_u_avg[cnt] = elapse_time_u_sum[user] / user_count
            had_explanation_u_avg[cnt] = had_explanation_u_sum[user] / user_count

        # timestamp: walk the history newest-first and fill as many of the three
        # slots as there are entries. Histories longer than three entries use
        # their three most recent timestamps instead of being silently ignored.
        for slot, past in zip(recency_slots, reversed(timestamp_u_dict[user])):
            slot[cnt] = now - past

        last_incorrect = timestamp_u_incorrect_dict[user]
        if len(last_incorrect) != 0:
            timestamp_u_incorrect_recency[cnt] = now - last_incorrect[0]

        last_correct = timestamp_u_correct_dict[user]
        if len(last_correct) != 0:
            timestamp_u_correct_recency[cnt] = now - last_correct[0]

        #---------------------------------------------------
        # Question calculation
        question_count = q_count_dict[content]
        if question_count != 0:
            answered_correctly_q_avg[cnt] = answered_correctly_q_sum_dict[content] / question_count
            elapse_time_q_avg[cnt] = elapse_time_q_sum_dict[content] / question_count

        #---------------------------------------------------
        # Part calculation
        part_count = part_count_dict[part]
        if part_count != 0:
            part_avg[cnt] = part_answered_correctly_dict[part] / part_count

        bundle_count = bundle_count_dict[bundle]
        if bundle_count != 0:
            bundle_avg[cnt] = bundle_answered_correctly_dict[bundle] / bundle_count

        user_part_count = user_part_count_dict[user][part]
        if user_part_count != 0:
            user_part_avg[cnt] = user_part_sum_dict[user][part] / user_part_count

        #----------------------------------------------
        # Attempts of this user on this question so far
        answered_correctly_uq_attempt[cnt] = answered_correctly_uq_attempt_dict[user][content]

    #---------------------------------------------------

    user_feats_df = pd.DataFrame({
        'answered_correctly_avg_u': answered_correctly_u_avg,
        'answered_correctly_uq_attempt': answered_correctly_uq_attempt,
        'question_avg': answered_correctly_q_avg,
        'elapse_time_u_avg':elapse_time_u_avg,
        'had_explanation_u_avg':had_explanation_u_avg,
        'timestamp_u_recency_1':timestamp_u_recency_1,
        'timestamp_u_recency_2':timestamp_u_recency_2,
        'timestamp_u_recency_3':timestamp_u_recency_3,
        'timestamp_u_incorrect_recency':timestamp_u_incorrect_recency,
        'timestamp_u_correct_recency':timestamp_u_correct_recency,
        'elapse_time_q_avg':elapse_time_q_avg,
        'part_avg':part_avg,
        'bundle_avg':bundle_avg,
        'user_part_avg':user_part_avg
        })

    df = pd.concat([df, user_feats_df], axis=1)

    #---------------------------------------------------

    # Infinite and missing values are both reported as 0 to the model.
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    return df


def update_features(df, answered_correctly_u_sum_dict, u_count_dict, answered_correctly_q_sum_dict, q_count_dict,
                   timestamp_u_incorrect_dict, timestamp_u_dict, elapse_time_u_sum, had_explanation_u_sum, elapse_time_q_sum_dict, part_answered_correctly_dict, 
                    part_count_dict, bundle_answered_correctly_dict, bundle_count_dict, timestamp_u_correct_dict, 
                   answered_correctly_uq_attempt_dict, user_part_sum_dict, user_part_count_dict): #
    """Fold a batch of answered rows into the running feature totals (in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``answered_correctly``, ``content_id``,
        ``content_type_id``, ``timestamp``, ``prior_question_elapsed_time``,
        ``prior_question_had_explanation``, ``part`` and ``bundle_id``.
    *_dict / *_sum arguments
        Mutable mappings updated in place. They are indexed with ``dict[key] +=``
        so they must supply defaults for unseen keys (e.g. ``defaultdict``);
        ``timestamp_u_*`` map a user to a list of timestamps; the last three
        arguments are nested ``user -> key -> value`` mappings.

    Returns
    -------
    None
    """
    columns = ['user_id','answered_correctly', 'content_id', 'content_type_id', 'timestamp', 'prior_question_elapsed_time', 
               'prior_question_had_explanation', 'part', 'bundle_id']

    for row in df[columns].values:
        user, correct, content, content_type, now, elapsed, explained, part, bundle = row

        # Per-user and per-question totals only count question rows.
        if content_type == 0:

            answered_correctly_u_sum_dict[user] += correct
            u_count_dict[user] += 1
            elapse_time_q_sum_dict[content] += elapsed

            answered_correctly_q_sum_dict[content] += correct
            # Keyed by content_id. The previous code keyed this counter by the
            # answered_correctly value, so real question counts never advanced
            # while the entries for keys 0 and 1 were inflated.
            q_count_dict[content] += 1

            answered_correctly_uq_attempt_dict[user][content] += 1

        # Keep only the most recent incorrect / correct timestamp per user.
        if correct == 0:
            last_incorrect = timestamp_u_incorrect_dict[user]
            if len(last_incorrect) == 1:
                last_incorrect.pop(0)
            last_incorrect.append(now)

        if correct == 1:
            last_correct = timestamp_u_correct_dict[user]
            if len(last_correct) == 1:
                last_correct.pop(0)
            last_correct.append(now)

        # timestamp: rolling window of the three latest interactions.
        history = timestamp_u_dict[user]
        if len(history) == 3:
            history.pop(0)
        history.append(now)

        elapse_time_u_sum[user] += elapsed
        had_explanation_u_sum[user] += explained

        part_answered_correctly_dict[part] += correct
        part_count_dict[part] += 1

        bundle_answered_correctly_dict[bundle] += correct
        bundle_count_dict[bundle] += 1

        user_part_sum_dict[user][part] += correct
        user_part_count_dict[user][part] += 1


In [7]:
import riiideducation

# Create the competition environment; predictions are submitted through env.predict.
env = riiideducation.make_env()
# Iterator consumed by the inference loop; it yields (test_df, sample_prediction_df) pairs.
iter_test = env.iter_test()

In [8]:
import pickle
#import dict 

# NOTE(review): pickle.load can execute arbitrary code; only load files from datasets you trust.
_PREP_DIR = '../input/riiid-loop-lgbm-main-dish-preparation-of-the-meal/'


def _load_pickle(path):
    """Unpickle ``path`` and close the file handle afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _load_nested_counts(path):
    """Load a pickled ``{key: mapping}`` object into a defaultdict of int-defaultdicts.

    Stored inner mappings are reused as they are; only keys missing from the
    pickle get a fresh ``defaultdict(int)`` on first access.
    """
    nested = defaultdict(lambda: defaultdict(int))
    stored = _load_pickle(path)
    for key in tqdm(stored.keys()):
        nested[key] = stored[key]
    return nested


# attempt
answered_correctly_uq_attempt_dict = _load_nested_counts(_PREP_DIR + 'answered_correctly_uq_attempt.pickle')
gc.collect()

# part user sum
user_part_sum_dict = _load_nested_counts(_PREP_DIR + 'user_part_sum.pickle')
gc.collect()

# part user count
user_part_count_dict = _load_nested_counts(_PREP_DIR + 'user_part_count.pickle')
gc.collect()


# Flat running totals, loaded exactly as they were pickled.
answered_correctly_u_sum_dict = _load_pickle(_PREP_DIR + 'answered_correctly_u_sum_dict.p')

u_count_dict = _load_pickle(_PREP_DIR + 'u_count_dict.p')

answered_correctly_q_sum_dict = _load_pickle(_PREP_DIR + 'answered_correctly_q_sum_dict.p')

q_count_dict = _load_pickle(_PREP_DIR + 'q_count_dict.p')

elapse_time_u_sum = _load_pickle(_PREP_DIR + 'elapse_time_u_sum.p')

had_explanation_u_sum = _load_pickle(_PREP_DIR + 'had_explanation_u_sum.p')

timestamp_u_dict = _load_pickle(_PREP_DIR + 'timestamp_u_dict.p')

timestamp_u_incorrect_dict = _load_pickle(_PREP_DIR + 'timestamp_u_incorrect_dict.p')

elapse_time_q_sum_dict = _load_pickle(_PREP_DIR + 'elapse_time_q_sum_dict.p')

part_answered_correctly_dict = _load_pickle(_PREP_DIR + 'part_answered_correctly_dict.p')

part_count_dict = _load_pickle(_PREP_DIR + 'part_count_dict.p')

bundle_answered_correctly_dict = _load_pickle(_PREP_DIR + 'bundle_answered_correctly_dict.p')

bundle_count_dict = _load_pickle(_PREP_DIR + 'bundle_count_dict.p')

timestamp_u_correct_dict = _load_pickle(_PREP_DIR + 'timestamp_u_correct_dict.p')

# Model 
kmeans_q_avg = _load_pickle("../input/riiid-create-side-dish/kmeans_q_avg.pkl")

HBox(children=(FloatProgress(value=0.0, max=121411.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=121411.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=121411.0), HTML(value='')))




In [9]:
# Add questions
# Question metadata; the inference loop left-joins it onto each batch on content_id == question_id.

question = pd.read_csv('../input/riiid-create-side-dish/questions_complet.csv')[['question_id', 'part', 'bundle_id', 'tags_encoded', 'tag_1', 'tag_2', 'tag_3', 'tags_lsi']]

In [10]:
# Load Model
import joblib

# Pre-trained model whose predict() is called on every batch in the inference loop
# (the file name suggests LightGBM — TODO confirm).
model = joblib.load('../input/riiid-loop-lgbm-a-model-dish/lgbm.pkl')

In [11]:
import ast


def _prepare_question_rows(batch):
    """Keep question rows, join the question metadata and impute missing values.

    Lecture rows (``content_type_id != 0``) are dropped, ``question`` is
    left-joined on ``content_id == question_id``, missing
    ``prior_question_elapsed_time`` is filled with the training mean and
    missing ``prior_question_had_explanation`` with 0.
    """
    batch = batch[batch['content_type_id'] == 0].reset_index(drop=True)
    batch = pd.merge(batch, question, how='left', left_on='content_id', right_on='question_id')
    batch['prior_question_elapsed_time'] = (
        batch['prior_question_elapsed_time'].fillna(value=prior_question_elapsed_time_mean).astype('int32')
    )
    batch['prior_question_had_explanation'] = (
        batch['prior_question_had_explanation'].fillna(value=0).astype('int8')
    )
    return batch


previous_test_set = None

for (test_df, sample_prediction_df) in iter_test:

    if previous_test_set is not None:

        # The first row of each batch carries the answers of the previous batch
        # as text. literal_eval parses that literal without executing arbitrary
        # code, unlike eval.
        previous_test_set[target] = ast.literal_eval(test_df['prior_group_answers_correct'].iloc[0])

        previous_test_set = _prepare_question_rows(previous_test_set)

        # update: fold the now-labelled previous batch into the running totals
        update_features(previous_test_set, answered_correctly_u_sum_dict, u_count_dict, answered_correctly_q_sum_dict, q_count_dict,
                   timestamp_u_incorrect_dict, timestamp_u_dict, elapse_time_u_sum, had_explanation_u_sum, elapse_time_q_sum_dict, part_answered_correctly_dict, part_count_dict, 
                        bundle_answered_correctly_dict, bundle_count_dict, timestamp_u_correct_dict,
                       answered_correctly_uq_attempt_dict, user_part_sum_dict, user_part_count_dict)

    # Keep the raw batch (lectures included) so the next iteration's answers line up row for row.
    previous_test_set = test_df.copy()

    test_df = _prepare_question_rows(test_df)

    #feat add
    test_df = add_user_feats_without_update(test_df, answered_correctly_u_sum_dict, u_count_dict, answered_correctly_q_sum_dict, q_count_dict, 
                                            elapse_time_u_sum, had_explanation_u_sum, timestamp_u_dict, 
                                            timestamp_u_incorrect_dict, elapse_time_q_sum_dict, part_answered_correctly_dict, part_count_dict, 
                                            bundle_answered_correctly_dict, bundle_count_dict, timestamp_u_correct_dict,
                                           answered_correctly_uq_attempt_dict, user_part_sum_dict, user_part_count_dict) 

    #cluster
    test_df['q_avg_cluster'] = kmeans_q_avg.predict(test_df[['question_avg', 'bundle_avg', 'part_avg']].values)
    # NOTE(review): predict() receives three columns, so cluster_centers_ presumably has three
    # columns too; flatten() then maps label j to the j-th *flattened* coordinate rather than to
    # the centre of cluster j. Left unchanged to stay aligned with training — confirm it matches.
    test_df['q_avg_cluster'] = test_df['q_avg_cluster'].replace(dict(enumerate(kmeans_q_avg.cluster_centers_.flatten(), 0)))


#     test_df["task_con"] = (test_df.timestamp != test_df.timestamp.shift()).cumsum()
#     test_df = test_df.groupby(["user_id","task_con"])[features].transform("first")


    test_df['answered_correctly'] = model.predict(test_df[features])
    env.predict(test_df[['row_id', 'answered_correctly']])
    

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33.0), HTML(value='')))


