In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
from collections import defaultdict
import dill
from tqdm.notebook import tqdm
import lightgbm as lgb

# Load data

In [2]:
%%time
columns = ['timestamp', 'user_id', 'content_id', 'content_type_id', 'task_container_id', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation']
train_df = pd.read_pickle('train/cv1_train.pickle')[columns]
valid_df = pd.read_pickle('train/cv1_valid.pickle')[columns]

print('train_df size: {}'.format(train_df.shape))
print('valid_df size: {}'.format(valid_df.shape))

train_df size: (98730332, 8)
valid_df size: (2500000, 8)
CPU times: user 2.12 s, sys: 6.47 s, total: 8.59 s
Wall time: 8.59 s


In [3]:
prior_question_elapsed_time_mean = train_df['prior_question_elapsed_time'].dropna().values.mean()
prior_question_elapsed_time_mean

25439.41

# Load dicts

In [4]:
content_part_dict = dill.load(open('dicts/content_part_dict_file', 'rb'))

In [5]:
lecture_part_dict = dill.load(open('dicts/lecture_part_dict_file', 'rb'))

In [6]:
lecture_type_dict = dill.load(open('dicts/lecture_type_dict_file', 'rb'))

# Loop features

In [9]:
debug = False

if debug:
    train_df = train_df[-2000000:]
    valid_df = valid_df[:10000]
    train_tmp_df = train_df
    valid_tmp_df = valid_df
    train_size = 2000000
    valid_size = 10000
else:
    train_size = train_df.shape[0]
    valid_size = valid_df.shape[0]

## add_content

In [None]:
def add_content(df):
    answered_user_count_arr = np.zeros(len(df), dtype=np.int32)
    answered_correctly_user_sum_arr = np.zeros(len(df), dtype=np.int32)
    answered_correctly_user_mean_arr = np.zeros(len(df), dtype=np.float32)

    for idx, (user_id, content_type_id, answered_correctly) in enumerate(tqdm(df[['user_id', 'content_type_id', 'answered_correctly']].values)):
        if content_type_id == 0:
            # add feature
            answered_user_count_arr[idx] = answered_user_count_dict[user_id]
            answered_correctly_user_sum_arr[idx] = answered_correctly_user_sum_dict[user_id]
            if answered_user_count_arr[idx] != 0:
                answered_correctly_user_mean_arr[idx] = answered_correctly_user_sum_arr[idx] / answered_user_count_arr[idx]
            
            # update dict
            answered_user_count_dict[user_id] += 1
            answered_correctly_user_sum_dict[user_id] += answered_correctly
        else:
            pass
    
    answered_user_count_arr = answered_user_count_arr[-train_size:]
    answered_correctly_user_sum_arr = answered_correctly_user_sum_arr[-train_size:]
    answered_correctly_user_mean_arr = answered_correctly_user_mean_arr[-train_size:]

    user_feats_df = pd.DataFrame({
        'answered_user_count': answered_user_count_arr,
        'answered_correctly_user_sum': answered_correctly_user_sum_arr,
        'answered_correctly_user_mean': answered_correctly_user_mean_arr
    })

    del answered_user_count_arr, answered_correctly_user_sum_arr, answered_correctly_user_mean_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [None]:
%%time
answered_user_count_dict = defaultdict(int)
answered_correctly_user_sum_dict = defaultdict(int)

train_user_content_feats_df = add_content(train_df)
valid_user_content_feats_df = add_content(valid_df)

# train_df = add_content(train_df)
# valid_df = add_content(valid_df)

In [None]:
train_user_content_feats_df

In [None]:
%%time
train_user_content_feats_df.to_pickle('loop_features/train_user_content_feats_df_90M.pkl')
valid_user_content_feats_df.to_pickle('loop_features/valid_user_content_feats_df_90M.pkl')

In [None]:
%%time
dill.dump(answered_user_count_dict, open('dicts/answered_user_count_dict_file', 'wb'))
dill.dump(answered_correctly_user_sum_dict, open('dicts/answered_correctly_user_sum_dict_file', 'wb'))

## add_rolling_stats

In [9]:
def add_rolling_stats(df):
    df = df[-train_size:].reset_index(drop=True) # only features needed in train_size
    
    lag1_arr = np.zeros(len(df), dtype=np.int8)
    lag2_arr = np.zeros(len(df), dtype=np.int8)
    lag3_arr = np.zeros(len(df), dtype=np.int8)
    lag4_arr = np.zeros(len(df), dtype=np.int8)
    lag5_arr = np.zeros(len(df), dtype=np.int8)
    # lag15_arr = np.zeros(len(df), dtype=np.int8)
    # lag30_arr = np.zeros(len(df), dtype=np.int8)
    rolling5_mean_arr = np.zeros(len(df), dtype=np.float32)
    rolling5_std_arr = np.zeros(len(df), dtype=np.float32)
    rolling15_mean_arr = np.zeros(len(df), dtype=np.float32)
    rolling15_std_arr = np.zeros(len(df), dtype=np.float32)
    rolling30_mean_arr = np.zeros(len(df), dtype=np.float32)
    rolling30_std_arr = np.zeros(len(df), dtype=np.float32)
    wrong_answered_user_count_arr = np.zeros(len(df), dtype=np.int8)

    for idx, (user_id, content_type_id, answered_correctly) in enumerate(tqdm(df[['user_id', 'content_type_id', 'answered_correctly']].values)):
        if content_type_id == 0:
            # add feature
            lag1_arr[idx] = lag1_dict[user_id]
            window_id = np.append(window_dict[user_id], lag1_dict[user_id])
            window_id = window_id[-30:]
            if len(window_id) >= 2:
                lag2_arr[idx] = window_id[-2]
            if len(window_id) >= 3:
                lag3_arr[idx] = window_id[-3]
            if len(window_id) >= 4:
                lag4_arr[idx] = window_id[-4]
            if len(window_id) >= 5:
                lag5_arr[idx] = window_id[-5]
            # if len(window_id) >= 15:
            #     lag15_arr[idx] = window_id[-15]
            # if len(window_id) >= 30:
            #     lag30_arr[idx] = window_id[-30]
            rolling5_mean_arr[idx] = np.mean(window_id[-5:])
            rolling5_std_arr[idx] = np.std(window_id[-5:])
            rolling15_mean_arr[idx] = np.mean(window_id[-15:])
            rolling15_std_arr[idx] = np.std(window_id[-15:])
            rolling30_mean_arr[idx] = np.mean(window_id[-30:])
            rolling30_std_arr[idx] = np.std(window_id[-30:])

            if lag1_dict[user_id] == 0:
                # add feature
                wrong_answered_user_count_arr[idx] = wrong_answered_user_count_dict[user_id]
                # update dict
                wrong_answered_user_count_dict[user_id] += 1
            else:
                # update dict
                wrong_answered_user_count_dict[user_id] = 1
            
            # update dict
            window_dict[user_id] = window_id
            lag1_dict[user_id] = answered_correctly
        else:
            pass
    
    user_feats_df = pd.DataFrame({
        'lag1': lag1_arr,
        'lag2': lag2_arr,
        'lag3': lag3_arr,
        'lag4': lag4_arr,
        'lag5': lag5_arr,
        # 'lag15': lag15_arr,
        # 'lag30': lag30_arr,
        'rolling5_mean': rolling5_mean_arr,
        'rolling5_std': rolling5_std_arr,
        'rolling15_mean': rolling15_mean_arr,
        'rolling15_std': rolling15_std_arr,
        'rolling30_mean': rolling30_mean_arr,
        'rolling30_std': rolling30_std_arr,
        'wrong_answered_user_count': wrong_answered_user_count_arr
    })

    del lag1_arr, lag2_arr, lag3_arr, lag4_arr, lag5_arr, rolling5_mean_arr, rolling5_std_arr, rolling15_mean_arr, rolling15_std_arr, rolling30_mean_arr, rolling30_std_arr, wrong_answered_user_count_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [10]:
%%time
lag1_dict = defaultdict(int)
window_dict = defaultdict(list)
wrong_answered_user_count_dict = defaultdict(int)

train_rolling_stats_feats_df = add_rolling_stats(train_df)
valid_rolling_stats_feats_df = add_rolling_stats(valid_df)

# train_df = add_rolling_stats(train_df)
# valid_df = add_rolling_stats(valid_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=98730332.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


CPU times: user 2h 53min 20s, sys: 10min 32s, total: 3h 3min 52s
Wall time: 2h 49min 57s


In [11]:
train_rolling_stats_feats_df

Unnamed: 0,lag1,lag2,lag3,lag4,lag5,rolling5_mean,rolling5_std,rolling15_mean,rolling15_std,rolling30_mean,rolling30_std,wrong_answered_user_count
0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,1,0,0,0,0,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0
2,1,1,0,0,0,0.666667,0.471405,0.666667,0.471405,0.666667,0.471405,0
3,1,1,1,0,0,0.750000,0.433013,0.750000,0.433013,0.750000,0.433013,0
4,1,1,1,1,0,0.800000,0.400000,0.800000,0.400000,0.800000,0.400000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
98730327,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
98730328,1,1,1,1,0,0.800000,0.400000,0.733333,0.442217,0.700000,0.458258,0
98730329,1,1,1,1,1,1.000000,0.000000,0.866667,0.339935,0.733333,0.442217,0
98730330,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0


In [12]:
%%time
train_rolling_stats_feats_df.to_pickle('loop_features/train_rolling_stats_feats_df_90M.pkl')
valid_rolling_stats_feats_df.to_pickle('loop_features/valid_rolling_stats_feats_df_90M.pkl')

CPU times: user 1.24 s, sys: 5.11 s, total: 6.35 s
Wall time: 6.34 s


In [13]:
%%time
dill.dump(lag1_dict, open('dicts/lag1_dict_file', 'wb'))
dill.dump(window_dict, open('dicts/window_dict_file', 'wb'))
dill.dump(wrong_answered_user_count_dict, open('dicts/wrong_answered_user_count_dict_file', 'wb'))

CPU times: user 39.3 s, sys: 906 ms, total: 40.3 s
Wall time: 40.3 s


## add_content_part

In [None]:
def add_content_part(df):
    # lag1_part_arr = np.zeros(len(df), dtype=np.int8)
    answered_user_part1_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part2_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part3_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part4_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part5_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part6_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_user_part7_count_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part1_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part2_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part3_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part4_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part5_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part6_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part7_sum_arr = np.zeros(len(df), dtype=np.int16)
    answered_correctly_user_part_mean_arr = np.zeros(len(df), dtype=np.float32)
    # answered_correctly_user_part_std_arr = np.zeros(len(df), dtype=np.float32)

    for idx, (user_id, content_type_id, content_id, answered_correctly) in enumerate(tqdm(df[['user_id', 'content_type_id', 'content_id', 'answered_correctly']].values)):
        if content_type_id == 0:
            part_id = content_part_dict[content_id]
            # add feature
            # lag1_part_arr[idx] = lag1_part_dict[user_id][part_id]
            answered_user_part1_count_arr[idx] = answered_user_part_count_dict[user_id][1]
            answered_user_part2_count_arr[idx] = answered_user_part_count_dict[user_id][2]
            answered_user_part3_count_arr[idx] = answered_user_part_count_dict[user_id][3]
            answered_user_part4_count_arr[idx] = answered_user_part_count_dict[user_id][4]
            answered_user_part5_count_arr[idx] = answered_user_part_count_dict[user_id][5]
            answered_user_part6_count_arr[idx] = answered_user_part_count_dict[user_id][6]
            answered_user_part7_count_arr[idx] = answered_user_part_count_dict[user_id][7]
            answered_correctly_user_part1_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][1]
            answered_correctly_user_part2_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][2]
            answered_correctly_user_part3_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][3]
            answered_correctly_user_part4_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][4]
            answered_correctly_user_part5_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][5]
            answered_correctly_user_part6_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][6]
            answered_correctly_user_part7_sum_arr[idx] = answered_correctly_user_part_sum_dict[user_id][7]
            if answered_user_part_count_dict[user_id][part_id] != 0:
                answered_correctly_user_part_mean_arr[idx] = answered_correctly_user_part_sum_dict[user_id][part_id] / answered_user_part_count_dict[user_id][part_id]
            # answered_correctly_user_part_std_arr[idx] = np.sqrt((lag1_part_arr[idx] - answered_correctly_user_part_mean_arr[idx])**2)
            
            # update dict
            # lag1_part_dict[user_id][part_id] = answered_correctly
            answered_user_part_count_dict[user_id][part_id] += 1
            answered_correctly_user_part_sum_dict[user_id][part_id] += answered_correctly
        else:
            pass

    # lag1_part_arr = lag1_part_arr[-train_size:]
    answered_user_part1_count_arr = answered_user_part1_count_arr[-train_size:]
    answered_user_part2_count_arr = answered_user_part2_count_arr[-train_size:]
    answered_user_part3_count_arr = answered_user_part3_count_arr[-train_size:]
    answered_user_part4_count_arr = answered_user_part4_count_arr[-train_size:]
    answered_user_part5_count_arr = answered_user_part5_count_arr[-train_size:]
    answered_user_part6_count_arr = answered_user_part6_count_arr[-train_size:]
    answered_user_part7_count_arr = answered_user_part7_count_arr[-train_size:]
    answered_correctly_user_part1_sum_arr = answered_correctly_user_part1_sum_arr[-train_size:]
    answered_correctly_user_part2_sum_arr = answered_correctly_user_part2_sum_arr[-train_size:]
    answered_correctly_user_part3_sum_arr = answered_correctly_user_part3_sum_arr[-train_size:]
    answered_correctly_user_part4_sum_arr = answered_correctly_user_part4_sum_arr[-train_size:]
    answered_correctly_user_part5_sum_arr = answered_correctly_user_part5_sum_arr[-train_size:]
    answered_correctly_user_part6_sum_arr = answered_correctly_user_part6_sum_arr[-train_size:]
    answered_correctly_user_part7_sum_arr = answered_correctly_user_part7_sum_arr[-train_size:]
    answered_correctly_user_part_mean_arr = answered_correctly_user_part_mean_arr[-train_size:]
    # answered_correctly_user_part_std_arr = answered_correctly_user_part_std_arr[-train_size:]

    user_feats_df = pd.DataFrame({
        # 'lag1_part': lag1_part_arr,
        'answered_user_part1_count': answered_user_part1_count_arr,
        'answered_user_part2_count': answered_user_part2_count_arr,
        'answered_user_part3_count': answered_user_part3_count_arr,
        'answered_user_part4_count': answered_user_part4_count_arr,
        'answered_user_part5_count': answered_user_part5_count_arr,
        'answered_user_part6_count': answered_user_part6_count_arr,
        'answered_user_part7_count': answered_user_part7_count_arr,
        'answered_correctly_user_part1_sum': answered_correctly_user_part1_sum_arr,
        'answered_correctly_user_part2_sum': answered_correctly_user_part2_sum_arr,
        'answered_correctly_user_part3_sum': answered_correctly_user_part3_sum_arr,
        'answered_correctly_user_part4_sum': answered_correctly_user_part4_sum_arr,
        'answered_correctly_user_part5_sum': answered_correctly_user_part5_sum_arr,
        'answered_correctly_user_part6_sum': answered_correctly_user_part6_sum_arr,
        'answered_correctly_user_part7_sum': answered_correctly_user_part7_sum_arr,
        'answered_correctly_user_part_mean': answered_correctly_user_part_mean_arr,
        # 'answered_correctly_user_part_std': answered_correctly_user_part_std_arr
    })

    del answered_user_part1_count_arr, answered_user_part2_count_arr, answered_user_part3_count_arr, answered_user_part4_count_arr, answered_user_part5_count_arr, answered_user_part6_count_arr, answered_user_part7_count_arr, answered_correctly_user_part1_sum_arr, answered_correctly_user_part2_sum_arr, answered_correctly_user_part3_sum_arr, answered_correctly_user_part4_sum_arr, answered_correctly_user_part5_sum_arr, answered_correctly_user_part6_sum_arr, answered_correctly_user_part7_sum_arr, answered_correctly_user_part_mean_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [None]:
%%time
answered_user_part_count_dict = defaultdict(lambda: defaultdict(int))
answered_correctly_user_part_sum_dict = defaultdict(lambda: defaultdict(int))

train_user_content_part_feats_df = add_content_part(train_df)
valid_user_content_part_feats_df = add_content_part(valid_df)

# train_df = add_content_part(train_df)
# valid_df = add_content_part(valid_df)

In [None]:
train_user_content_part_feats_df

In [None]:
%%time
train_user_content_part_feats_df.to_pickle('loop_features/train_user_content_part_feats_df_90M.pkl')
valid_user_content_part_feats_df.to_pickle('loop_features/valid_user_content_part_feats_df_90M.pkl')

In [None]:
%%time
dill.dump(answered_user_part_count_dict, open('dicts/answered_user_part_count_dict_file', 'wb'))
dill.dump(answered_correctly_user_part_sum_dict, open('dicts/answered_correctly_user_part_sum_dict_file', 'wb'))

## add_content_bundle (new)

In [8]:
def add_content_bundle(df):
    bundle_user_count_arr = np.zeros(len(df), dtype=np.int32)
    bundle_user_count_bins_arr = np.zeros(len(df), dtype=np.int8)

    for idx, (user_id, content_type_id, content_id) in enumerate(tqdm(df[['user_id', 'content_type_id', 'content_id']].values)):
        if content_type_id == 0:
            # add feature
            bundle_user_count_arr[idx] = bundle_user_count_dict[user_id]
            if bundle_user_count_arr[idx] > 30 and bundle_user_count_arr[idx] <= 125:
                bundle_user_count_bins_arr[idx] = 1
            if bundle_user_count_arr[idx] > 125 and bundle_user_count_arr[idx] <= 487:
                bundle_user_count_bins_arr[idx] = 2
            if bundle_user_count_arr[idx] > 487 and bundle_user_count_arr[idx] <= 1451:
                bundle_user_count_bins_arr[idx] = 3
            if bundle_user_count_arr[idx] > 1451:
                bundle_user_count_bins_arr[idx] = 4

            # update dict
            bundle_user_count_dict[user_id] += 1
        else:
            pass
    
    bundle_user_count_arr = bundle_user_count_arr[-train_size:]
    bundle_user_count_bins_arr = bundle_user_count_bins_arr[-train_size:]

    user_feats_df = pd.DataFrame({
        # 'bundle_user_count': bundle_user_count_arr,
        'bundle_user_count_bins': bundle_user_count_bins_arr
    })

    del bundle_user_count_arr, bundle_user_count_bins_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [12]:
%%time
bundle_user_count_dict = defaultdict(int)

train_user_content_bundle_feats_df = add_content_bundle(train_df)
valid_user_content_bundle_feats_df = add_content_bundle(valid_df)

# train_df = add_content_bundle(train_df)
# valid_df = add_content_bundle(valid_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=98730332.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


CPU times: user 19min 58s, sys: 5.82 s, total: 20min 3s
Wall time: 19min 57s


In [13]:
train_user_content_bundle_feats_df

Unnamed: 0,bundle_user_count_bins
0,0
1,0
2,0
3,0
4,0
...,...
98730327,0
98730328,4
98730329,3
98730330,0


In [14]:
%%time
train_user_content_bundle_feats_df.to_pickle('loop_features/train_user_content_bundle_feats_df_90M.pkl')
valid_user_content_bundle_feats_df.to_pickle('loop_features/valid_user_content_bundle_feats_df_90M.pkl')

CPU times: user 48.9 ms, sys: 172 ms, total: 221 ms
Wall time: 221 ms


In [15]:
%%time
dill.dump(bundle_user_count_dict, open('dicts/bundle_user_count_dict_file', 'wb'))

CPU times: user 6.73 s, sys: 148 ms, total: 6.87 s
Wall time: 6.87 s


## add_attempts

In [19]:
def add_attempts(df):
    attempts_arr = np.zeros(len(df), dtype=np.int8)

    for idx, (user_id, content_type_id, content_id) in enumerate(tqdm(df[['user_id', 'content_type_id', 'content_id']].values)):
        if content_type_id == 0:
            # add feature
            attempts_arr[idx] = attempts_dict[user_id][content_id]
            # update dict
            attempts_dict[user_id][content_id] += 1
        else:
            pass

    attempts_arr = attempts_arr[-train_size:]

    user_feats_df = pd.DataFrame({
        'attempts': attempts_arr
    })

    del attempts_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [20]:
%%time
attempts_dict = defaultdict(lambda: defaultdict(int))

train_user_attempts_feats_df = add_attempts(train_df)
valid_user_attempts_feats_df = add_attempts(valid_df)

# train_df = add_attempts(train_df)
# valid_df = add_attempts(valid_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=98730332.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


CPU times: user 5min 39s, sys: 2.26 s, total: 5min 41s
Wall time: 5min 40s


In [21]:
train_user_attempts_feats_df

Unnamed: 0,attempts
0,0
1,0
2,0
3,0
4,0
...,...
98730327,0
98730328,0
98730329,0
98730330,0


In [22]:
%%time
train_user_attempts_feats_df.to_pickle('loop_features/train_user_attempts_feats_df_90M.pkl')
valid_user_attempts_feats_df.to_pickle('loop_features/valid_user_attempts_feats_df_90M.pkl')

CPU times: user 39.5 ms, sys: 171 ms, total: 210 ms
Wall time: 210 ms


In [23]:
%%time
dill.dump(attempts_dict, open('dicts/attempts_dict_file', 'wb'))

CPU times: user 32min 34s, sys: 29 s, total: 33min 3s
Wall time: 33min 2s


## add_lecture

In [5]:
def add_lecture(df):
    lecture_user_count_arr = np.zeros(len(df), dtype=np.int16)

    for idx, (user_id, content_id, content_type_id) in enumerate(tqdm(df[['user_id', 'content_id', 'content_type_id']].values)):
        # add feature
        lecture_user_count_arr[idx] = lecture_user_count_dict[user_id]
        if content_type_id == 0:
            pass
        else:
            # update dict
            lecture_user_count_dict[user_id] += 1

    lecture_user_count_arr = lecture_user_count_arr[-train_size:]

    user_feats_df = pd.DataFrame({
        'lecture_user_count': lecture_user_count_arr
    })

    del lecture_user_count_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [6]:
%%time
lecture_user_count_dict = defaultdict(int)

train_user_lecture_feats_df = add_lecture(train_df)
valid_user_lecture_feats_df = add_lecture(valid_df)

# train_df = add_lecture(train_df)
# valid_df = add_lecture(valid_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=98730332.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


CPU times: user 5min 21s, sys: 1.39 s, total: 5min 22s
Wall time: 5min 21s


In [7]:
train_user_lecture_feats_df

Unnamed: 0,lecture_user_count
0,0
1,0
2,0
3,0
4,0
...,...
98730327,44
98730328,82
98730329,1
98730330,24


In [8]:
%%time
train_user_lecture_feats_df.to_pickle('loop_features/train_user_lecture_feats_df_90M.pkl')
valid_user_lecture_feats_df.to_pickle('loop_features/valid_user_lecture_feats_df_90M.pkl')

CPU times: user 129 ms, sys: 325 ms, total: 453 ms
Wall time: 452 ms


In [9]:
%%time
dill.dump(lecture_user_count_dict, open('dicts/lecture_user_count_dict_file', 'wb'))

CPU times: user 6.54 s, sys: 144 ms, total: 6.68 s
Wall time: 6.68 s


## add_lecture_part

In [13]:
def add_lecture_part(df):
    lecture_user_part1_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part2_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part3_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part4_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part5_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part6_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_part7_count_arr = np.zeros(len(df), dtype=np.int8)
    
    for idx, (user_id, content_id, content_type_id) in enumerate(tqdm(df[['user_id', 'content_id', 'content_type_id']].values)):
        # add feature
        lecture_user_part1_count_arr[idx] = lecture_user_part_count_dict[user_id][1]
        lecture_user_part2_count_arr[idx] = lecture_user_part_count_dict[user_id][2]
        lecture_user_part3_count_arr[idx] = lecture_user_part_count_dict[user_id][3]
        lecture_user_part4_count_arr[idx] = lecture_user_part_count_dict[user_id][4]
        lecture_user_part5_count_arr[idx] = lecture_user_part_count_dict[user_id][5]
        lecture_user_part6_count_arr[idx] = lecture_user_part_count_dict[user_id][6]
        lecture_user_part7_count_arr[idx] = lecture_user_part_count_dict[user_id][7]
        
        if content_type_id == 0:
            pass
        else:
            lect_part_id = lecture_part_dict[content_id]
            # update dict
            lecture_user_part_count_dict[user_id][lect_part_id] += 1

    lecture_user_part1_count_arr = lecture_user_part1_count_arr[-train_size:]
    lecture_user_part2_count_arr = lecture_user_part2_count_arr[-train_size:]
    lecture_user_part3_count_arr = lecture_user_part3_count_arr[-train_size:]
    lecture_user_part4_count_arr = lecture_user_part4_count_arr[-train_size:]
    lecture_user_part5_count_arr = lecture_user_part5_count_arr[-train_size:]
    lecture_user_part6_count_arr = lecture_user_part6_count_arr[-train_size:]
    lecture_user_part7_count_arr = lecture_user_part7_count_arr[-train_size:]

    user_feats_df = pd.DataFrame({
        'lecture_user_part1_count': lecture_user_part1_count_arr,
        'lecture_user_part2_count': lecture_user_part2_count_arr,
        'lecture_user_part3_count': lecture_user_part3_count_arr,
        'lecture_user_part4_count': lecture_user_part4_count_arr,
        'lecture_user_part5_count': lecture_user_part5_count_arr,
        'lecture_user_part6_count': lecture_user_part6_count_arr,
        'lecture_user_part7_count': lecture_user_part7_count_arr
    })

    del lecture_user_part1_count_arr, lecture_user_part2_count_arr, lecture_user_part3_count_arr, lecture_user_part4_count_arr, lecture_user_part5_count_arr, lecture_user_part6_count_arr, lecture_user_part7_count_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [14]:
%%time
lecture_user_part_count_dict = defaultdict(lambda: defaultdict(int))

train_user_lecture_part_feats_df = add_lecture_part(train_df)
valid_user_lecture_part_feats_df = add_lecture_part(valid_df)

# train_df = add_lecture_part(train_df)
# valid_df = add_lecture_part(valid_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=98730332.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


CPU times: user 6min 57s, sys: 2.75 s, total: 7min
Wall time: 6min 58s


In [15]:
train_user_lecture_part_feats_df

Unnamed: 0,lecture_user_part1_count,lecture_user_part2_count,lecture_user_part3_count,lecture_user_part4_count,lecture_user_part5_count,lecture_user_part6_count,lecture_user_part7_count
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
98730327,4,2,1,3,21,12,1
98730328,1,2,5,1,36,37,0
98730329,0,0,0,0,1,0,0
98730330,1,0,0,4,18,1,0


In [16]:
%%time
train_user_lecture_part_feats_df.to_pickle('loop_features/train_user_lecture_part_feats_df_90M.pkl')
valid_user_lecture_part_feats_df.to_pickle('loop_features/valid_user_lecture_part_feats_df_90M.pkl')

CPU times: user 349 ms, sys: 1.19 s, total: 1.54 s
Wall time: 1.54 s


In [17]:
%%time
dill.dump(lecture_user_part_count_dict, open('dicts/lecture_user_part_count_dict_file', 'wb'))

CPU times: user 19.6 s, sys: 288 ms, total: 19.9 s
Wall time: 19.9 s


## add_lecture_type

In [19]:
def add_lecture_type(df):
    lecture_user_concept_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_solving_question_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_intention_count_arr = np.zeros(len(df), dtype=np.int8)
    lecture_user_starter_count_arr = np.zeros(len(df), dtype=np.int8)

    for idx, (user_id, content_id, content_type_id) in enumerate(tqdm(df[['user_id', 'content_id', 'content_type_id']].values)):
        # add feature
        lecture_user_concept_count_arr[idx] = lecture_user_type_count_dict[user_id]['concept']
        lecture_user_solving_question_count_arr[idx] = lecture_user_type_count_dict[user_id]['solving_question']
        lecture_user_intention_count_arr[idx] = lecture_user_type_count_dict[user_id]['intention']
        lecture_user_starter_count_arr[idx] = lecture_user_type_count_dict[user_id]['starter']

        if content_type_id == 0:
            pass
        else:
            type_id = lecture_type_dict[content_id]
            # update dict
            lecture_user_type_count_dict[user_id][type_id] += 1
        
    lecture_user_concept_count_arr = lecture_user_concept_count_arr[-train_size:]
    lecture_user_solving_question_count_arr = lecture_user_solving_question_count_arr[-train_size:]
    lecture_user_intention_count_arr = lecture_user_intention_count_arr[-train_size:]
    lecture_user_starter_count_arr = lecture_user_starter_count_arr[-train_size:]

    user_feats_df = pd.DataFrame({
        'lecture_user_concept_count': lecture_user_concept_count_arr,
        'lecture_user_solving_question_count': lecture_user_solving_question_count_arr,
        'lecture_user_intention_count': lecture_user_intention_count_arr,
        # 'lecture_user_starter_count': lecture_user_starter_count_arr
    })

    del lecture_user_concept_count_arr, lecture_user_solving_question_count_arr, lecture_user_intention_count_arr, lecture_user_starter_count_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [20]:
%%time
lecture_user_type_count_dict = defaultdict(lambda: defaultdict(int))

train_user_lecture_type_feats_df = add_lecture_type(train_df)
valid_user_lecture_type_feats_df = add_lecture_type(valid_df)

# train_df = add_lecture_type(train_df)
# valid_df = add_lecture_type(valid_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=98730332.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


CPU times: user 6min 12s, sys: 2.24 s, total: 6min 15s
Wall time: 6min 13s


In [21]:
train_user_lecture_type_feats_df

Unnamed: 0,lecture_user_concept_count,lecture_user_solving_question_count,lecture_user_intention_count
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
98730327,30,13,1
98730328,61,19,2
98730329,1,0,0
98730330,23,1,0


In [22]:
%%time
train_user_lecture_type_feats_df.to_pickle('loop_features/train_user_lecture_type_feats_df_90M.pkl')
valid_user_lecture_type_feats_df.to_pickle('loop_features/valid_user_lecture_type_feats_df_90M.pkl')

CPU times: user 147 ms, sys: 509 ms, total: 656 ms
Wall time: 654 ms


In [23]:
%%time
dill.dump(lecture_user_type_count_dict, open('dicts/lecture_user_type_count_dict_file', 'wb'))

CPU times: user 16.1 s, sys: 256 ms, total: 16.4 s
Wall time: 16.4 s


## add_elapsed_time

In [24]:
def add_elapsed_time(df):
    prior_elapsed_time_sum_arr = np.zeros(len(df), dtype=np.float32)
    prior_elapsed_time_diff_arr = np.zeros(len(df), dtype=np.float32)

    for idx, (user_id, content_type_id, prior_question_elapsed_time) in enumerate(tqdm(df[['user_id', 'content_type_id', 'prior_question_elapsed_time']].values)):
        if content_type_id == 0:
            if prior_question_elapsed_time != prior_question_elapsed_time: # check nan
                # add feature
                prior_elapsed_time_sum_arr[idx] = prior_elapsed_time_sum_dict[user_id]
                prior_elapsed_time_diff_arr[idx] = prior_elapsed_time_dict[user_id]
            else:
                # add feature
                prior_elapsed_time_sum_arr[idx] = prior_question_elapsed_time/(1000*60) + prior_elapsed_time_sum_dict[user_id]
                prior_elapsed_time_diff_arr[idx] = prior_question_elapsed_time/(1000*60) - prior_elapsed_time_dict[user_id]
                # update dict
                prior_elapsed_time_sum_dict[user_id] += prior_question_elapsed_time/(1000*60)
                prior_elapsed_time_dict[user_id] = prior_question_elapsed_time/(1000*60)
        else:
            pass
    
    prior_elapsed_time_sum_arr = prior_elapsed_time_sum_arr[-train_size:]
    prior_elapsed_time_diff_arr = prior_elapsed_time_diff_arr[-train_size:]

    user_feats_df = pd.DataFrame({
        'prior_elapsed_time_sum': prior_elapsed_time_sum_arr,
        'prior_elapsed_time_diff': prior_elapsed_time_diff_arr
    })

    del prior_elapsed_time_sum_arr, prior_elapsed_time_diff_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [25]:
%%time
prior_elapsed_time_sum_dict = defaultdict(float)
prior_elapsed_time_dict = defaultdict(float)

train_user_elapsed_time_feats_df = add_elapsed_time(train_df)
valid_user_elapsed_time_feats_df = add_elapsed_time(valid_df)

# train_df = add_elapsed_time(train_df)
# valid_df = add_elapsed_time(valid_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=98730332.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


CPU times: user 7min, sys: 2.43 s, total: 7min 2s
Wall time: 7min 1s


In [26]:
train_user_elapsed_time_feats_df

Unnamed: 0,prior_elapsed_time_sum,prior_elapsed_time_diff
0,0.000000,0.000000
1,0.266667,0.266667
2,0.583333,0.050000
3,0.866667,-0.033333
4,1.150000,0.000000
...,...,...
98730327,0.000000,0.000000
98730328,1374.573853,-0.050000
98730329,475.664490,-0.033333
98730330,0.000000,0.000000


In [27]:
%%time
train_user_elapsed_time_feats_df.to_pickle('loop_features/train_user_elapsed_time_feats_df_90M.pkl')
valid_user_elapsed_time_feats_df.to_pickle('loop_features/valid_user_elapsed_time_feats_df_90M.pkl')

CPU times: user 443 ms, sys: 1.31 s, total: 1.75 s
Wall time: 1.75 s


In [28]:
%%time
dill.dump(prior_elapsed_time_sum_dict, open('dicts/prior_elapsed_time_sum_dict_file', 'wb'))
dill.dump(prior_elapsed_time_dict, open('dicts/prior_elapsed_time_dict_file', 'wb'))

CPU times: user 24.8 s, sys: 516 ms, total: 25.3 s
Wall time: 25.3 s


## add_explanation

In [29]:
def add_explanation(df):
    df['prior_question_had_explanation'] = df['prior_question_had_explanation'].fillna(False).astype(np.int8)
    prior_explanation_sum_arr = np.zeros(len(df), dtype=np.int32)

    for idx, (user_id, content_type_id, prior_question_had_explanation) in enumerate(tqdm(df[['user_id', 'content_type_id', 'prior_question_had_explanation']].values)):
        if content_type_id == 0:
            # add feature
            prior_explanation_sum_arr[idx] = prior_explanation_sum_dict[user_id]
            # update dict
            prior_explanation_sum_dict[user_id] += prior_question_had_explanation
        else:
            pass
    
    prior_explanation_sum_arr = prior_explanation_sum_arr[-train_size:]

    user_feats_df = pd.DataFrame({
        'prior_explanation_sum': prior_explanation_sum_arr
    })

    del prior_explanation_sum_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [30]:
%%time
prior_explanation_sum_dict = defaultdict(int)

train_user_explanation_feats_df = add_explanation(train_df)
valid_user_explanation_feats_df = add_explanation(valid_df)

# train_df = add_explanation(train_df)
# valid_df = add_explanation(valid_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=98730332.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


CPU times: user 6min 16s, sys: 2.19 s, total: 6min 19s
Wall time: 6min 17s


In [31]:
train_user_explanation_feats_df

Unnamed: 0,prior_explanation_sum
0,0
1,0
2,0
3,0
4,0
...,...
98730327,0
98730328,1906
98730329,1280
98730330,0


In [32]:
%%time
train_user_explanation_feats_df.to_pickle('loop_features/train_user_explanation_feats_df_90M.pkl')
valid_user_explanation_feats_df.to_pickle('loop_features/valid_user_explanation_feats_df_90M.pkl')

CPU times: user 175 ms, sys: 697 ms, total: 872 ms
Wall time: 870 ms


In [33]:
%%time
dill.dump(prior_explanation_sum_dict, open('dicts/prior_explanation_sum_dict_file', 'wb'))

CPU times: user 12.7 s, sys: 334 ms, total: 13 s
Wall time: 13 s


## add_task

In [15]:
def add_task(df):
    task_count_arr = np.zeros(len(df), dtype=np.int32)
    task_content_number_arr = np.zeros(len(df), dtype=np.int8)

    for idx, (task_container_id, user_id) in enumerate(tqdm(df[['task_container_id', 'user_id']].values)):
        if task_container_id == task_dict[user_id]:
            if task_count_arr_dict[user_id] > 0:
                # add feature
                task_count_arr[idx] = task_count_arr_dict[user_id] - 1
                task_content_number_arr[idx] = task_content_number_dict[user_id] + 1
                # update dict
                task_content_number_dict[user_id] += 1
        else:
            # add feature
            task_count_arr[idx] = task_count_arr_dict[user_id]
            task_content_number_arr[idx] = 1
            # update dict
            task_count_arr_dict[user_id] += 1
            task_content_number_dict[user_id] = 1
        # update dict
        task_dict[user_id] = task_container_id

    task_count_arr = task_count_arr[-train_size:]
    task_content_number_arr = task_content_number_arr[-train_size:]

    user_feats_df = pd.DataFrame({
        'task_count': task_count_arr,
        'task_content_number': task_content_number_arr
    })

    del task_count_arr, task_content_number_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [16]:
%%time
task_dict = defaultdict(int)
task_count_arr_dict = defaultdict(int)
task_content_number_dict = defaultdict(int)

train_user_task_feats_df = add_task(train_df)
valid_user_task_feats_df = add_task(valid_df)

# train_df = add_task(train_df)
# valid_df = add_task(valid_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=98730332.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


CPU times: user 3min 10s, sys: 1.41 s, total: 3min 12s
Wall time: 3min 11s


In [17]:
train_user_task_feats_df

Unnamed: 0,task_count,task_content_number
0,0,0
1,0,1
2,1,1
3,2,1
4,3,1
...,...,...
98730327,828,1
98730328,1338,1
98730329,1122,1
98730330,887,1


In [18]:
%%time
train_user_task_feats_df.to_pickle('loop_features/train_user_task_feats_df_90M.pkl')
valid_user_task_feats_df.to_pickle('loop_features/valid_user_task_feats_df_90M.pkl')

CPU times: user 188 ms, sys: 860 ms, total: 1.05 s
Wall time: 1.05 s


In [19]:
%%time
dill.dump(task_dict, open('dicts/task_dict_file', 'wb'))
dill.dump(task_count_arr_dict, open('dicts/task_count_arr_dict_file', 'wb'))
dill.dump(task_content_number_dict, open('dicts/task_content_number_dict_file', 'wb'))

CPU times: user 25.5 s, sys: 396 ms, total: 25.9 s
Wall time: 25.9 s


## add_timestamp_diffs

In [14]:
def add_timestamp_diffs(df):
    df = df[-train_size:].reset_index(drop=True) # only features needed in train_size
    
    timestamp_lag1_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_lag2_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_lag3_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_lag4_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_lag5_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_diff_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_diff2_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_diff3_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_diff4_arr = np.zeros(len(df), dtype=np.float32)
    timestamp_diff5_arr = np.zeros(len(df), dtype=np.float32)

    for idx, (timestamp, user_id) in enumerate(tqdm(df[['timestamp', 'user_id']].values)):
        # add feature
        timestamp_lag1_arr[idx] = timestamp_lag1_dict[user_id]
        timestamp_window_id = np.append(timestamp_window_dict[user_id], timestamp_lag1_dict[user_id])
        timestamp_window_id = timestamp_window_id[-5:]
        
        if len(timestamp_window_id) > 1:
            timestamp_diff_arr[idx] = timestamp/(1000*60) - timestamp_lag1_arr[idx]
        if len(timestamp_window_id) > 2:
            timestamp_lag2_arr[idx] = timestamp_window_id[-2]
            timestamp_diff2_arr[idx] = timestamp/(1000*60) - timestamp_lag2_arr[idx]
        if len(timestamp_window_id) > 3:
            timestamp_lag3_arr[idx] = timestamp_window_id[-3]
            timestamp_diff3_arr[idx] = timestamp/(1000*60) - timestamp_lag3_arr[idx]
        if len(timestamp_window_id) > 4:
            timestamp_lag4_arr[idx] = timestamp_window_id[-4]
            timestamp_diff4_arr[idx] = timestamp/(1000*60) - timestamp_lag4_arr[idx]
            if timestamp_window_id[-5] != 0:
                timestamp_lag5_arr[idx] = timestamp_window_id[-5]
                timestamp_diff5_arr[idx] = timestamp/(1000*60) - timestamp_lag5_arr[idx]

        # update dict
        timestamp_window_dict[user_id] = timestamp_window_id
        timestamp_lag1_dict[user_id] = timestamp/(1000*60)

    user_feats_df = pd.DataFrame({
        'timestamp_lag1': timestamp_lag1_arr,
        'timestamp_lag2': timestamp_lag2_arr,
        'timestamp_lag3': timestamp_lag3_arr,
        'timestamp_lag4': timestamp_lag4_arr,
        'timestamp_lag5': timestamp_lag5_arr,
        'timestamp_diff': timestamp_diff_arr,
        'timestamp_diff2': timestamp_diff2_arr,
        'timestamp_diff3': timestamp_diff3_arr,
        'timestamp_diff4': timestamp_diff4_arr,
        'timestamp_diff5': timestamp_diff5_arr
    })

    del timestamp_lag1_arr, timestamp_lag2_arr, timestamp_lag3_arr, timestamp_lag4_arr, timestamp_lag5_arr, timestamp_diff_arr, timestamp_diff2_arr, timestamp_diff3_arr, timestamp_diff4_arr, timestamp_diff5_arr
    gc.collect()

    # df = df.reset_index(drop=True)
    # df = pd.concat([df, user_feats_df], axis=1)

    return user_feats_df

In [15]:
%%time
timestamp_lag1_dict = defaultdict(float)
timestamp_window_dict = defaultdict(list)

train_user_timestamp_diffs_feats_df = add_timestamp_diffs(train_df)
valid_user_timestamp_diffs_feats_df = add_timestamp_diffs(valid_df)

# train_df = add_timestamp_diffs(train_df)
# valid_df = add_timestamp_diffs(valid_df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=98730332.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500000.0), HTML(value='')))


CPU times: user 33min 31s, sys: 10min 2s, total: 43min 34s
Wall time: 30min 19s


In [16]:
train_user_timestamp_diffs_feats_df

Unnamed: 0,timestamp_lag1,timestamp_lag2,timestamp_lag3,timestamp_lag4,timestamp_lag5,timestamp_diff,timestamp_diff2,timestamp_diff3,timestamp_diff4,timestamp_diff5
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.344433,0.000000,0.000000,0.000000,0.000000
2,0.344433,0.000000,0.000000,0.000000,0.000000,0.308433,0.652867,0.000000,0.000000,0.000000
3,0.652867,0.344433,0.000000,0.000000,0.000000,0.317250,0.625683,0.970117,0.000000,0.000000
4,0.970117,0.652867,0.344433,0.000000,0.000000,0.292867,0.610117,0.918550,1.262983,0.000000
...,...,...,...,...,...,...,...,...,...,...
98730327,49742.679688,49741.917969,49741.191406,49735.648438,49734.382812,580.924683,581.686401,582.412964,587.955933,589.221558
98730328,893666.437500,893666.062500,893665.500000,893664.750000,893664.250000,0.820283,1.195283,1.757783,2.507783,3.007783
98730329,17764.958984,17764.453125,17763.734375,17762.912109,17762.626953,0.850232,1.356092,2.074842,2.897107,3.182264
98730330,51983.140625,51977.945312,51898.992188,51897.347656,51895.324219,0.805792,6.001104,84.954231,86.598763,88.622200


In [17]:
%%time
train_user_timestamp_diffs_feats_df.to_pickle('loop_features/train_user_timestamp_diffs_feats_df_90M.pkl')
valid_user_timestamp_diffs_feats_df.to_pickle('loop_features/valid_user_timestamp_diffs_feats_df_90M.pkl')

CPU times: user 1.65 s, sys: 6.96 s, total: 8.61 s
Wall time: 8.59 s


In [18]:
%%time
dill.dump(timestamp_lag1_dict, open('dicts/timestamp_lag1_dict_file', 'wb'))
dill.dump(timestamp_window_dict, open('dicts/timestamp_window_dict_file', 'wb'))

CPU times: user 33 s, sys: 565 ms, total: 33.5 s
Wall time: 33.6 s
