In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
from collections import defaultdict
import dill
from tqdm.notebook import tqdm
import lightgbm as lgb

# Load data

In [None]:
%%time
# Interaction-log columns retained from the pickled frames.
columns = [
    'timestamp',
    'user_id',
    'content_id',
    'content_type_id',
    'task_container_id',
    'answered_correctly',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]

# Load the `cv1` train/validation pickles and narrow each one to `columns`.
train_df = pd.read_pickle('train/cv1_train.pickle')
train_df = train_df[columns]
valid_df = pd.read_pickle('train/cv1_valid.pickle')
valid_df = valid_df[columns]

# Report the (rows, columns) shape of both frames.
print('train_df size: {}'.format(train_df.shape))
print('valid_df size: {}'.format(valid_df.shape))

train_df size: (98730332, 8)
valid_df size: (2500000, 8)
CPU times: user 2.12 s, sys: 6.7 s, total: 8.82 s
Wall time: 9.1 s


In [None]:
# Mean of the non-missing `prior_question_elapsed_time` values in the training frame.
# NaNs are dropped first because ndarray.mean() would otherwise propagate them.
prior_question_elapsed_time_mean = train_df['prior_question_elapsed_time'].dropna().values.mean()
# Bare expression: shows the computed mean as the cell output.
prior_question_elapsed_time_mean

25439.41

# Basic features

## questions basic features

In [None]:
%%time
questions_df = pd.read_csv('questions.csv')

# tags_number: how many space-separated tags each question carries.
# A missing `tags` value yields NaN from the string accessors and is counted as 0.
questions_df['tags_number'] = (
    questions_df['tags'].str.split(' ').str.len().fillna(0).astype(np.int64)
)

# tags: the first four tags of every question as int16 columns.
# reindex() pins the width to exactly four columns, so the cell no longer
# requires the data to contain exactly six tag positions (the original
# assigned six hard-coded column names and then dropped the last two).
# NOTE(review): 0 is used as the filler for "no tag in this position"; if 0 is
# also a genuine tag id the two cases are indistinguishable - confirm.
tag_columns = ['tags1', 'tags2', 'tags3', 'tags4']
tags_df = (
    questions_df['tags']
    .str.split(' ', expand=True)
    .reindex(columns=range(len(tag_columns)))
)
tags_df.columns = tag_columns
tags_df = tags_df.fillna(0).astype(np.int16)

# Final table: indexed by content_id with bundle_id, part, tags_number, tags1..tags4.
questions_df = (
    pd.concat([questions_df, tags_df], axis=1)
    .drop(columns=['correct_answer', 'tags'])
    .rename(columns={'question_id': 'content_id'})
    .set_index('content_id')
)

CPU times: user 164 ms, sys: 3.94 ms, total: 168 ms
Wall time: 176 ms


In [None]:
%%time
# community: per-question `community` value read from question_cmnts.csv.
# The file is expected to have exactly two columns, renamed positionally here.
cmnts_df = pd.read_csv('basic_&_general_stats_features/question_cmnts.csv')
cmnts_df.columns = ['content_id', 'community']
cmnts_df = cmnts_df.set_index('content_id')

# Drop any `community` column left by a previous execution of this cell so that
# re-running it does not append a duplicate column; on a first run this is a no-op.
questions_df = pd.concat(
    [questions_df.drop(columns=['community'], errors='ignore'), cmnts_df],
    axis=1
)

CPU times: user 4.37 ms, sys: 657 µs, total: 5.03 ms
Wall time: 5.22 ms


In [None]:
# Inspect the assembled per-question feature table.
questions_df

Unnamed: 0_level_0,bundle_id,part,tags_number,tags1,tags2,tags3,tags4,community
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,1,4,51,131,162,38,2
1,1,1,3,131,36,81,0,2
2,2,1,4,131,101,162,92,2
3,3,1,4,131,149,162,29,2
4,4,1,4,131,5,162,38,2
...,...,...,...,...,...,...,...,...
13518,13518,5,1,14,0,0,0,0
13519,13519,5,1,8,0,0,0,1
13520,13520,5,1,73,0,0,0,1
13521,13521,5,1,125,0,0,0,0


In [None]:
# Save the per-question feature table to disk as a pickle.
questions_df.to_pickle('basic_&_general_stats_features/questions_df.pkl')

In [None]:
# content_id -> part lookup built from the question table. `part` is cast to int8
# before conversion; being a defaultdict(int), unknown ids resolve to 0.
content_part_dict = (
    questions_df['part']
    .astype(np.int8)
    .to_dict(into=defaultdict(int))
)

In [None]:
# Serialise the content_id -> part lookup with dill. The context manager
# guarantees the file is flushed and closed; the previous bare open() call
# never closed its handle explicitly.
with open('dicts/content_part_dict_file', 'wb') as dict_file:
    dill.dump(content_part_dict, dict_file)

## lecture basic features

In [None]:
%%time
# Read the lecture metadata, normalise the one `type_of` label that contains a
# space ('solving question' -> 'solving_question'), and index by lecture_id.
lectures_df = (
    pd.read_csv('lectures.csv')
    .replace({'type_of': {'solving question': 'solving_question'}})
    .set_index('lecture_id')
)

CPU times: user 4.26 ms, sys: 4.22 ms, total: 8.48 ms
Wall time: 7.41 ms


In [None]:
# Inspect the lecture metadata table (indexed by lecture_id).
lectures_df

Unnamed: 0_level_0,tag,part,type_of
lecture_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
89,159,5,concept
100,70,1,concept
185,45,6,concept
192,79,5,solving_question
317,156,5,solving_question
...,...,...,...
32535,8,5,solving_question
32570,113,3,solving_question
32604,24,6,concept
32625,142,2,concept


In [None]:
# Save the lecture metadata table to disk as a pickle.
lectures_df.to_pickle('basic_&_general_stats_features/lectures_df.pkl')

In [None]:
# lecture_id -> part (cast to int8 before conversion); unknown ids resolve to 0.
lecture_part_dict = (
    lectures_df['part']
    .astype(np.int8)
    .to_dict(into=defaultdict(int))
)
# lecture_id -> type_of (cast to str); unknown ids resolve to the empty string.
lecture_type_dict = (
    lectures_df['type_of']
    .astype(str)
    .to_dict(into=defaultdict(str))
)

In [None]:
# Serialise both lecture lookups with dill. Each file is opened in a context
# manager so its handle is flushed and closed deterministically; the previous
# bare open() calls never closed their handles explicitly.
with open('dicts/lecture_part_dict_file', 'wb') as part_file:
    dill.dump(lecture_part_dict, part_file)
with open('dicts/lecture_type_dict_file', 'wb') as type_file:
    dill.dump(lecture_type_dict, type_file)

# General stats features

## content stats

In [None]:
def create_content_stats(df):
    """Compute per-question accuracy statistics.

    Rows whose ``answered_correctly`` equals -1 are excluded; the mean and the
    standard deviation of ``answered_correctly`` are then taken per
    ``content_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``content_id`` and ``answered_correctly`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``content_id`` with the columns
        ``answered_correctly_content_mean`` and
        ``answered_correctly_content_std``.
    """
    is_answered = df['answered_correctly'] != -1
    outcomes_by_content = df.loc[is_answered].groupby('content_id')['answered_correctly']
    return outcomes_by_content.agg(
        answered_correctly_content_mean='mean',
        answered_correctly_content_std='std',
    )

In [None]:
%%time
# Per-content_id accuracy mean/std computed on the training frame.
content_df = create_content_stats(train_df)

CPU times: user 6.06 s, sys: 3 s, total: 9.06 s
Wall time: 9.06 s


In [None]:
# Inspect the per-content_id accuracy statistics.
content_df

Unnamed: 0_level_0,answered_correctly_content_mean,answered_correctly_content_std
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.907334,0.289986
1,0.890571,0.312198
2,0.554213,0.497058
3,0.779386,0.414670
4,0.613970,0.486846
...,...,...
13518,0.782936,0.412505
13519,0.564912,0.496059
13520,0.674129,0.468991
13521,0.804483,0.396845


In [None]:
# Save the per-content_id accuracy statistics to disk as a pickle.
content_df.to_pickle('basic_&_general_stats_features/content_df.pkl')

## content_part stats

In [None]:
def create_content_part_stats(df):
    """Compute accuracy statistics per question ``part``.

    Rows whose ``answered_correctly`` equals -1 are excluded. Each remaining
    row is given the ``part`` of its ``content_id`` as looked up in the
    module-level ``questions_df`` (indexed by content id), and the mean and
    standard deviation of ``answered_correctly`` are taken per part.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``content_id`` and ``answered_correctly`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``part`` with the columns
        ``answered_correctly_content_part_mean`` and
        ``answered_correctly_content_part_std``.
    """
    answered = df.loc[df['answered_correctly'] != -1].reset_index(drop=True)

    # Positional lookup: one `part` value per answered row, in row order.
    part_per_row = questions_df[['part']].reindex(answered['content_id'])
    part_per_row.index = answered.index
    answered = pd.concat([answered, part_per_row], axis=1)

    return answered.groupby('part')['answered_correctly'].agg(
        answered_correctly_content_part_mean='mean',
        answered_correctly_content_part_std='std',
    )

In [None]:
%%time
# Per-part accuracy mean/std computed on the training frame.
content_part_df = create_content_part_stats(train_df)

CPU times: user 7.25 s, sys: 4.3 s, total: 11.6 s
Wall time: 11.6 s


In [None]:
# Inspect the per-part accuracy statistics.
content_part_df

Unnamed: 0_level_0,answered_correctly_content_part_mean,answered_correctly_content_part_std
part,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.745595,0.435526
2,0.708819,0.454307
3,0.701768,0.457482
4,0.631863,0.482299
5,0.61047,0.487644
6,0.669613,0.470352
7,0.660317,0.473601


In [None]:
# Save the per-part accuracy statistics to disk as a pickle.
content_part_df.to_pickle('basic_&_general_stats_features/content_part_df.pkl')

## content_bundle stats (new)

In [None]:
def create_content_bundle_count_stats(df):
    """Compute accuracy statistics per bin of a user's running row count.

    Rows whose ``answered_correctly`` equals -1 are excluded. For every
    remaining row, ``bundle_user_count`` is the number of earlier rows of the
    same ``user_id`` (0 for the user's first row). That count is cut into five
    bins and the mean / standard deviation of ``answered_correctly`` is taken
    per bin.

    The original implementation first joined ``bundle_id`` from the global
    ``questions_df``. ``cumcount()`` only numbers rows within each group and
    ignores the selected column, so that join never influenced the result; it
    has been removed, which also drops the dependency on ``questions_df``.
    NOTE(review): if the intent was to count distinct bundles per user, this
    needs a different computation - confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``answered_correctly`` columns, with each
        user's rows in the order in which the count should run.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``bundle_user_count_bins`` (categories 0..4, all of them
        present even when empty) with the columns
        ``answered_correctly_content_bundle_count_mean`` and
        ``answered_correctly_content_bundle_count_std``.
    """
    answered = df[df['answered_correctly'] != -1].reset_index(drop=True)
    answered['bundle_user_count'] = answered.groupby('user_id').cumcount()

    # Bin edges: [0, 30], (30, 125], (125, 487], (487, 1451], (1451, inf).
    bins = [0, 30, 125, 487, 1451, np.inf]
    answered['bundle_user_count_bins'] = pd.cut(
        answered['bundle_user_count'],
        bins=bins,
        labels=[0, 1, 2, 3, 4],
        include_lowest=True,
    )

    # observed=False keeps every bin in the output (the historical default) and
    # makes the choice explicit instead of relying on a changing default.
    content_bundle_count_bins_df = (
        answered
        .groupby('bundle_user_count_bins', observed=False)['answered_correctly']
        .agg(['mean', 'std'])
    )
    content_bundle_count_bins_df.columns = [
        'answered_correctly_content_bundle_count_mean',
        'answered_correctly_content_bundle_count_std',
    ]

    return content_bundle_count_bins_df

In [None]:
%%time
# Accuracy mean/std per bin of each user's running row count, on the training frame.
content_bundle_bins_df = create_content_bundle_count_stats(train_df)

CPU times: user 15.8 s, sys: 7.54 s, total: 23.4 s
Wall time: 23.4 s


In [None]:
# Inspect the per-bin accuracy statistics.
content_bundle_bins_df

Unnamed: 0_level_0,answered_correctly_content_bundle_count_mean,answered_correctly_content_bundle_count_std
bundle_user_count_bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.510532,0.499889
1,0.645442,0.478379
2,0.662036,0.473016
3,0.678817,0.466931
4,0.703084,0.456899


In [None]:
# Save the per-bin accuracy statistics to disk as a pickle.
content_bundle_bins_df.to_pickle('basic_&_general_stats_features/content_bundle_bins_df.pkl')

## content_tags stats

In [None]:
def create_content_tags_stats(df):
    """Compute accuracy statistics per first tag (``tags1``).

    Rows whose ``answered_correctly`` equals -1 are excluded. Each remaining
    row is given the ``tags1``..``tags4`` values of its ``content_id`` as
    looked up in the module-level ``questions_df``; only ``tags1`` is used for
    the grouping.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``content_id`` and ``answered_correctly`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``tags1`` with the columns
        ``answered_correctly_content_tags1_mean`` and
        ``answered_correctly_content_tags1_std``.
    """
    answered = df.loc[df['answered_correctly'] != -1].reset_index(drop=True)

    # Positional lookup: one row of tag columns per answered row, in row order.
    tag_columns = ['tags1', 'tags2', 'tags3', 'tags4']
    tags_per_row = questions_df[tag_columns].reindex(answered['content_id'])
    tags_per_row.index = answered.index
    answered = pd.concat([answered, tags_per_row], axis=1)

    return answered.groupby('tags1')['answered_correctly'].agg(
        answered_correctly_content_tags1_mean='mean',
        answered_correctly_content_tags1_std='std',
    )

In [None]:
%%time
# Per-tags1 accuracy mean/std computed on the training frame.
content_tags1_df = create_content_tags_stats(train_df)

CPU times: user 7.22 s, sys: 4.72 s, total: 11.9 s
Wall time: 11.9 s


In [None]:
# Inspect the per-tags1 accuracy statistics.
content_tags1_df

Unnamed: 0_level_0,answered_correctly_content_tags1_mean,answered_correctly_content_tags1_std
tags1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.000000,
1,0.608464,0.488094
2,0.690732,0.462192
4,0.641899,0.479442
7,0.623096,0.484611
...,...,...
179,0.640975,0.479715
180,0.661048,0.473354
181,0.630708,0.482613
182,0.697525,0.459331


In [None]:
# Save the per-tags1 accuracy statistics to disk as a pickle.
content_tags1_df.to_pickle('basic_&_general_stats_features/content_tags1_df.pkl')

## content_prior_explanation stats

In [None]:
def create_content_prior_explanation_stats(df):
    """Compute per-question accuracy split by ``prior_question_had_explanation``.

    Rows whose ``answered_correctly`` equals -1 are excluded. The mean of
    ``answered_correctly`` is taken per (``content_id``,
    ``prior_question_had_explanation``) pair and pivoted so that each
    explanation value becomes a column. Missing combinations are filled
    with 0.62.

    Note on naming: the two output columns are labelled ``..._wrong_mean`` and
    ``..._correct_mean``, but they hold the first and second value of
    ``prior_question_had_explanation`` in sorted order (False, then True for a
    boolean column) - not wrong/correct answers. The labels are assigned
    positionally, so both values must occur in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``content_id``, ``prior_question_had_explanation`` and
        ``answered_correctly`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``content_id`` with the columns
        ``content_prior_explanation_wrong_mean`` and
        ``content_prior_explanation_correct_mean``.
    """
    answered = df.loc[df['answered_correctly'] != -1]
    group_keys = ['content_id', 'prior_question_had_explanation']

    # One accuracy value per (content_id, explanation flag), pivoted wide.
    accuracy_wide = answered.groupby(group_keys)['answered_correctly'].mean().unstack()
    accuracy_wide.columns = [
        'content_prior_explanation_wrong_mean',
        'content_prior_explanation_correct_mean',
    ]
    return accuracy_wide.fillna(0.62)

In [None]:
%%time
# Per-content_id accuracy split by prior_question_had_explanation, on the training frame.
content_prior_explanation_df = create_content_prior_explanation_stats(train_df)

CPU times: user 7.85 s, sys: 4.26 s, total: 12.1 s
Wall time: 12.1 s


In [None]:
# Inspect the per-content_id accuracy split by prior_question_had_explanation.
content_prior_explanation_df

Unnamed: 0_level_0,content_prior_explanation_wrong_mean,content_prior_explanation_correct_mean
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.826590,0.911650
1,0.817518,0.891975
2,0.491877,0.562045
3,0.685307,0.783396
4,0.567858,0.654478
...,...,...
13518,1.000000,0.782663
13519,0.800000,0.563529
13520,1.000000,0.672500
13521,0.666667,0.805000


In [None]:
# Save the prior-explanation accuracy table to disk as a pickle.
content_prior_explanation_df.to_pickle('basic_&_general_stats_features/content_prior_explanation_df.pkl')

## content_elapsed_time stats

In [None]:
def create_content_elapsed_time_stats(df):
    """Compute per-question mean elapsed times split by answer outcome.

    Rows whose ``answered_correctly`` equals -1 are excluded. A
    ``question_elapsed_time`` column is derived by taking, within each
    ``user_id``, the ``prior_question_elapsed_time`` of the following row
    (NaN for a user's last row). Both time columns are averaged per
    (``content_id``, ``answered_correctly``) pair, pivoted so each outcome
    becomes a column, and divided by ``1000 * 60`` (milliseconds to minutes
    if the inputs are in milliseconds - TODO confirm the input unit).

    The time columns are selected on the GroupBy with a list. The original
    used a bare tuple of names, which pandas deprecated and newer releases
    reject.

    The four output labels are assigned positionally, so both outcomes
    (0 and 1) must occur in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``content_id``, ``answered_correctly`` and
        ``prior_question_elapsed_time`` columns, with each user's rows in
        chronological order.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``content_id`` with the columns
        ``prior_question_elapsed_time_wrong_mean``,
        ``prior_question_elapsed_time_correct_mean``,
        ``question_elapsed_time_wrong_mean`` and
        ``question_elapsed_time_correct_mean``.
    """
    answered = df[df['answered_correctly'] != -1].reset_index(drop=True)

    # The next row's "prior" elapsed time is attributed to the current row.
    answered['question_elapsed_time'] = (
        answered.groupby('user_id')['prior_question_elapsed_time'].shift(-1)
    )

    time_columns = ['prior_question_elapsed_time', 'question_elapsed_time']
    content_elapsed_time_df = (
        answered
        .groupby(['content_id', 'answered_correctly'])[time_columns]
        .agg(['mean'])
        .unstack()
    )
    # Column order after unstack: for each time column, outcome 0 then outcome 1.
    content_elapsed_time_df.columns = [
        'prior_question_elapsed_time_wrong_mean',
        'prior_question_elapsed_time_correct_mean',
        'question_elapsed_time_wrong_mean',
        'question_elapsed_time_correct_mean',
    ]

    return content_elapsed_time_df / (1000 * 60)

In [None]:
%%time
# Per-content_id mean elapsed times split by answer outcome, on the training frame.
content_elapsed_time_df = create_content_elapsed_time_stats(train_df)

  after removing the cwd from sys.path.


CPU times: user 12.1 s, sys: 6.02 s, total: 18.1 s
Wall time: 18.1 s


In [None]:
# Inspect the per-content_id elapsed-time statistics.
content_elapsed_time_df

Unnamed: 0_level_0,prior_question_elapsed_time_wrong_mean,prior_question_elapsed_time_correct_mean,question_elapsed_time_wrong_mean,question_elapsed_time_correct_mean
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.386122,0.362959,0.366613,0.324275
1,0.400535,0.364682,0.368852,0.316512
2,0.400626,0.385810,0.430286,0.392774
3,0.388890,0.388886,0.389679,0.352088
4,0.386138,0.384978,0.383381,0.357598
...,...,...,...,...
13518,0.439595,0.419338,0.316765,0.260594
13519,0.472808,0.411102,0.592076,0.429707
13520,0.406790,0.404412,0.547795,0.485759
13521,0.392578,0.414047,0.372727,0.346446


In [None]:
# Save the per-content_id elapsed-time statistics to disk as a pickle.
content_elapsed_time_df.to_pickle('basic_&_general_stats_features/content_elapsed_time_df.pkl')