In [1]:
import pandas as pd, warnings
import dask.dataframe as dd
import matplotlib.pyplot as plt

warnings.simplefilter('ignore')
%matplotlib inline

In [2]:
%%time
# Lazily load the training log with dask. Explicit dtypes keep the numeric
# columns narrow, read content_id as text, and use the nullable 'boolean'
# dtype for prior_question_had_explanation.
train = dd.read_csv(
    'data/train.csv',
    dtype={
        'row_id': 'int64',
        'timestamp': 'int64',
        'user_id': 'int32',
        'content_id': 'str',
        'content_type_id': 'int8',
        'task_container_id': 'int16',
        'user_answer': 'int8',
        'answered_correctly': 'int8',
        'prior_question_elapsed_time': 'float32',
        'prior_question_had_explanation': 'boolean',
    },
)
# The row_id column is removed immediately after loading.
train = train.drop(['row_id'], axis=1)

def Normalizer(dd, const_names):
    """
    Normalize the given columns of a dataframe and return the dataframe.

    'timestamp' and 'prior_question_elapsed_time' are min-max scaled:
    (x - min) / (max - min + 1e-7). Every other listed column is
    standardized: (x - mean) / (std + 1e-7). The small epsilon keeps a
    constant column from producing a division by zero (it maps to 0.0).

    Parameters
    ----------
    dd : dataframe
        Frame whose columns are rescaled. The columns are reassigned on the
        object that is passed in, and that same object is returned.
        NOTE: inside this function the parameter name shadows the
        `dask.dataframe` module alias `dd`.
    const_names : iterable of str
        Names of the columns to normalize.

    Returns
    -------
    The input dataframe with the listed columns replaced by their
    normalized values.
    """
    eps = 1e-7

    def _materialize(value):
        # Lazy (dask) reductions expose .compute(); eager (pandas) reductions
        # are already concrete numbers and are passed through unchanged.
        return value.compute() if hasattr(value, 'compute') else value

    for name in const_names:
        column = dd[name]
        if name in ('timestamp', 'prior_question_elapsed_time'):
            # Evaluate the minimum once and reuse it for the offset and the range.
            low = column.min()
            span = column.max() - low
            dd[name] = (column - low) / (span + eps)
        else:
            mean, std = _materialize(column.mean()), _materialize(column.std())
            dd[name] = (column - mean) / (eps + std)
    return dd


def description(df):
    """
    Return basic per-column statistics of a pandas DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.

    Returns
    -------
    pandas.DataFrame
        One row per column of `df`, with the columns:
        'name'    - column label,
        'dtypes'  - column dtype,
        'Missing' - number of null entries,
        'nunique' - number of distinct values (nulls are not counted),
        'unique'  - list of the distinct values as returned by
                    Series.unique() (a null, if present, is included).
    """
    summary = pd.DataFrame({
        'name': list(df.columns),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'nunique': df.nunique().values,
    })
    # Collect the unique values column by column. DataFrame.apply is avoided
    # here: when every column yields a list of the same length it returns a
    # DataFrame instead of a Series of lists, and the lookup of column 0 fails.
    uniques = [list(column.unique()) for _, column in df.items()]
    # dtype=object keeps each list as one cell even when all lists have equal length.
    summary['unique'] = pd.Series(uniques, index=summary.index, dtype=object)
    return summary

# Load the example test set and the two lookup tables lazily.
test = dd.read_csv('data/example_test.csv')
lectures = dd.read_csv('data/lectures.csv')
# The id column of questions.csv is `question_id` (the notebook renames it from
# that name later on); the previous key 'questions_id' matched no column, so
# the requested string dtype was never applied.
questions = dd.read_csv('data/questions.csv', dtype={'question_id': 'str'})
# shape[0] is lazy on a dask frame; compute() runs the row count.
print('Train shape: {0}\nTest shape: {1}'.format(train.shape[0].compute(), test.shape[0].compute()))

Train shape: 101230332
Test shape: 104
CPU times: user 2min 57s, sys: 15.7 s, total: 3min 12s
Wall time: 2min 21s


In [3]:
%%time
# description() uses pandas-only operations, so the whole dask frame is
# materialized into a single in-memory pandas DataFrame first.
description(train.compute())

CPU times: user 4min 27s, sys: 32 s, total: 4min 59s
Wall time: 4min 7s


Unnamed: 0,name,dtypes,Missing,nunique,unique
0,timestamp,int64,0,72821015,"[0, 56943, 118363, 131167, 137965, 157063, 176..."
1,user_id,int32,0,393656,"[115, 124, 2746, 5382, 8623, 8701, 12741, 1313..."
2,content_id,object,0,13782,"[5692, 5716, 128, 7860, 7922, 156, 51, 50, 789..."
3,content_type_id,int8,0,2,"[0, 1]"
4,task_container_id,int16,0,10000,"[1, 2, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
5,user_answer,int8,0,5,"[3, 2, 0, 1, -1]"
6,answered_correctly,int8,0,3,"[1, 0, -1]"
7,prior_question_elapsed_time,float32,2351538,3258,"[nan, 37000.0, 55000.0, 19000.0, 11000.0, 5000..."
8,prior_question_had_explanation,boolean,392506,2,"[<NA>, False, True]"


In [4]:
%%time
# Drop every row whose user_answer is -1. A plain boolean mask selects the same
# rows as the earlier per-partition index lookup, without building an index set
# for each partition. reset_index(drop=True) renumbers the rows afterwards.
train = train[train.user_answer != -1].reset_index(drop=True)

# Re-key both lookup tables on a shared `content_id` index so they can be
# joined against train.content_id.
questions = questions.rename(columns={'question_id':'content_id'}).set_index('content_id')
lectures = lectures.rename(columns={'lecture_id':'content_id'}).set_index('content_id')

# The join key is compared as text on both sides.
train.content_id = train.content_id.astype('str')
questions.index = questions.index.astype('str')
lectures.index = lectures.index.astype('str')

train = train.join(lectures, on='content_id')
# `part` already exists after the lectures join, so the column of the same name
# coming from questions receives the `_Qu` suffix (-> `part_Qu`).
train = train.join(questions, on='content_id', rsuffix='_Qu')
# NOTE(review): 'tag', 'part' and 'type_of' appear to be the only columns the
# lectures join contributes and all of them are dropped here; its lasting
# effect seems to be the `_Qu` suffix above. Confirm before removing the join.
train = train.drop(['tag', 'part', 'type_of', 'content_type_id', 'correct_answer'], axis=1)

train.head(5)

CPU times: user 3.01 s, sys: 63.7 ms, total: 3.07 s
Wall time: 3.03 s


Unnamed: 0,timestamp,user_id,content_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,bundle_id,part_Qu,tags
0,0,115,5692,1,3,1,,,5692,5,151
1,56943,115,5716,2,2,1,37000.0,False,5716,5,168
2,118363,115,128,0,0,1,55000.0,False,128,1,131 149 92
3,131167,115,7860,3,0,1,19000.0,False,7860,1,131 104 81
4,137965,115,7922,4,1,1,11000.0,False,7922,1,131 149 92


In [5]:
# Target column used for model fitting (the trailing comment apparently records
# an alternative target).
dep_var = 'answered_correctly' #'user_answer'
# Columns converted to categoricals via categorize() before modelling.
cat_names = ['part_Qu', 'bundle_id', 'content_id', 'task_container_id', 'tags']
# Numeric / boolean feature columns; only the first two entries are passed to
# Normalizer (see the `const_names[:2]` call sites).
const_names = ['timestamp', 'prior_question_elapsed_time', 'prior_question_had_explanation']

In [6]:
# Convert the listed columns to categoricals (dask's get_dummies works on
# categorical columns with known categories).
train = train.categorize(columns = cat_names)
# One-hot encode part_Qu into `part_<value>` columns.
train = dd.get_dummies(train, columns = ['part_Qu'], prefix = ['part'])
# Remove every row that still has a missing value in any column.
train = train.dropna()
# Rescale timestamp and prior_question_elapsed_time (first two const_names).
train = Normalizer(train, const_names[:2])
# Cast the nullable boolean flag to plain integers; rows with NA were dropped above.
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype(int)

In [7]:
%%time
# Summarize the preprocessed frame; compute() materializes the full dask
# pipeline as an in-memory pandas DataFrame before description() runs.
description(train.compute())

CPU times: user 7min 11s, sys: 33.1 s, total: 7min 44s
Wall time: 6min 17s


Unnamed: 0,name,dtypes,Missing,nunique,unique
0,timestamp,float64,0,70897330,"[6.513296784852509e-07, 1.3538685129787637e-06..."
1,user_id,int32,0,393569,"[115, 124, 2746, 5382, 8623, 8701, 12741, 1313..."
2,content_id,category,0,13522,"[5716, 128, 7860, 7922, 156, 51, 50, 7896, 786..."
3,task_container_id,category,0,10000,"[2, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
4,user_answer,int8,0,4,"[2, 0, 1, 3]"
5,answered_correctly,int8,0,2,"[1, 0]"
6,prior_question_elapsed_time,float32,0,3258,"[0.123333335, 0.18333334, 0.06333333, 0.036666..."
7,prior_question_had_explanation,int64,0,2,"[0, 1]"
8,bundle_id,category,0,9764,"[5716, 128, 7860, 7922, 156, 51, 50, 7896, 786..."
9,tags,category,0,1519,"[168, 131 149 92, 131 104 81, 131 101 162 38, ..."


In [None]:
# NOTE(review): this cell carries no execution count, so it was apparently not
# run when the saved outputs were produced.
# NOTE(review): dask's shuffle() regroups rows into partitions according to the
# `on` key; confirm that this yields the random row order presumably intended.
train = train.shuffle(on=train.index)
#train = train.reset_index(drop=True)

In [8]:
# Hard-coded row count; presumably the size of the preprocessed training frame
# -- TODO confirm and derive it from the data instead.
dim_x = 98878793
# Label-slice the index from 0 up to dim_x // 10000 (= 9887). The previous
# divisor `10e3` is the float 10000.0, which made the slice bound a float label
# on an integer index; integer floor division yields the same value as an int.
df = train.loc[0:dim_x // 10000]

In [None]:
# Preview the first rows of the preprocessed training frame.
train.head()

In [12]:
# List the column labels, including the one-hot `part_*` columns.
train.columns

Index(['timestamp', 'user_id', 'content_id', 'task_container_id',
       'user_answer', 'answered_correctly', 'prior_question_elapsed_time',
       'prior_question_had_explanation', 'bundle_id', 'tags', 'part_5',
       'part_1', 'part_2', 'part_3', 'part_4', 'part_6', 'part_7'],
      dtype='object')

In [13]:
%%time
# NOTE(review): these imports sit in the middle of the notebook; consider moving
# them to the import cell at the top.
from dask_ml.xgboost import XGBRegressor
from dask.distributed import Client
# Start a dask.distributed client with default settings; it is created before
# the fit below. NOTE(review): the client is never closed in this notebook.
client = Client()
#from dask_ml.model_selection import train_test_split

# Columns that are NOT part of the feature list below:
# ['tags', 'correct_answer',  'bundle_id',  'user_id',  'content_id', 'task_container_id']
# Feature columns: the two normalized numeric columns, the explanation flag and
# the one-hot `part_*` columns.
var = ['timestamp', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'part_5',
                                                     'part_1', 'part_2', 'part_3', 'part_4', 'part_6', 'part_7']

# Features and target are taken from `df`, the slice of the training frame.
xgb_train =  df[var]
xgb_label = df[dep_var]

# NOTE(review): the target column holds only 0/1 after preprocessing, yet a
# regressor is fitted; its raw predictions are later written out as scores.
xgb = XGBRegressor()
xgb.fit(xgb_train, xgb_label)

CPU times: user 17.5 s, sys: 2.45 s, total: 20 s
Wall time: 4min 2s


XGBRegressor()

In [14]:
# Exploration aid: list the attributes available on the fitted estimator.
dir(xgb)

['_Booster',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_n_features',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_validate_data',
 'apply',
 'base_score',
 'booster',
 'coef_',
 'colsample_bylevel',
 'colsample_bynode',
 'colsample_bytree',
 'evals_result',
 'evals_result_',
 'feature_importances_',
 'fit',
 'gamma',
 'get_booster',
 'get_num_boosting_rounds',
 'get_params',
 'get_xgb_params',
 'importance_type',
 'intercept_',
 'kwargs',
 'learning_rate',
 'load_model',
 'max_delta_step',
 'max_depth',
 'min_child_weight',
 'missing',
 

In [15]:
# Pair every model feature with its importance score. The table gets its own
# name: it used to be stored in `df`, which overwrote the training sample that
# the model-fitting cell reads, so that cell could not be re-run afterwards.
importances = pd.DataFrame({'value' : abs(xgb.feature_importances_), 'name' : xgb_train.columns.tolist()})
# Keep only features with a non-zero score, most important first.
features = importances.loc[importances.value != 0.0].sort_values('value', ascending=False).reset_index(drop=True)
features

Unnamed: 0,value,name
0,0.352915,prior_question_had_explanation
1,0.305106,part_5
2,0.067282,part_1
3,0.059828,timestamp
4,0.049098,part_2
5,0.041194,prior_question_elapsed_time
6,0.039743,part_4
7,0.031431,part_3
8,0.028679,part_6
9,0.024723,part_7


In [16]:
# Preview the raw example test set before preprocessing.
test.head()

Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


In [17]:
%%time
# Apply the same joins to the test set that were applied to the training data.
test.content_id = test.content_id.astype('str')
test = test.join(lectures, on='content_id')
test = test.join(questions, on='content_id', rsuffix='_Qu')

test = test.drop(['tag', 'part', 'type_of', 'content_type_id', 'correct_answer', 'prior_group_answers_correct',
                  'prior_group_responses'], axis=1)
# NOTE(review): dropna() removes test rows that contain a missing value, so
# those rows receive no prediction further down.
test = test.dropna()
# NOTE(review): the test columns are rescaled with the test set's own min/max,
# not with the statistics of the training data -- confirm this is intended.
test = Normalizer(test, const_names[:2])

test = test.categorize(columns = cat_names)
test = dd.get_dummies(test, columns = ['part_Qu'], prefix = ['part'])

# get_dummies only creates columns for the part values present in this (small)
# frame. Add any feature the model expects but that is missing here as an
# all-zero column so that `test[var]` always resolves.
for _feature in var:
    if _feature not in test.columns:
        test[_feature] = 0

test['prior_question_had_explanation'] = test['prior_question_had_explanation'].astype(int)

description(test.compute())

CPU times: user 248 ms, sys: 34.7 ms, total: 283 ms
Wall time: 643 ms


Unnamed: 0,name,dtypes,Missing,nunique,unique
0,row_id,int64,0,103,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,group_num,int64,0,4,"[0, 1, 2, 3]"
2,timestamp,float64,0,73,"[0.17326858396747905, 0.054853474957678366, 0...."
3,user_id,int64,0,42,"[554169193, 1720860329, 288641214, 1728340777,..."
4,content_id,category,0,102,"[12010, 457, 13262, 6119, 12023, 574, 12043, 7..."
5,task_container_id,category,0,73,"[4427, 240, 266, 162, 4424, 1367, 5314, 532, 3..."
6,prior_question_elapsed_time,float64,0,34,"[0.1834862385321101, 0.1529051987767584, 0.244..."
7,prior_question_had_explanation,int64,0,2,"[1, 0]"
8,bundle_id,category,0,72,"[12010, 457, 13262, 6119, 12023, 574, 12043, 7..."
9,tags,category,0,83,"[90 100 92 29, 143 105 38 29, 96, 54, 2 107 92..."


In [18]:
# Score the preprocessed test rows; predict() is lazy, compute() returns the
# predictions as an in-memory array.
xgb.predict(test[var]).compute()

array([0.6799915 , 0.69161564, 0.5523175 , 0.60088956, 0.68398744,
       0.6954618 , 0.7209495 , 0.7417885 , 0.74133605, 0.6900002 ,
       0.6823852 , 0.6823852 , 0.6823852 , 0.6823852 , 0.6254679 ,
       0.72648275, 0.6954618 , 0.53159076, 0.6964716 , 0.6964716 ,
       0.6964716 , 0.57141596, 0.6814625 , 0.6814625 , 0.6814625 ,
       0.60434484, 0.6902948 , 0.6902948 , 0.6902948 , 0.6611929 ,
       0.6611929 , 0.6611929 , 0.6611929 , 0.5523175 , 0.60414743,
       0.60220283, 0.7067472 , 0.7209495 , 0.72648275, 0.6757931 ,
       0.6757931 , 0.6757931 , 0.6757931 , 0.60414743, 0.52886415,
       0.6954618 , 0.59220475, 0.6813247 , 0.72054535, 0.74133605,
       0.6414521 , 0.5839466 , 0.6924015 , 0.6757931 , 0.6757931 ,
       0.6757931 , 0.6757931 , 0.6954618 , 0.69161564, 0.60220283,
       0.7401168 , 0.57141596, 0.7209495 , 0.60220283, 0.7209495 ,
       0.5766385 , 0.68398744, 0.69322085, 0.60414743, 0.6407553 ,
       0.42247733, 0.72514945, 0.6700618 , 0.6700618 , 0.67006

In [28]:
# Materialize the identifier columns in one pass instead of computing the test
# pipeline separately for each column.
test_ids = test[['row_id', 'group_num']].compute()
predictions = xgb.predict(test[var]).compute()

# NOTE(review): rows removed by the earlier dropna() on the test frame are not
# part of this table (the summary above shows fewer distinct row_id values than
# the test set has rows) -- confirm the submission may omit them.
submission = pd.DataFrame({'row_id': test_ids['row_id'],
                           'answered_correctly': predictions,
                           'group_num': test_ids['group_num']})
# index=False is the documented way to leave the index out of the file.
submission.to_csv('data/submission.csv', index=False)