In [44]:
%%time

dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "boolean"
}
import pandas as pd
import gcsfs
import feather
from embedder.regression import Embedder
from embedder.preprocessing import (categorize,
     pick_emb_dim,  get_embed_df)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import column_or_1d
import numpy as np
import joblib
import pickle


fs = gcsfs.GCSFileSystem(token=r"C:\Users\meipaopao\PycharmProjects\ktracing_gcs.json")

def save_to_feather(file_name="validation-v0-00000000000", output_file_name="validation_v0", max_=30):
    dataset = pd.DataFrame()
    for i in range(max_):
        file_path = f'gs://ktracing/{file_name}{i}.csv'
        if not fs.exists(file_path):
            break
        with fs.open(file_path) as f:
            print(i)
            df = pd.read_csv(f, dtype=dtypes, index_col=False)
            print('current shape:', df.shape)
            dataset = pd.concat([dataset, df])
            print('updated shape:', dataset.shape)
    print('overall shape:', dataset.shape)
    dataset.reset_index(drop=True).to_feather(f'{output_file_name}.feather')
# gs://ktracing/train-v0-000000000000.csv
# https://storage.cloud.google.com/ktracing/train_sample_000000000000.csv?authuser=1
#save_to_feather(file_name="train-v0-00000000000", output_file_name="train_v0")
#save_to_feather(file_name="train_sample_00000000000", output_file_name="train_sample_v0")
# gs://ktracing/train_sample_v0_000000000000.csv
#save_to_feather(file_name="train_sample_v0_00000000000", output_file_name="train_sample_v1")

Wall time: 0 ns


In [52]:
%%time

from tqdm import tqdm as tqdm_notebook
seq_len = 2
# generate train sample indices
def get_sample_indices(df_):
    df = df_[df_.content_type_id==False]
    print(df['answered_correctly'].unique())
    sample_indices = []
    user_indices = []
    df_users = df.groupby('user_id').groups
    for user_idx, start_indices in enumerate(df_users.values()):
        #start_indices = one_df.index
        #sample_indices.append(one_df.index.tolist())
        #user_indices.append(user_idx)
        assert df.iloc[start_indices]['answered_correctly'].nunique()<=2
        for num, curr_index in enumerate(start_indices):
            selected_index = start_indices[max(0,num-seq_len+1):num+1]
            if selected_index[-1] != curr_index:
                print('not matched!')

            #print('y:', df[['answered_correctly']].iloc[curr_index[-1]].values)
            sample_indices.append(selected_index)
            user_indices.append(user_idx)
#             uniques = df_[['answered_correctly']].iloc[curr_index]['answered_correctly'].unique()
#             if len(uniques)>2:
#                 print('unique: ', df_[['answered_correctly']].iloc[curr_index]['answered_correctly'].unique())
    return sample_indices, user_indices

# data load
print('loading data')
train_df = feather.read_dataframe('train_sample_v1.feather')
print('shape:', train_df.shape)
#test_df = feather.read_dataframe('validation_v0.feather')
train_samples, train_users = get_sample_indices(train_df)

print('size:', len(train_samples))

loading data
shape: (201205, 10)
[1 0]
size: 196786
Wall time: 2.07 s


In [53]:
import torch
pd.options.mode.chained_assignment = None
# ds = train_df['user_id'].value_counts().reset_index()
# ds.columns = ['user_id', 'count']
# ds = ds.sort_values(['count'])
# print(ds['user_id'].values[-1])
# df_ = train_df[train_df['user_id'] == 870330384]
def get_mappers(df_, cate_cols):

    mappers_dict = {}
    cate_offset = 1
    for col in (cate_cols):    
        cate2idx = {}
        for v in df_[col].unique():
            if (v != v) | (v == None): continue 
            cate2idx[v] = len(cate2idx)+cate_offset
        mappers_dict[col] = cate2idx    
        df_.loc[:,col] = df_[col].map(cate2idx).fillna(0).astype(int)
        cate_offset += len(cate2idx)
    return mappers_dict, cate_offset

def agg_data(df_, cate_cols, mappers_dict, cont_cols = ['prior_question_elapsed_time','lagged_time']):
    df_.loc[:, 'lagged_time'] = df_[['user_id', 'timestamp']].groupby('user_id')['timestamp'].diff()
    print(df_[['prior_question_elapsed_time','lagged_time']].head())

    for col in (cate_cols):    
        cate2idx = mappers_dict[col]
        df_.loc[:,col] = df_[col].map(cate2idx).fillna(0).astype(int)
        
    for col in cont_cols:
        df_.loc[:, col] = df_[col].fillna(0)
    
    return df_


cate_cols = ['content_id']
cont_cols = ['prior_question_elapsed_time','lagged_time']
print('shape: ', train_df.shape)
mappers_dict, cate_offset = get_mappers(train_df, cate_cols)

train_df = agg_data(train_df, cate_cols, mappers_dict)
print('head:', train_df.head())
print('shape: ', train_df.shape)
torch.save([train_samples, train_users, train_df, mappers_dict, cate_offset, cate_cols, cont_cols],
           'ktracing_train.pt')



# cate_df = train_df[cate_cols]
# indices = train_samples[0]
# print('head:', cate_df.head())
# print(cate_df.iloc[indices].values)

shape:  (201205, 10)
   prior_question_elapsed_time  lagged_time
0                      22000.0          NaN
1                      17000.0          NaN
2                      19000.0          NaN
3                      26000.0          NaN
4                      21000.0          NaN
head:      row_id    timestamp    user_id  content_id  content_type_id  \
0  31408421    514367405  674040280          34            False   
1  24623124   1566292394  526948179          34            False   
2  28758973   1604921708  614907064          34            False   
3   2260794    781984829   47975614          34            False   
4  16864344  14449903373  362401676          34            False   

   task_container_id  user_answer  answered_correctly  \
0                254            0                   1   
1                 33            0                   1   
2                487            0                   1   
3                540            0                   1   
4              

In [12]:
%%time
import feather
df = feather.read_dataframe('validation_v0.feather')
df.shape
print(df.head())

     row_id    timestamp    user_id  content_id  content_type_id  \
0  36988923    734303484  790098971        6940            False   
1  27772579  22000097282  592529342        6566            False   
2  20422501  16366712289  438327923       10742            False   
3  14131213   8242160735  305936298       10032            False   
4  41855831   7195910010  887942868        8447            False   

   task_container_id  user_answer  answered_correctly  \
0                247            0                   0   
1                797            0                   0   
2               1416            0                   0   
3               2199            0                   0   
4                111            0                   0   

   prior_question_elapsed_time  prior_question_had_explanation  
0                       2250.0                            True  
1                      61250.0                            True  
2                      31250.0                       

In [2]:

def summary_statistics(train):
    results_c = train[['content_id','answered_correctly']].groupby(['content_id']).agg(['mean'])
    results_c.columns = ["answered_correctly_content"]
    results_u = train[['user_id','answered_correctly']].groupby(['user_id']).agg(['mean', 'sum'])
    results_u.columns = ["answered_correctly_user", 'sum']
    results_c.to_pickle('results_c.pkl')
    results_u.to_pickle('results_u.pkl')
    return results_c, results_u

def encode_categorical(X,
                       cols=None,
                       categorical_vars=None,
                       copy=True):
    '''
    Encode categorical variables as integers.

    :param X: input DataFrame
    :param categorical_vars: optional, list of categorical variables
    :param copy: optional, whether to modify a copy
    :return: DataFrame, LabelEncoders
    '''
    df = X.copy() if copy else X
    encoders = {}

    if not cols:
        cols = [col for col in X.columns if X[col].dtype == 'object']

    if categorical_vars is None:
        categorical_vars = [col for col in df.columns if col in cols]

    for var in categorical_vars:
        encoders[var] = SafeLabelEncoder()
        encoders[var].fit(df[var])
        df.loc[:, var] = encoders[var].transform(df.loc[:, var])

    return df, encoders

class SafeLabelEncoder(LabelEncoder):
    """An extension of LabelEncoder that will
    not throw an exception for unseen data, but will
    instead return a default value of len(labels)

    Attributes
    ----------

    classes_ : the classes that are encoded
    """

    def transform(self, y):

        check_is_fitted(self, 'classes_')
        y = column_or_1d(y, warn=True)

        unseen = len(self.classes_)

        e = np.array([
                     np.searchsorted(self.classes_, x)
                     if x in self.classes_ else unseen
                     for x in y
                     ])

        if unseen in e:
            self.classes_ = np.array(self.classes_.tolist() + ['unseen'])

        return e
    
def get_embdded_df(data, target='answered_correctly', cols=['content_id']):
    cat_vars = categorize(data, cols=cols)
    embedding_dict = pick_emb_dim(cat_vars, max_dim=20)
    data_x, data_y = data.drop([target],axis=1), data[target]
    data_x_encoded, encoders = encode_categorical(data_x, cols=cols)
    # embedding training
    embedder = Embedder(embedding_dict, model_json=None)
    embedder.fit(data_x_encoded[cols], data_y, epochs=1)
    embeddings = embedder.get_embeddings()
    return embeddings, encoders

def add_embedding(data,encoders, embeddings):
    return get_embed_df(data, encoders, embeddings)

def get_train_data(df, target = 'answered_correctly'):
    train = df[df.content_type_id == False]
    results_c, results_u = summary_statistics(train)

    train = pd.merge(train, results_u, on=['user_id'], how="left")
    train = pd.merge(train, results_c, on=['content_id'], how="left")

    X = train.drop([target], axis=1)
    X['answered_correctly_user'].fillna(0.5,  inplace=True)
    X['answered_correctly_content'].fillna(0.5,  inplace=True)
    X.fillna(0, inplace = True)
    Y = train[["answered_correctly"]]
    return X, Y

In [3]:
%%time 

X, Y = get_train_data(df)
embeddings, encoders = get_embdded_df(df[df.content_type_id == False], cols=['content_id'])
X_embedded = add_embedding(X,encoders, embeddings)

print(embeddings['content_id'].shape)
print(df.shape)
print(X_embedded.shape)
print(X_embedded.columns.to_list())

(13500, 20)
(6834063, 10)
(6700354, 31)
['row_id', 'timestamp', 'user_id', 'content_type_id', 'task_container_id', 'user_answer', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'answered_correctly_user', 'sum', 'answered_correctly_content', 'embed_content_id0', 'embed_content_id1', 'embed_content_id2', 'embed_content_id3', 'embed_content_id4', 'embed_content_id5', 'embed_content_id6', 'embed_content_id7', 'embed_content_id8', 'embed_content_id9', 'embed_content_id10', 'embed_content_id11', 'embed_content_id12', 'embed_content_id13', 'embed_content_id14', 'embed_content_id15', 'embed_content_id16', 'embed_content_id17', 'embed_content_id18', 'embed_content_id19']
Wall time: 4min 52s


In [4]:
%%time

features= [ 'answered_correctly_user', 'sum', 'answered_correctly_content'] +[col for col in X_embedded.columns if col.startswith('embed_')]

param = {'num_leaves': 50, 'learning_rate': 0.1, 'subsample_for_bin': 130000, 'min_child_samples': 470, 'reg_alpha': 0.5, 
         'reg_lambda': 0.26, 'subsample': 0.5, 'is_unbalance': False, 'n_estimators': 1000, 'objective': 'binary', 'random_state': 126}

import lightgbm as lgb

model = lgb.LGBMClassifier(**param)
import numpy as np
model.fit(X_embedded[features], Y)


# save model
joblib.dump(model, 'lgb_with_embed_v0.pkl')

  return f(**kwargs)


Wall time: 3min 49s


['lgb_with_embed_v0.pkl']

In [8]:
%%time

with fs.open('ktracing/validation_v0-000000000001.csv') as f:
    df_val = pd.read_csv(f, dtype=dtypes)
X_val, Y_val = get_train_data(df_val)

from sklearn.metrics import roc_auc_score

X_val_embedded = add_embedding(X_val,encoders, embeddings)


features= [ 'answered_correctly_user', 'sum', 'answered_correctly_content'] + [col for col in X_val_embedded.columns if col.startswith('embed_')]


Y_pred = model.predict_proba(X_val_embedded[features])[:, 1]

print('score:', roc_auc_score(Y_val, Y_pred))
print('pred:', Y_pred.mean())
print('true:', Y_val.mean())

score: 0.7941278924945345
pred: 0.6741167078835231
true: answered_correctly    0.667531
dtype: float64
Wall time: 2min 12s


In [6]:

# joblib.dump(model,open('lgb.pkl', 'wb') )
pickle.dump(df_val,open('validation_v0.pkl', 'wb') )
pickle.dump(encoders,open('encoders_content_id_v0.pkl', 'wb') )
pickle.dump(embeddings,open('embeddings_content_id_v0.pkl', 'wb') )