In [1]:
%%time

# Column -> dtype mapping handed to pd.read_csv(dtype=...) when loading the
# ktracing CSV exports. "boolean" is the pandas nullable boolean dtype.
dtypes = dict(
    row_id="int64",
    timestamp="int64",
    user_id="int32",
    content_id="int16",
    content_type_id="boolean",
    task_container_id="int16",
    user_answer="int8",
    answered_correctly="int8",
    prior_question_elapsed_time="float32",
    prior_question_had_explanation="boolean",
)
import pandas as pd
import gcsfs
import os

# Location of the GCS service-account key file. Set the KTRACING_GCS_TOKEN
# environment variable to point somewhere else; the original machine-specific
# path is kept only as a fallback so existing setups keep working.
GCS_TOKEN_PATH = os.environ.get(
    "KTRACING_GCS_TOKEN",
    r"C:\Users\meipaopao\PycharmProjects\ktracing_gcs.json",
)
fs = gcsfs.GCSFileSystem(token=GCS_TOKEN_PATH)

# Alternative: read the CSV export straight from the bucket instead of the
# local pickle.
#with fs.open('gs://ktracing/train_v0-000000000000.csv') as f:
#    df = pd.read_csv(f, dtype=dtypes)

# NOTE: unpickling can execute arbitrary code, so only load a train_v0.pkl
# that you created yourself (presumably a local cache of the CSV above).
df = pd.read_pickle('train_v0.pkl')
print(df.head())

     row_id    timestamp    user_id  content_id  content_type_id  \
0   8397970   4262102706  182409185        6453            False   
1  20812912    517620787  446779249        2559            False   
2  36137884    806437369  771671654        6566            False   
3   8965114  40535648423  195283080        6191            False   
4  19657114       943355  422291583        7217            False   

   task_container_id  user_answer  answered_correctly  \
0                540            0                   0   
1                109            0                   0   
2                476            0                   0   
3                127            0                   0   
4                 15            0                   0   

   prior_question_elapsed_time  prior_question_had_explanation  
0                       3000.0                            True  
1                      31667.0                            True  
2                      37250.0                       

In [2]:

from embedder.regression import Embedder
from embedder.preprocessing import (categorize,
     pick_emb_dim, encode_categorical, get_embed_df)

def summary_statistics(train):
    """Aggregate ``answered_correctly`` per content and per user.

    Args:
        train: DataFrame with ``content_id``, ``user_id`` and
            ``answered_correctly`` columns.

    Returns:
        ``(results_c, results_u)``: ``results_c`` is indexed by ``content_id``
        with one column ``answered_correctly_content`` (mean); ``results_u`` is
        indexed by ``user_id`` with columns ``answered_correctly_user`` (mean)
        and ``sum``.

    Side effects:
        Writes both frames to ``results_c.pkl`` and ``results_u.pkl`` in the
        current working directory, overwriting any existing files.
    """
    def _label_stats(key, aggregations, column_names):
        # Group the label by `key`, then flatten the (label, agg) MultiIndex
        # columns produced by a list-style agg into plain names.
        stats = train[[key, 'answered_correctly']].groupby([key]).agg(aggregations)
        stats.columns = column_names
        return stats

    per_content = _label_stats('content_id', ['mean'], ["answered_correctly_content"])
    per_user = _label_stats('user_id', ['mean', 'sum'], ["answered_correctly_user", 'sum'])

    for frame, path in ((per_content, 'results_c.pkl'), (per_user, 'results_u.pkl')):
        frame.to_pickle(path)

    return per_content, per_user

def get_embdded_df(data, target='answered_correctly', cols=['content_id']):
    """Fit an ``Embedder`` on the categorical columns ``cols`` against ``target``.

    Args:
        data: DataFrame containing ``target`` and every column in ``cols``.
        target: Name of the label column; it is dropped from the inputs and
            used as the fit target.
        cols: Categorical columns to embed.

    Returns:
        ``(embeddings, encoders)``: whatever ``Embedder.get_embeddings()``
        returns, and the encoders returned by ``encode_categorical``.
    """
    # Embedding sizes are chosen by the embedder helpers, capped at 20 dims.
    categorical_info = categorize(data, cols=cols)
    dims_by_column = pick_emb_dim(categorical_info, max_dim=20)

    inputs = data.drop([target], axis=1)
    labels = data[target]
    encoded_inputs, encoders = encode_categorical(inputs, cols=cols)

    # Single training epoch; only the encoded categorical columns are fed in.
    model = Embedder(dims_by_column, model_json=None)
    model.fit(encoded_inputs[cols], labels, epochs=1)
    return model.get_embeddings(), encoders

def add_embedding(data, encoders, embeddings):
    """Return ``get_embed_df(data, encoders, embeddings)``.

    Thin wrapper around the embedder preprocessing helper; ``encoders`` and
    ``embeddings`` are the values returned by ``get_embdded_df``.
    """
    embedded = get_embed_df(data, encoders, embeddings)
    return embedded

def get_train_data(df, target='answered_correctly', results_c=None, results_u=None):
    """Build the feature frame ``X`` and label frame ``Y`` for question rows.

    Rows where ``content_type_id`` is not False are dropped, then per-user and
    per-content accuracy statistics are left-joined onto the remaining rows.

    Args:
        df: Raw interaction frame with ``content_type_id``, ``user_id``,
            ``content_id`` and the ``target`` column.
        target: Name of the label column.
        results_c: Optional per-content statistics, indexed by ``content_id``
            with column ``answered_correctly_content``. Pass the frame
            computed on the training data when preparing validation/test
            data, so features are not derived from that data's own labels.
        results_u: Optional per-user statistics, indexed by ``user_id`` with
            columns ``answered_correctly_user`` and ``sum``.

    Returns:
        ``(X, Y)``: ``X`` holds every column except ``target`` with missing
        values filled (0.5 for the two accuracy columns, 0 elsewhere); ``Y`` is
        a single-column DataFrame holding ``target``.

    Note:
        When either statistics frame is omitted, both are recomputed from
        ``df`` via ``summary_statistics``, which aggregates the
        ``answered_correctly`` column and overwrites ``results_c.pkl`` /
        ``results_u.pkl``.
    """
    train = df[df.content_type_id == False]
    if results_c is None or results_u is None:
        computed_c, computed_u = summary_statistics(train)
        results_c = computed_c if results_c is None else results_c
        results_u = computed_u if results_u is None else results_u

    train = pd.merge(train, results_u, on=['user_id'], how="left")
    train = pd.merge(train, results_c, on=['content_id'], how="left")

    X = train.drop([target], axis=1)
    # Fill via the frame rather than `X[col].fillna(..., inplace=True)`: the
    # chained in-place form does not modify X under pandas copy-on-write, and
    # the blanket fillna(0) below would then set unseen users/contents to 0.
    X = X.fillna({'answered_correctly_user': 0.5, 'answered_correctly_content': 0.5})
    X = X.fillna(0)
    # Use `target` here too so X and Y stay consistent for non-default labels.
    Y = train[[target]]
    return X, Y

In [3]:
%%time

# Build the training features X and labels Y. get_train_data keeps only rows
# where content_type_id is False and joins the per-user / per-content
# accuracy statistics computed from this same frame.
X, Y = get_train_data(df)

print(X.head())

     row_id    timestamp    user_id  content_id  content_type_id  \
0   8397970   4262102706  182409185        6453            False   
1  20812912    517620787  446779249        2559            False   
2  36137884    806437369  771671654        6566            False   
3   8965114  40535648423  195283080        6191            False   
4  19657114       943355  422291583        7217            False   

   task_container_id  user_answer  prior_question_elapsed_time  \
0                540            0                       3000.0   
1                109            0                      31667.0   
2                476            0                      37250.0   
3                127            0                      66000.0   
4                 15            0                      45000.0   

   prior_question_had_explanation  answered_correctly_user    sum  \
0                            True                 0.575758   38.0   
1                            True                 0.6500

In [4]:
%%time 

# Fit the content_id embedding on rows where content_type_id is False
# (get_embdded_df trains for a single epoch), then attach the embedding
# columns to the training features.
embeddings, encoders = get_embdded_df(df[df.content_type_id == False], cols=['content_id'])
# NOTE(review): get_embed_df is an external helper; confirm it keeps X's row
# count and order, since X_embedded is later fitted against Y row by row.
X_embedded = add_embedding(X,encoders, embeddings)

Wall time: 4min 49s


In [5]:
# Sanity checks: shape of the content_id embedding matrix, the raw frame,
# and the embedded feature frame, followed by the final column names.
print(embeddings['content_id'].shape)
print(df.shape)
print(X_embedded.shape)
print(X_embedded.columns.to_list())


(13500, 20)
(6834063, 10)
(6700354, 31)
['row_id', 'timestamp', 'user_id', 'content_type_id', 'task_container_id', 'user_answer', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'answered_correctly_user', 'sum', 'answered_correctly_content', 'embed_content_id0', 'embed_content_id1', 'embed_content_id2', 'embed_content_id3', 'embed_content_id4', 'embed_content_id5', 'embed_content_id6', 'embed_content_id7', 'embed_content_id8', 'embed_content_id9', 'embed_content_id10', 'embed_content_id11', 'embed_content_id12', 'embed_content_id13', 'embed_content_id14', 'embed_content_id15', 'embed_content_id16', 'embed_content_id17', 'embed_content_id18', 'embed_content_id19']


In [6]:
%%time

import joblib
import lightgbm as lgb
import numpy as np

# Model inputs: the three accuracy statistics plus every embedding column
# (columns whose name starts with 'embed_').
features = ['answered_correctly_user', 'sum', 'answered_correctly_content'] + [
    col for col in X_embedded.columns if col.startswith('embed_')
]

# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is > 0. It is left
# unset here, so subsample=0.5 is presumably ignored -- confirm the intent.
param = {'num_leaves': 50, 'learning_rate': 0.1, 'subsample_for_bin': 130000, 'min_child_samples': 470, 'reg_alpha': 0.5,
         'reg_lambda': 0.26, 'subsample': 0.5, 'is_unbalance': False, 'n_estimators': 1000, 'objective': 'binary', 'random_state': 126}

model = lgb.LGBMClassifier(**param)
# Y is a single-column DataFrame; flatten it to the 1-D label vector the
# estimator expects instead of passing a column vector.
model.fit(X_embedded[features], Y.values.ravel())

print(df.head())

# save model
joblib.dump(model, 'lgb_with_embed_v0.pkl')


  return f(**kwargs)


     row_id    timestamp    user_id  content_id  content_type_id  \
0   8397970   4262102706  182409185        6453            False   
1  20812912    517620787  446779249        2559            False   
2  36137884    806437369  771671654        6566            False   
3   8965114  40535648423  195283080        6191            False   
4  19657114       943355  422291583        7217            False   

   task_container_id  user_answer  answered_correctly  \
0                540            0                   0   
1                109            0                   0   
2                476            0                   0   
3                127            0                   0   
4                 15            0                   0   

   prior_question_elapsed_time  prior_question_had_explanation  
0                       3000.0                            True  
1                      31667.0                            True  
2                      37250.0                       

['lgb_with_embed_v0.pkl']

In [7]:
%%time

# Read the validation CSV export from the ktracing bucket through the gcsfs
# filesystem, using the same dtypes mapping as the training data.
with fs.open('ktracing/validation_v0-000000000000.csv') as f:
    df_val = pd.read_csv(f, dtype=dtypes)
print(df_val.head())


     row_id    timestamp    user_id  content_id  content_type_id  \
0  26744957  15641556504  570755459        6732            False   
1  14035461  13777137344  303797635        2695            False   
2   8715551  11729193944  189628710        5450            False   
3  28104169   3851379344  600053766        7147            False   
4  16388976   1828721641  352075120        7676            False   

   task_container_id  user_answer  answered_correctly  \
0               3033            0                   0   
1               1175            0                   0   
2                164            0                   0   
3                828            0                   0   
4                265            0                   0   

   prior_question_elapsed_time  prior_question_had_explanation  
0                      38500.0                            True  
1                       4333.0                            True  
2                      49000.0                       

In [8]:
# Disabled persistence of the validation frame and embedding artifacts.
# NOTE(review): to_pickle is a pandas method; embeddings is indexed by
# 'content_id' in an earlier cell, so it is presumably a dict without
# to_pickle -- confirm the types (or use joblib.dump) before re-enabling.
# df_val.to_pickle('validation_v0.pkl')
# encoders.to_pickle('encoders_content_id_v0.pkl')
# embeddings.to_pickle('embeddings_content_id_v0.pkl')
# NOTE(review): this mean is over every row of df_val, including rows where
# content_type_id is True, which get_train_data filters out; it is therefore
# not comparable to the label mean of the filtered validation labels.
print(df_val['answered_correctly'].mean())

0.636376067788862


In [9]:

# NOTE(review): get_train_data calls summary_statistics on the frame it is
# given, so the per-user / per-content accuracy features built here are
# computed from df_val's own answered_correctly labels. That leaks the
# validation target into the validation features and likely inflates the
# reported score. The call also overwrites results_c.pkl / results_u.pkl,
# which were written for the training frame. Consider reusing the statistics
# computed on the training frame instead.
X_val, Y_val = get_train_data(df_val)

print(X_val.head())

     row_id    timestamp    user_id  content_id  content_type_id  \
0  26744957  15641556504  570755459        6732            False   
1  14035461  13777137344  303797635        2695            False   
2   8715551  11729193944  189628710        5450            False   
3  28104169   3851379344  600053766        7147            False   
4  16388976   1828721641  352075120        7676            False   

   task_container_id  user_answer  prior_question_elapsed_time  \
0               3033            0                      38500.0   
1               1175            0                       4333.0   
2                164            0                      49000.0   
3                828            0                      73000.0   
4                265            0                      18666.0   

   prior_question_had_explanation  answered_correctly_user    sum  \
0                            True                 0.678851  260.0   
1                            True                 0.7500

In [10]:
from sklearn.metrics import roc_auc_score

# Attach the embedding columns learned on the training data to the validation
# features, then score with the fitted model (probability of class 1).
# NOTE(review): get_embed_df is an external helper; confirm it preserves the
# row count and order of X_val so predictions line up with Y_val.
X_val_embedded = add_embedding(X_val,encoders, embeddings)
Y_pred = model.predict_proba(X_val_embedded[features])[:, 1]

print('score:', roc_auc_score(Y_val, Y_pred))

# Y_val is a single-column DataFrame, so its mean prints as a Series rather
# than a scalar.
print('pred:', Y_pred.mean())
print('true:', Y_val.mean())

score: 0.7933783256717847
pred: 0.6740613346090522
true: answered_correctly    0.667605
dtype: float64


In [11]:
import joblib
# save model
# NOTE(review): this writes the same fitted model object a second time; it was
# already saved as 'lgb_with_embed_v0.pkl' right after training.
joblib.dump(model, 'lgb.pkl')

['lgb.pkl']