In [1]:
import re, random, torch, pickle, gc, os, sklearn
import optuna
import lightgbm as lgb
import polars as pl
import pandas as pd
import numpy as np
from pathlib import Path
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
## Sklearn package
from sklearn.linear_model import Lasso, Ridge, RidgeClassifier, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, ExtraTreesClassifier, ExtraTreesRegressor, BaggingClassifier, HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingRegressor, VotingClassifier
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score 
from scipy.stats import skew, kurtosis
import ctypes
libc = ctypes.CDLL("libc.so.6")  # clear the memory
import warnings
import sys
import pyarrow as pa
import json
from itertools import chain, combinations
from random import randint
warnings.filterwarnings("ignore")
pd.options.display.max_rows = 999
pd.options.display.max_colwidth = 99

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = ''
train_logs = pl.scan_csv(data_path + 'train_logs.csv')

In [3]:
DEBUG = False
SEED = 42
N_FOLD = 5



In [4]:
def process_essays(tlogs, tscore):
    essays = {}
    unq = pd.unique(tlogs["id"])
    print(tlogs)
    cnt = 0
    ln = len(unq)
    for id in unq:
        essays[id] = {}
        essays[id]["series"] = tlogs[tlogs["id"] == id]
        cnt += 1

    ix = 0
    if tscore is not None:
        while ix < len(tscore):
            id = tscore.iloc[ix]["id"]
            essays[id]["score"] = tscore.iloc[ix]["score"]
            ix += 1
    return essays

In [5]:
def fourier(seq):
    return np.abs(np.fft.fft(seq))

def get_freq(seq):
    N = len(seq)
    n = np.arange(N)
    freq = n / N
    return freq[1: len(seq) // 2]

def get_power(fr):
    return fr.sum()


def func_entropy(seq):
    if len(seq) == 0:
        return 0
    fr = fourier(seq)
    pwr = get_power(fr)
    pr = fr / pwr
    out = 0
    for val in pr:
        out -= (val * np.log(val)) 
    return out

In [6]:
def get_rbursts(srs):
    bursts = []
    prods = srs["activity"]
    cnt = 0
    for prod in prods:
        if prod != "Input":
            bursts.append(cnt)
            cnt = 0
        else:
            cnt += 1
    bursts.append(cnt)
    return bursts

In [7]:
def process_essays(tlogs, tscore):
    essays = {}
    unq = pd.unique(tlogs["id"])
    cnt = 0
    ln = len(unq)
    print(unq)
    for id in unq:
        print(id)
        essays[id] = {}
        essays[id]["series"] = tlogs[tlogs["id"] == id]
        cnt += 1

    ix = 0
    if tscore is not None:
        while ix < len(tscore):
            id = tscore.iloc[ix]["id"]
            essays[id]["score"] = tscore.iloc[ix]["score"]
            ix += 1
    return essays

In [8]:
def get_pauses(srs):
    pause_lengths = [srs.iloc[0]["down_time"]]
    pause_times = [0]
    ix = 1
    while ix < len(srs) - 1:
        start = srs.iloc[ix]["up_time"]
        end = srs.iloc[ix+1]["down_time"]
        
        while end - start  <= 0:
            ix += 1
            if ix == len(srs) - 1:
                break
            end = srs.iloc[ix+1]["down_time"]
            start = srs.iloc[ix]["up_time"]
        if end - start > 2000:
            pause_lengths.append(end - start)
            pause_times.append(srs.iloc[ix]["up_time"])
        ix += 1
    return (pause_lengths, pause_times)

In [9]:
def get_pauses(srs):
    pause_lengths = [srs.iloc[0]["down_time"]]
    pause_times = [0]
    ix = 1
    while ix < len(srs) - 1:
        start = srs.iloc[ix]["up_time"]
        end = srs.iloc[ix+1]["down_time"]
        
        while end - start  <= 0:
            ix += 1
            if ix == len(srs) - 1:
                break
            end = srs.iloc[ix+1]["down_time"]
            start = srs.iloc[ix]["up_time"]
        if end - start > 2000:
            pause_lengths.append(end - start)
            pause_times.append(srs.iloc[ix]["up_time"])
        ix += 1
    return (pause_lengths, pause_times)

In [10]:
def get_words(srs):
    prods = srs[srs["activity"] == "Input"]
    chars = prods["down_event"]
    words = []
    word = []
    lens = []
    for char in chars:
        if char != "Space":
            word.append(char)
        else:
            st = "".join(word)
            st = st.replace(".", "").replace("?", "").replace("!", "").replace(",", "")
            words.append(st)
            lens.append(len(st))
            word = []
    st = "".join(word)
    st = st.replace(".", "").replace("?", "").replace("!", "").replace(",", "")
    words.append(st)
    lens.append(len(st))
    word = []
    return (words, lens)

def get_sentences(srs):
    prods = srs[srs["activity"] == "Input"]
    chars = prods["down_event"]
    sents = []
    sent = []
    lens = []
    for char in chars:
        if char not in [".", "?", "!", "Space"]:
            sent.append(char)
        elif char == "Space":
            sent.append(" ")
        else:
            st = "".join(sent)
            st = st.replace(".", "").replace("?", "").replace("!", "").replace(",", "")
            sents.append(sent)
            lens.append(len(sent))
            sent = []
    st = "".join(sent)
    st = st.replace(".", "").replace("?", "").replace("!", "").replace(",", "")
    sents.append(sent)
    lens.append(len(sent))
    
    return (sents, lens)

def get_paragraphs(srs):
    prods = srs[srs["activity"] == "Input"]
    chars = prods["down_event"]
    paras = 0
    for char in chars:
        if char != "Enter":
            pass
        else:
            paras += 1
    paras += 1
    return paras

def get_rbursts(srs):
    bursts = []
    prods = srs["activity"]
    cnt = 0
    for prod in prods:
        if prod != "Input":
            bursts.append(cnt)
            cnt = 0
        else:
            cnt += 1
    bursts.append(cnt)
    return bursts

In [11]:
def get_essay_moments(essays):
    for key in essays.keys():
        print(key)
        val = essays[key]["series"]
        input_val = val[val["activity"] == "Input"]
        non_input_val = val[val["activity"] != "Input"]
        #val = val[val["activity"] == "Input"]
        print(key)
        essays[key]["at_mean"] = val["action_time"].mean()
        essays[key]["input_mean"] = input_val["action_time"].mean()
        essays[key]["ninput_mean"] = non_input_val["action_time"].mean()

        essays[key]["at_median"] = val["action_time"].median()
        essays[key]["input_median"] = input_val["action_time"].median()
        essays[key]["ninput_median"] = non_input_val["action_time"].median()
    
        essays[key]["at_std"] = val["action_time"].std()
        essays[key]["input_std"] = input_val["action_time"].std()
        essays[key]["ninput_std"] = non_input_val["action_time"].std()
    
        essays[key]["at_skew"] = val["action_time"].skew()
        essays[key]["input_skew"] = input_val["action_time"].skew()
        essays[key]["ninput_skew"] = non_input_val["action_time"].skew()
    
        essays[key]["at_kurtosis"] = val["action_time"].kurtosis()
        essays[key]["input_kurtosis"] = input_val["action_time"].kurtosis()
        essays[key]["ninput_kurtosis"] = non_input_val["action_time"].kurtosis()
    
        essays[key]["entropy"] = func_entropy(val["action_time"])
        essays[key]["input_entropy"] = func_entropy(input_val["action_time"])
        essays[key]["ninput_entropy"] = func_entropy(non_input_val["action_time"])
    return essays

In [12]:
def get_other_moments(essays):
    for key in essays.keys():
        print(key)
        val = essays[key]["series"]
        ts = []
        mn = []
        fmn = []
        for ix in range(len(val)):
            ts.append(val.iloc[ix]["down_time"] // 10000)
            mn.append(val.iloc[ix]["down_time"] // 60000)

        val["10s"] = ts
        val["mn"] = mn

        ival = val[val["activity"] == "Input"]
        nival = val[val["activity"] != "Input"]

        rval = val[val["activity"] == "Remove/Cut"]
        essays[key]["removal_ratio"] = (1 + len(rval)) / (1 + len(val))
        essays[key]["num_paras"] = get_paragraphs(val)
        pauses, pause_times = get_pauses(val)
        pbursts = []
        for ix in range(len(pause_times)):
            if ix == 0:
                start = 0
            else:
                start = pause_times[ix-1]
            end = pause_times[ix]
            interval = val[(val["down_time"] >= start) & (val["down_time"] <= end)]
            pbursts.append(len(interval[interval["activity"] == "Input"]))

        _, word_lens = get_words(val)
        _, sent_lens = get_sentences(val)

        word_lens = pd.Series(word_lens)
        sent_lens = pd.Series(sent_lens)

        rbursts = pd.Series(get_rbursts(val))
        
        pbursts = pd.Series(pbursts)
        essays[key]["pburst_mean"] = pbursts.mean()
        essays[key]["pburst_median"] = pbursts.median()
        essays[key]["pburst_std"] = pbursts.std()
        essays[key]["pburst_skew"] = pbursts.skew()
        essays[key]["pburst_kurt"] = pbursts.kurtosis()
        essays[key]["pburst_ent"] = func_entropy(pbursts)

        essays[key]["rburst_mean"] = rbursts.mean()
        essays[key]["rburst_median"] = rbursts.median()
        essays[key]["rburst_std"] = rbursts.std()
        essays[key]["rburst_skew"] = rbursts.skew()
        essays[key]["rburst_kurt"] = rbursts.kurtosis()
        essays[key]["rburst_ent"] = func_entropy(rbursts)
        
        essays[key]["wlen_mean"] = word_lens.mean()
        essays[key]["wlen_median"] = word_lens.median()
        essays[key]["wlen_std"] = word_lens.std()
        essays[key]["wlen_skew"] = word_lens.skew()
        essays[key]["wlen_kurt"] = word_lens.kurtosis()
        essays[key]["wlen_ent"] = func_entropy(word_lens)

        essays[key]["slen_mean"] = sent_lens.mean()
        essays[key]["slen_median"] = sent_lens.median()
        essays[key]["slen_std"] = sent_lens.std()
        essays[key]["slen_skew"] = sent_lens.skew()
        essays[key]["slen_kurt"] = sent_lens.kurtosis()
        essays[key]["slen_ent"] = func_entropy(sent_lens)
        
        pauses = pd.DataFrame({"pause" : pauses})
        total_pause = pauses["pause"].sum()
        total_pauses = len(pauses["pause"])
        pauses_10s = []
        pauses_mn = []
        tmx = pause_times[-1] // 10000
        fmx = pause_times[-1] // 60000
        pause_times = pd.DataFrame({"pt" : pause_times})
        for t in range(tmx):
            pauses_10s.append(len(pause_times[pause_times["pt"] // 10000 == t]))
        for t in range(fmx):
            pauses_mn.append(len(pause_times[pause_times["pt"] // 60000 == t]))
        essays[key]["pause_total"] = total_pause
        essays[key]["total_pauses"] = total_pauses 
        essays[key]["pause_mean"] = pauses["pause"].mean()
        essays[key]["pause_median"] = pauses["pause"].median()
        essays[key]["pause_std"] = pauses["pause"].std()
        essays[key]["pause_skew"] = pauses["pause"].skew()
        essays[key]["pause_kurt"] = pauses["pause"].kurt()
        essays[key]["pause_ent"] = func_entropy(pauses["pause"])
        essays[key]["input_ratio"] = (1 + len(ival)) / (1 + len(nival))
        dot_val = val[val["down_event"] == "."]
        comma_val = val[val["down_event"] == ","]
        space_val = val[val["down_event"] == "Space"]
        backspace_val = val[val["down_event"] == "Backspace"]
        exclamation_val = val[val["down_event"] == "!"]
        quote_val = val[(val["down_event"] == "'") | (val["down_event"] == '"')]
        essays[key]["sentence_ratio"] = len(dot_val) / len(val)
        essays[key]["comma_ratio"] = len(comma_val) / len(val)
        essays[key]["backspace_ratio"] = len(backspace_val) / len(val)
        essays[key]["exclamation_ratio"] = len(backspace_val) / len(val)
        essays[key]["space_ratio"] = len(space_val) / len(val)
        essays[key]["comma_sentence_ratio"] = len(comma_val) / (1 + len(dot_val))
        essays[key]["backspace_sentence_ratio"] = len(backspace_val) / (1 + len(dot_val))
        essays[key]["word_sentence_ratio"] = len(space_val) / (1 + len(dot_val))
        essays[key]["quote_ratio"] = len(quote_val) / len(val)
    
        tscnt = []
        itscnt = []
        rtscnt = []
        nitscnt = []
        ts_words = []
        ts_backspace = []
        ts_pause_cnt = []
        mncnt = []
        imncnt = []
        rmncnt = []
        nimncnt = []
        mn_words = []
        mn_backspace = []
        intsratio = []
        inmnratio = []
        rtsratio = []
        rmnratio = []
        mn_pause_cnt = []
        tsmax = val["10s"].max()
        mnmax = val["mn"].max()

        for t in range(tsmax):
            tm = len(val[val["10s"] == t])
            itm = len(ival[ival["10s"] == t])
            nitm = len(nival[nival["10s"] == t])
            rtm = len(rval[rval["10s"] == t])
        
            words = len(space_val[space_val["10s"] == t])
            backspace = len(backspace_val[backspace_val["10s"] == t])
            iniratio = (1 + itm) / (1 + nitm)
            rmratio = rtm / (1 + tm)
            tscnt.append(tm)
            rtscnt.append(rtm)
            itscnt.append(itm)
            nitscnt.append(nitm)
            intsratio.append(iniratio)
            rtsratio.append(rmratio)
            ts_words.append(words)
            ts_backspace.append(backspace)

        for t in range(mnmax):
            tm = len(val[val["mn"] == t])
            itm = len(ival[ival["mn"] == t])
            rtm = len(rval[rval["mn"] == t])
            nitm = len(nival[nival["mn"] == t])
            words = len(space_val[space_val["mn"] == t])
            backspace = len(backspace_val[backspace_val["mn"] == t])
            iniratio = (1 + itm) / (1 + nitm)
            rmratio = rtm / (1 + tm)
            mncnt.append(tm)
            imncnt.append(itm)
            rmncnt.append(rtm)
            nimncnt.append(nitm)
            inmnratio.append(iniratio)
            mn_words.append(words)
            mn_backspace.append(backspace)
            rmnratio.append(rmratio)

  
        ts_pause = pd.DataFrame({"10s" : pauses_10s})  
        tscnt = pd.DataFrame({"10s" :tscnt})
        rtscnt = pd.DataFrame({"10s" :rtscnt})
        itscnt = pd.DataFrame({"10s" :itscnt})
        nitscnt = pd.DataFrame({"10s" :nitscnt})
        intsratio = pd.DataFrame({"10s" :intsratio})
        rtsratio = pd.DataFrame({"10s" :rtsratio})
        ts_words = pd.DataFrame({"10s" : ts_words})
        ts_backspace = pd.DataFrame({"10s" : ts_backspace})

        mn_pause = pd.DataFrame({"mn" : pauses_mn})
        mncnt = pd.DataFrame({"mn" :mncnt})
        rmncnt = pd.DataFrame({"mn" :rmncnt})
        imncnt = pd.DataFrame({"mn" :imncnt})
        nimncnt = pd.DataFrame({"mn" :nimncnt})
        inmnratio = pd.DataFrame({"mn" :inmnratio})
        rmnratio = pd.DataFrame({"mn" :rmnratio})
        mn_words = pd.DataFrame({"mn" : mn_words})
        mn_backspace = pd.DataFrame({"mn" : mn_backspace})
    
        essays[key]["10s_entropy"] = func_entropy(tscnt["10s"])
        essays[key]["10s_pause_entropy"] = func_entropy(ts_pause["10s"])
        essays[key]["10s_words_entropy"] = func_entropy(ts_words["10s"])
        essays[key]["10s_backspace_entropy"] = func_entropy(ts_backspace["10s"])
        essays[key]["10s_input_entropy"] = func_entropy(itscnt["10s"])
        essays[key]["10s_removal_entropy"] = func_entropy(rtscnt["10s"])
        essays[key]["10s_noninput_entropy"] = func_entropy(nitscnt["10s"])
        essays[key]["10s_input_ratio_entropy"] = func_entropy(intsratio["10s"])
        essays[key]["10s_removal_ratio_entropy"] = func_entropy(rtsratio["10s"])
    
        essays[key]["mn_entropy"] = func_entropy(mncnt["mn"])
        essays[key]["mn_pause_entropy"] = func_entropy(mn_pause["mn"])
        essays[key]["mn_words_entropy"] = func_entropy(mn_words["mn"])
        essays[key]["mn_backspace_entropy"] = func_entropy(mn_backspace["mn"])
        essays[key]["mn_input_entropy"] = func_entropy(imncnt["mn"])
        essays[key]["mn_removal_entropy"] = func_entropy(rmncnt["mn"])
        essays[key]["mn_noninput_entropy"] = func_entropy(nimncnt["mn"])
        essays[key]["mn_input_ratio_entropy"] = func_entropy(inmnratio["mn"])
        essays[key]["mn_removal_ratio_entropy"] = func_entropy(rmnratio["mn"])

        essays[key]["10s_mean"] = tscnt["10s"].mean()
        essays[key]["10s_pause_mean"] = ts_pause["10s"].mean()
        essays[key]["10s_words_mean"] = ts_words["10s"].mean()
        essays[key]["10s_backspace_mean"] = ts_backspace["10s"].mean()
        essays[key]["10s_input_mean"] = itscnt["10s"].mean()
        essays[key]["10s_removal_mean"] = rtscnt["10s"].mean()
        essays[key]["10s_noninput_mean"] = nitscnt["10s"].mean()
        essays[key]["10s_input_ratio_mean"] = intsratio["10s"].mean()
        essays[key]["10s_removal_ratio_mean"] = rtsratio["10s"].mean()

        essays[key]["10s_median"] = tscnt["10s"].median()
        essays[key]["10s_pause_median"] = ts_pause["10s"].median()
        essays[key]["10s_words_median"] = ts_words["10s"].median()
        essays[key]["10s_backspace_median"] = ts_backspace["10s"].median()
        essays[key]["10s_input_median"] = itscnt["10s"].median()
        essays[key]["10s_removal_median"] = rtscnt["10s"].median()
        essays[key]["10s_noninput_median"] = nitscnt["10s"].median()
        essays[key]["10s_input_ratio_median"] = intsratio["10s"].median()
        essays[key]["10s_removal_ratio_median"] = rtsratio["10s"].median()
    
        essays[key]["mn_mean"] = mncnt["mn"].mean()
        essays[key]["mn_pause_mean"] = mn_pause["mn"].mean()
        essays[key]["mn_words_mean"] = mn_words["mn"].mean()
        essays[key]["mn_backspace_mean"] = mn_backspace["mn"].mean()
        essays[key]["mn_input_mean"] = imncnt["mn"].mean()
        essays[key]["mn_removal_mean"] = rmncnt["mn"].mean()
        essays[key]["mn_noninput_mean"] = nimncnt["mn"].mean()
        essays[key]["mn_input_ratio_mean"] = inmnratio["mn"].mean()
        essays[key]["mn_removal_ratio_mean"] = rmnratio["mn"].mean()

        essays[key]["mn_median"] = mncnt["mn"].median()
        essays[key]["mn_pause_median"] = mn_pause["mn"].median()
        essays[key]["mn_words_median"] = mn_words["mn"].median()
        essays[key]["mn_backspace_median"] = mn_backspace["mn"].median()
        essays[key]["mn_input_median"] = imncnt["mn"].median()
        essays[key]["mn_removal_median"] = rmncnt["mn"].median()
        essays[key]["mn_noninput_median"] = nimncnt["mn"].median()
        essays[key]["mn_input_ratio_median"] = inmnratio["mn"].median()
        essays[key]["mn_removal_ratio_median"] = rmnratio["mn"].median()

        essays[key]["10s_std"] = tscnt["10s"].std()
        essays[key]["10s_pause_std"] = ts_pause["10s"].std()
        essays[key]["10s_words_std"] = tscnt["10s"].std()
        essays[key]["10s_backspace_std"] = tscnt["10s"].std()
        essays[key]["10s_input_std"] = itscnt["10s"].std()
        essays[key]["10s_removal_std"] = rtscnt["10s"].std()
        essays[key]["10s_noninput_std"] = nitscnt["10s"].std()
        essays[key]["10s_input_ratio_std"] = intsratio["10s"].std()
        essays[key]["10s_removal_ratio_std"] = rtsratio["10s"].std()
    
        essays[key]["mn_std"] = mncnt["mn"].std()
        essays[key]["mn_pause_std"] = mn_pause["mn"].std()
        essays[key]["mn_words_std"] = mn_words["mn"].std()
        essays[key]["mn_backspace_std"] = mn_backspace["mn"].std()
        essays[key]["mn_input_std"] = imncnt["mn"].std()
        essays[key]["mn_removal_std"] = rmncnt["mn"].std()
        essays[key]["mn_noninput_std"] = nimncnt["mn"].std()
        essays[key]["mn_input_ratio_std"] = inmnratio["mn"].std()
        essays[key]["mn_removal_ratio_std"] = rmnratio["mn"].std()

        essays[key]["10s_skew"] = tscnt["10s"].skew()
        essays[key]["10s_pause_skew"] = ts_pause["10s"].skew()
        essays[key]["10s_words_skew"] = ts_words["10s"].skew()
        essays[key]["10s_backspace_skew"] = ts_backspace["10s"].skew()
        essays[key]["10s_input_skew"] = itscnt["10s"].skew()
        essays[key]["10s_removal_skew"] = rtscnt["10s"].skew()
        essays[key]["10s_noninput_skew"] = nitscnt["10s"].skew()
        essays[key]["10s_input_ratio_skew"] = intsratio["10s"].skew()
        essays[key]["10s_removal_ratio_skew"] = rtsratio["10s"].skew()
    
        essays[key]["mn_skew"] = mncnt["mn"].skew()
        essays[key]["mn_pause_skew"] = mn_pause["mn"].skew()
        essays[key]["mn_words_skew"] = mn_words["mn"].skew()
        essays[key]["mn_backspace_skew"] = mn_backspace["mn"].skew()
        essays[key]["mn_input_skew"] = imncnt["mn"].skew()
        essays[key]["mn_removal_skew"] = rmncnt["mn"].skew()
        essays[key]["mn_noninput_skew"] = nimncnt["mn"].skew()
        essays[key]["mn_input_ratio_skew"] = inmnratio["mn"].skew()
        essays[key]["mn_removal_ratio_skew"] = rmnratio["mn"].skew()

        essays[key]["10s_kurt"] = tscnt["10s"].kurtosis()
        essays[key]["10s_pause_kurt"] = ts_pause["10s"].kurtosis()
        essays[key]["10s_words_kurt"] = ts_words["10s"].kurtosis()
        essays[key]["10s_backspace_kurt"] = ts_backspace["10s"].kurtosis()
        essays[key]["10s_input_kurt"] = itscnt["10s"].kurtosis()
        essays[key]["10s_removal_kurt"] = rtscnt["10s"].kurtosis()
        essays[key]["10s_noninput_kurt"] = nitscnt["10s"].kurtosis()
        essays[key]["10s_input_ratio_kurt"] = intsratio["10s"].kurtosis()
        essays[key]["10s_removal_ratio_kurt"] = rtsratio["10s"].kurtosis()
    
        essays[key]["mn_kurt"] = mncnt["mn"].kurtosis()
        essays[key]["mn_pause_kurt"] = mn_pause["mn"].kurtosis()
        essays[key]["mn_words_kurt"] = mn_words["mn"].kurtosis()
        essays[key]["mn_backspace_kurt"] = mn_backspace["mn"].kurtosis()
        essays[key]["mn_input_kurt"] = imncnt["mn"].kurtosis()
        essays[key]["mn_removal_kurt"] = rmncnt["mn"].kurtosis()
        essays[key]["mn_noninput_kurt"] = nimncnt["mn"].kurtosis()
        essays[key]["mn_input_ratio_kurt"] = inmnratio["mn"].kurtosis()
        essays[key]["mn_removal_ratio_kurt"] = rmnratio["mn"].kurtosis()
    return essays

In [13]:
def get_essay_df(essays, isTrain):
    at_mean = []
    input_mean = []
    pburst_mean = []
    rburst_mean = []
    ninput_mean = []
    ts_mean = []
    ts_pause_mean = []
    ts_removal_mean = []
    ts_words_mean = []
    mn_words_mean = []
    ts_backspace_mean = []
    mn_backspace_mean = []
    mn_mean = []
    mn_pause_mean = []
    mn_removal_mean = []
    ts_input_rat_mean = []
    mn_input_rat_mean = []
    pause_lmean = []
    pause_lstd = []
    pause_lskew = []
    pause_lkurt  = []
    pause_lent = []
    pause_total = []
    total_pauses = []

    wlen_mean = []
    wlen_median = []
    wlen_std = []
    wlen_skew = []
    wlen_kurt = []
    wlen_ent = []

    slen_mean = []
    slen_median = []
    slen_std = []
    slen_skew = []
    slen_kurt = []
    slen_ent = []
    
    at_median = []
    input_median = []
    pburst_median = []
    rburst_median = []
    ninput_median = []
    ts_median = []
    ts_pause_median = []
    ts_removal_median = []
    ts_words_median = []
    mn_words_median = []
    ts_backspace_median = []
    mn_backspace_median = []
    mn_median = []
    mn_pause_median = []
    mn_removal_median = []
    ts_input_rat_median = []
    mn_input_rat_median = []
    pause_lmedian = []


    at_std = []
    input_std = []
    pburst_std = []
    rburst_std = []
    ninput_std = []
    ts_std = []
    ts_pause_std = []
    ts_removal_std = []
    mn_std = []
    mn_pause_std = []
    mn_removal_std = []
    ts_words_std = []
    mn_words_std = []
    ts_backspace_std = []
    mn_backspace_std = []
    ts_input_rat_std = []
    mn_input_rat_std = []
    pause_lstd = []

    at_skew = []
    input_skew = []
    pburst_skew = []
    rburst_skew = []
    ninput_skew = []
    ts_skew = []
    ts_pause_skew = []
    ts_removal_skew = []
    mn_skew = []
    mn_pause_skew = []
    mn_removal_skew = []
    ts_words_skew = []
    mn_words_skew = []
    ts_backspace_skew = []
    mn_backspace_skew = []
    ts_input_rat_skew = []
    mn_input_rat_skew = []


    at_kurt = []
    input_kurt = []
    pburst_kurt = []
    rburst_kurt = []
    ninput_kurt = []
    ts_kurt = []
    ts_pause_kurt = []
    ts_removal_kurt = []
    mn_kurt = []
    mn_pause_kurt = []
    mn_removal_kurt = []
    ts_words_kurt = []
    mn_words_kurt = []
    ts_backspace_kurt = []
    mn_backspace_kurt = []
    ts_input_rat_kurt = []
    mn_input_rat_kurt = []


    ent = []
    input_ent = []
    pburst_ent = []
    rburst_ent = []
    ninput_ent = []
    ts_ent = []
    ts_pause_ent = []
    ts_removal_ent = []
    mn_ent = []
    mn_pause_ent = []
    mn_removal_ent = []
    ts_words_ent = []
    mn_words_ent = []
    ts_backspace_ent = []
    mn_backspace_ent = []
    ts_input_rat_ent = []
    mn_input_rat_ent = []
    pause_lent = []
    sentence_rat = []
    comma_rat = []
    backspace_rat = []
    exclamation_rat = []
    space_rat = []
    comma_sentence_rat = []
    word_sentence_rat = []
    backspace_sentence_rat = []
    input_rat = []
    quote_rat = []
    num_paras = []
    ids = []
    scr = []
    for key in essays.keys():
        val = essays[key]
        print(key)
        ids.append(key)
        sentence_rat.append(val["sentence_ratio"])
        comma_rat.append(val["comma_ratio"])
        backspace_rat.append(val["comma_ratio"])
        exclamation_rat.append(val["exclamation_ratio"])
        space_rat.append(val["space_ratio"])
        comma_sentence_rat.append(val["comma_sentence_ratio"])
        word_sentence_rat.append(val["word_sentence_ratio"])
        input_rat.append(val["input_ratio"])
        quote_rat.append(val["quote_ratio"])
        backspace_sentence_rat.append(val["backspace_sentence_ratio"])
        pause_lmean.append(val["pause_mean"])
        pause_lstd.append(val["pause_std"])
        pause_lskew.append(val["pause_skew"])
        pause_lkurt.append(val["pause_kurt"])
        pause_lent.append(val["pause_ent"])
        pause_total.append(val["pause_total"])
        total_pauses.append(val["total_pauses"])
        num_paras.append(val["num_paras"])
        at_mean.append(val["at_mean"])
        input_mean.append(val["input_mean"])
        pburst_mean.append(val["pburst_mean"])
        rburst_mean.append(val["rburst_mean"])
        ninput_mean.append(val["ninput_mean"])

        wlen_mean.append(val["wlen_mean"])
        wlen_median.append(val["wlen_median"])
        wlen_std.append(val["wlen_std"])
        wlen_skew.append(val["wlen_skew"])
        wlen_kurt.append(val["wlen_kurt"])
        wlen_ent.append(val["wlen_ent"])

        slen_mean.append(val["wlen_mean"])
        slen_median.append(val["wlen_median"])
        slen_std.append(val["wlen_std"])
        slen_skew.append(val["wlen_skew"])
        slen_kurt.append(val["wlen_kurt"])
        slen_ent.append(val["wlen_ent"])
        
        at_median.append(val["at_median"])
        input_median.append(val["input_median"])
        ninput_median.append(val["ninput_median"])
        pburst_median.append(val["pburst_median"])
        rburst_median.append(val["rburst_median"])
        
        ts_mean.append(val["10s_mean"])
        ts_pause_mean.append(val["10s_pause_mean"])
        ts_removal_mean.append(val["10s_removal_mean"])
        mn_mean.append(val["mn_mean"])
        mn_pause_mean.append(val["mn_pause_mean"])
        mn_removal_mean.append(val["mn_removal_mean"])
        ts_words_mean.append(val["10s_words_mean"])
        mn_words_mean.append(val["mn_words_mean"])
        ts_backspace_mean.append(val["10s_backspace_mean"])
        mn_backspace_mean.append(val["mn_backspace_mean"])
        ts_input_rat_mean.append(val["10s_input_ratio_mean"])
        mn_input_rat_mean.append(val["mn_input_ratio_mean"])

        ts_median.append(val["10s_median"])
        ts_pause_median.append(val["10s_pause_median"])
        ts_removal_median.append(val["10s_removal_median"])
        mn_median.append(val["mn_median"])
        mn_pause_median.append(val["mn_pause_median"])
        mn_removal_median.append(val["mn_removal_median"])
        ts_words_median.append(val["10s_words_median"])
        mn_words_median.append(val["mn_words_median"])
        ts_backspace_median.append(val["10s_backspace_median"])
        mn_backspace_median.append(val["mn_backspace_median"])
        ts_input_rat_median.append(val["10s_input_ratio_median"])
        mn_input_rat_median.append(val["mn_input_ratio_median"])
    
        at_std.append(val["at_std"])
        input_std.append(val["input_std"])
        pburst_std.append(val["pburst_std"])
        rburst_std.append(val["rburst_std"])
        ninput_std.append(val["ninput_std"])
        ts_std.append(val["10s_std"])
        ts_pause_std.append(val["10s_pause_std"])
        ts_removal_std.append(val["10s_removal_std"])
        mn_std.append(val["mn_std"])
        mn_pause_std.append(val["mn_pause_std"])
        mn_removal_std.append(val["mn_removal_std"])
        ts_words_std.append(val["10s_words_std"])
        mn_words_std.append(val["mn_words_std"])
        ts_backspace_std.append(val["10s_backspace_std"])
        mn_backspace_std.append(val["mn_backspace_std"])
        ts_input_rat_std.append(val["10s_input_ratio_std"])
        mn_input_rat_std.append(val["mn_input_ratio_std"])
    
        at_skew.append(val["at_skew"])
        input_skew.append(val["input_skew"])
        pburst_skew.append(val["pburst_skew"])
        rburst_skew.append(val["rburst_skew"])
        ninput_skew.append(val["ninput_skew"])
        ts_skew.append(val["10s_skew"])
        ts_pause_skew.append(val["10s_pause_skew"])
        ts_removal_skew.append(val["10s_removal_skew"])
        mn_skew.append(val["mn_skew"])
        mn_pause_skew.append(val["mn_pause_skew"])
        mn_removal_skew.append(val["mn_removal_skew"])
        ts_words_skew.append(val["10s_words_skew"])
        mn_words_skew.append(val["mn_words_skew"])
        ts_backspace_skew.append(val["10s_backspace_skew"])
        mn_backspace_skew.append(val["mn_backspace_skew"])
        ts_input_rat_skew.append(val["10s_input_ratio_skew"])
        mn_input_rat_skew.append(val["mn_input_ratio_skew"])
    
        at_kurt.append(val["at_kurtosis"])
        input_kurt.append(val["input_kurtosis"])
        pburst_kurt.append(val["pburst_kurt"])
        rburst_kurt.append(val["rburst_kurt"])
        ninput_kurt.append(val["ninput_kurtosis"])
        ts_kurt.append(val["10s_kurt"])
        ts_pause_kurt.append(val["10s_pause_kurt"])
        ts_removal_kurt.append(val["10s_removal_kurt"])
        mn_kurt.append(val["mn_kurt"])
        mn_pause_kurt.append(val["mn_pause_kurt"])
        mn_removal_kurt.append(val["mn_removal_kurt"])
        ts_words_kurt.append(val["10s_words_kurt"])
        mn_words_kurt.append(val["mn_words_kurt"])
        ts_backspace_kurt.append(val["10s_backspace_kurt"])
        mn_backspace_kurt.append(val["mn_backspace_kurt"])
        ts_input_rat_kurt.append(val["10s_input_ratio_kurt"])
        mn_input_rat_kurt.append(val["mn_input_ratio_kurt"])
    
        ent.append(val["entropy"])
        input_ent.append(val["input_entropy"])
        pburst_ent.append(val["pburst_ent"])
        rburst_ent.append(val["rburst_ent"])
        ninput_ent.append(val["ninput_entropy"])
        ts_ent.append(val["10s_entropy"])
        ts_pause_ent.append(val["10s_pause_entropy"])
        ts_removal_ent.append(val["10s_removal_entropy"])
        mn_ent.append(val["mn_entropy"])
        mn_pause_ent.append(val["mn_pause_entropy"])
        mn_removal_ent.append(val["mn_removal_entropy"])
        ts_words_ent.append(val["10s_words_entropy"])
        mn_words_ent.append(val["mn_words_entropy"])
        ts_backspace_ent.append(val["10s_backspace_entropy"])
        mn_backspace_ent.append(val["mn_backspace_entropy"])
        ts_input_rat_ent.append(val["10s_input_ratio_entropy"])
        mn_input_rat_ent.append(val["mn_input_ratio_entropy"])
    
        if isTrain:
            scr.append(val["score"])


    lst_dct = {"at_mean" : at_mean,
        "pause_mean" : pause_lmean,
        "input_mean" : input_mean,
        "pburst_mean" : pburst_mean,
        "rburst_mean" : rburst_mean,
        "ninput_mean" : ninput_mean,
        "10s_mean" : ts_mean,
        "10s_pause_mean" : ts_pause_mean,
        "10s_removal_mean" : ts_removal_mean,
        "mn_mean" : mn_mean,
        "mn_pause_mean" : mn_pause_mean,
        "mn_removal_mean" : mn_removal_mean,
        "10s_words_mean" : ts_words_mean,
        "mn_words_mean" : mn_words_mean,
        "10s_backspace_mean" : ts_backspace_mean,
        "mn_backspace_mean" : mn_backspace_mean,
        "10s_input_rat_mean" : ts_input_rat_mean,
        "mn_input_rat_mean" : mn_input_rat_mean,
        "at_std": at_std,
        "pause_std": pause_lstd,
        "input_std" : input_std,
        "pburst_std" : pburst_std,
        "rburst_std" : rburst_std,
        "ninput_std" : ninput_std,
        "10s_std" : ts_std,
        "10s_pause_std" : ts_pause_std,
        "10s_removal_std" : ts_removal_std,
        "mn_std" : mn_std,
        "mn_pause_std" : mn_pause_std,
        "mn_removal_std" : mn_removal_std,
        "10s_words_std" : ts_words_std,
        "mn_words_std" : mn_words_std,
        "10s_backspace_std" : ts_backspace_std,
        "mn_backspace_std" : mn_backspace_std,
        "10s_input_rat_std" : ts_input_rat_std,
        "mn_input_rat_std" : mn_input_rat_std,
        "at_skew": at_skew,
        "pause_skew": pause_lskew,
        "input_skew": input_skew,
        "pburst_skew" : pburst_skew,
        "rburst_skew" : rburst_skew,
        "ninput_skew" : ninput_skew,
        "10s_skew" : ts_skew,
        "10s_pause_skew" : ts_pause_skew,
        "10s_removal_skew" : ts_removal_skew,
        "mn_skew" : mn_skew,
        "mn_pause_skew" : mn_pause_skew,
        "mn_removal_skew" : mn_removal_skew,
        "10s_words_skew" : ts_words_skew,
        "mn_words_skew" : mn_words_skew,
        "10s_backspace_skew" : ts_backspace_skew,
        "mn_backspace_skew" : mn_backspace_skew,
        "10s_input_rat_skew" : ts_input_rat_skew,
        "mn_input_rat_skew" : mn_input_rat_skew,
        "at_kurt": at_kurt, 
        "pause_kurt": pause_lkurt,
        "input_kurt" : input_kurt,
        "pburst_kurt" : pburst_kurt,
        "rburst_kurt" : rburst_kurt,
        "ninput_kurt" : ninput_kurt,
        "10s_kurt" : ts_kurt,
        "10s_pause_kurt" : ts_pause_kurt,
        "10s_removal_kurt" : ts_removal_kurt,
        "mn_kurt" : mn_kurt,
        "mn_pause_lkurt" : mn_pause_kurt,
        "mn_removal_kurt" : mn_removal_kurt,
        "10s_words_kurt" : ts_words_kurt,
        "mn_words_kurt" : mn_words_kurt,
        "10s_backspace_kurt" : ts_backspace_kurt,
        "mn_backspace_kurt" : mn_backspace_kurt,
        "10s_input_rat_kurt" : ts_input_rat_kurt,
        "mn_input_rat_kurt" : mn_input_rat_kurt,
        "ent" : ent, 
        "input_ent" : input_ent,
        "pburst_ent" : pburst_ent,
        "rburst_ent" : rburst_ent,
        "pause_ent" : pause_lent,
        "ninput_ent" : ninput_ent,
        "10s_ent" : ts_ent,
        "10s_pause_ent" : ts_pause_ent,
        "10s_removal_ent" : ts_removal_ent,
        "mn_ent" : mn_ent,
        "mn_pause_ent" : mn_pause_ent,
        "mn_removal_ent" : mn_removal_ent,
        "10s_words_ent" : ts_words_ent,
        "mn_words_ent" : mn_words_ent,
        "10s_backspace_ent" : ts_backspace_ent,
        "mn_backspace_ent" : mn_backspace_ent,
        "10s_input_rat_ent" : ts_input_rat_ent,
        "mn_input_rat_ent" : mn_input_rat_ent,
        "sentence_rat" : sentence_rat,
        "comma_rat" : comma_rat,
        "backspace_rat" : backspace_rat,
        "exclamation_rat" : exclamation_rat,
        "space_rat" : space_rat,
        "comma_sentence_rat" : comma_sentence_rat,
        "word_sentence_rat" : word_sentence_rat,
        "input_rat" : input_rat,
        "quote_rat" : quote_rat,
        "backspace_sentence_rat" : backspace_sentence_rat,
        "wlen_mean" : wlen_mean,
        "wlen_std" : wlen_std,
        "wlen_skew" : wlen_skew,
        "wlen_kurt" : wlen_kurt,
        "wlen_ent" : wlen_ent,
        "slen_mean" : slen_mean,
        "slen_std" : slen_std,
        "slen_skew" : slen_skew,
        "slen_kurt" : slen_kurt,
        "slen_ent" : slen_ent,
        "pause_total" : pause_total,
        "total_pauses" : total_pauses,
        "id" : ids,
        "num_paras" : num_paras}

    if isTrain:
        lst_dct["score"] = scr

    return pd.DataFrame(lst_dct)
    

In [14]:
class FeatureExtractor():
    def __init__(self, logs):
        self.logs = logs # Training logs
        
    def count_by_values(self, colname, used_cols):
        fts = self.logs.select(pl.col('id').unique(maintain_order=True))
        for i, col in enumerate(used_cols):
            tmp_logs = self.logs.group_by('id').agg(
                            pl.col(colname).is_in([col]).sum().alias(f'{colname}_{i}_cnt')
                                    )
            fts  = fts.join(tmp_logs, on='id', how='left') 
        return fts
    
    # Create the features from statistics of activities, text changes, events
    def create_count_by_values_feats(self):
        activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
        events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.',
                       ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
        text_changes = ['q', ' ', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']        
        #=== Create the feature columns using count by values ===
        df = self.count_by_values('activity', activities) # Create 'activity' column
        df = df.join(self.count_by_values('text_change', text_changes), on='id', how='left')
        df = df.join(self.count_by_values('down_event', events), on='id', how='left') 
        df = df.join(self.count_by_values('up_event', events), on='id', how='left')
        print(df.collect().head())
        return df
    
    # Create the features 
    def create_input_words_feats(self):
        # Filter no changes 
        df = self.logs.filter((~pl.col('text_change').str.contains('=>')) & (pl.col('text_change') != 'NoChange'))
        # Aggregate the text changes by id
        df = df.group_by('id').agg(pl.col('text_change').str.concat('').str.extract_all(r'q+'))
        # creates only two columns ('id' and 'text_change') 
        df = df.with_columns(input_word_count=pl.col('text_change').list.lengths(),
                             input_word_length_mean=pl.col('text_change').apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_max=pl.col('text_change').apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_std=pl.col('text_change').apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_median=pl.col('text_change').apply(lambda x: np.median([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_skew=pl.col('text_change').apply(lambda x: skew([len(i) for i in x] if len(x) > 0 else 0)))
        df = df.drop('text_change') # Remove 'text_change' to avoid including duplicated `text_change` column
        return df
    
    # Create the statistical numeric features (e.g. sum, median, mean min, 0.5_quantile)
    def create_numeric_feats(self):
        num_cols = ['down_time', 'up_time', 'action_time', 'cursor_position', 'word_count']
        df = self.logs.group_by("id").agg(pl.sum('action_time').suffix('_sum'),
                                                pl.mean(num_cols).suffix('_mean'),
                                                pl.std(num_cols).suffix('_std'),
                                                pl.median(num_cols).suffix('_median'), pl.min(num_cols).suffix('_min'), pl.max(num_cols).suffix('_max'),
                                                pl.quantile(num_cols, 0.5).suffix('_quantile'))
        return df
    
    def create_categorical_feats(self):
        df  = self.logs.group_by("id").agg(
                pl.n_unique(['activity', 'down_event', 'up_event', 'text_change']))
        return df
    
    # Create the idle time features 
    def create_idle_time_feats(self):
        df = self.logs.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
        df = df.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
        df = df.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
        df = df.group_by("id").agg(inter_key_largest_lantency = pl.max('time_diff'),
                                   inter_key_median_lantency = pl.median('time_diff'),
                                   mean_pause_time = pl.mean('time_diff'),
                                   std_pause_time = pl.std('time_diff'),
                                   total_pause_time = pl.sum('time_diff'),
                                   pauses_half_sec = pl.col('time_diff').filter((pl.col('time_diff') > 0.5) & (pl.col('time_diff') < 1)).count(),
                                   pauses_1_sec = pl.col('time_diff').filter((pl.col('time_diff') > 1) & (pl.col('time_diff') < 1.5)).count(),
                                   pauses_1_half_sec = pl.col('time_diff').filter((pl.col('time_diff') > 1.5) & (pl.col('time_diff') < 2)).count(),
                                   pauses_2_sec = pl.col('time_diff').filter((pl.col('time_diff') > 2) & (pl.col('time_diff') < 3)).count(),
                                   pauses_3_sec = pl.col('time_diff').filter(pl.col('time_diff') > 3).count(),)
        
        return df
    
    # Create p-bursts features using up and down time and activity
    def create_p_bursts_feats(self):
        df = self.logs.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
        df = df.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
        df = df.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
        df = df.with_columns(pl.col('time_diff')<2)
        df = df.with_columns(pl.when(pl.col("time_diff") & pl.col("time_diff").is_last()).then(pl.count()).over(pl.col("time_diff").rle_id()).alias('P-bursts'))
        df = df.drop_nulls()
        df = df.group_by("id").agg(pl.mean('P-bursts').suffix('_mean'),
                                   pl.std('P-bursts').suffix('_std'),
                                   pl.count('P-bursts').suffix('_count'),
                                   pl.median('P-bursts').suffix('_median'),
                                   pl.max('P-bursts').suffix('_max'),
                                   pl.first('P-bursts').suffix('_first'),
                                   pl.last('P-bursts').suffix('_last'))
        return df
    
    # Create r-burst features using activity 
    def create_r_bursts_feats(self):
        df = self.logs.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
        df = df.with_columns(pl.col('activity').is_in(['Remove/Cut']))
        df = df.with_columns(pl.when(pl.col("activity") & pl.col("activity").is_last()).then(pl.count()).over(pl.col("activity").rle_id()).alias('R-bursts'))
        df = df.drop_nulls()
        df = df.group_by("id").agg(pl.mean('R-bursts').suffix('_mean'),
                                   pl.std('R-bursts').suffix('_std'), 
                                   pl.median('R-bursts').suffix('_median'),
                                   pl.max('R-bursts').suffix('_max'),
                                   pl.first('R-bursts').suffix('_first'),
                                   pl.last('R-bursts').suffix('_last'))
        return df

    # Main function creates all 122 features
    def create_feats(self):
        feats = self.create_count_by_values_feats()  # 52 columns in total
#         print(f"< Count by values features > {len(feats.columns)}")        
        feats = feats.join(self.create_input_words_feats(), on='id', how='left')  # 58 columns
#         print(f"< Input words stats features > {len(feats.columns)}")
        feats = feats.join(self.create_numeric_feats(), on='id', how='left') # 89 columns
#         print(f"< Numerical features > {len(feats.columns)}")
        feats = feats.join(self.create_categorical_feats(), on='id', how='left') # 93 columns      
#         print(f"< Categorical features > {len(feats.columns)}")
        feats = feats.join(self.create_idle_time_feats(), on='id', how='left') # 103 columns
#         print(f"< Idle time features > {len(feats.columns)}")
        feats = feats.join(self.create_p_bursts_feats(), on='id', how='left') # 110 columns
#         print(f"< P-bursts features > {len(feats.columns)}")
        feats = feats.join(self.create_r_bursts_feats() , on='id', how='left') # 116 columns
#         print(f"< R-bursts features > {len(feats.columns)}")        
        return feats # 116 features


In [15]:
fe = FeatureExtractor(train_logs)
train_feats = fe.create_feats() # Extract features from trainning logs (polars df)
train_feats = train_feats.collect().to_pandas() # Convert polars df to pandas df
train_feats.to_csv("train_feats_0.csv")
# print(train_feats.describe())
train_logs = train_logs.collect().to_pandas()  # Convert polars df to pandas df

del fe

shape: (5, 52)
┌──────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ id       ┆ activity_ ┆ activity_ ┆ activity_ ┆ … ┆ up_event_ ┆ up_event_ ┆ up_event_ ┆ up_event_ │
│ ---      ┆ 0_cnt     ┆ 1_cnt     ┆ 2_cnt     ┆   ┆ 12_cnt    ┆ 13_cnt    ┆ 14_cnt    ┆ 15_cnt    │
│ str      ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆ u32       ┆ u32       ┆ u32       ┆   ┆ u32       ┆ u32       ┆ u32       ┆ u32       │
╞══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 001519c8 ┆ 2010      ┆ 417       ┆ 120       ┆ … ┆ 0         ┆ 3         ┆ 0         ┆ 0         │
│ 0022f953 ┆ 1938      ┆ 260       ┆ 254       ┆ … ┆ 0         ┆ 3         ┆ 0         ┆ 0         │
│ 0042269b ┆ 3515      ┆ 439       ┆ 175       ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0         │
│ 0059420b ┆ 1304      ┆ 151       ┆ 99        ┆ … ┆ 2         ┆ 2         ┆

In [16]:
def q1(x):
    return x.quantile(0.25)
def q3(x):
    return x.quantile(0.75)

class EssayConstructor():
    def __init__(self, logs):
        self.logs = logs
        self.train_essays = self.get_train_essays(self.logs)
        self.AGGREGATIONS = ['count', 'mean', 'min', 'max', 'first', 'last', q1, 'median', q3, 'sum']
    
    # Get the essay from train logs 
    def get_train_essays(self, logs):
        def reconstruct_essay(currTextInput):
            essayText = ""
            for Input in currTextInput.values:
                if Input[0] == 'Replace':
                    replaceTxt = Input[2].split(' => ')
                    essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] + essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
                    continue
                if Input[0] == 'Paste':
                    essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
                    continue
                if Input[0] == 'Remove/Cut':
                    essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
                    continue
                if "M" in Input[0]:
                    croppedTxt = Input[0][10:]
                    splitTxt = croppedTxt.split(' To ')
                    valueArr = [item.split(', ') for item in splitTxt]
                    moveData = (int(valueArr[0][0][1:]), int(valueArr[0][1][:-1]), int(valueArr[1][0][1:]), int(valueArr[1][1][:-1]))
                    if moveData[0] != moveData[2]:
                        if moveData[0] < moveData[2]:
                            essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] + essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                        else:
                            essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] + essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
                    continue
                essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
            return essayText
        
        # Filter logs 
        df = logs[logs.activity != 'Nonproduction']
        group_df = df.groupby('id').apply(lambda x: reconstruct_essay(x[['activity', 'cursor_position', 'text_change']]))
        essay_df = pd.DataFrame({'id': df['id'].unique().tolist()})
        essay_df = essay_df.merge(group_df.rename('essay'), on='id')
        return essay_df

    # Create word level features from train essay
    def create_word_feats(self):
        df = self.train_essays.copy()
        df['word'] = df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!',x))
        df = df.explode('word')
        df['word_len'] = df['word'].apply(lambda x: len(x))
        df = df[df['word_len'] != 0] # Remove all the no-word record
        # Aggregate word level features
        word_agg_df = df[['id','word_len']].groupby(['id']).agg(self.AGGREGATIONS)
        word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
        word_agg_df['id'] = word_agg_df.index
        word_agg_df = word_agg_df.reset_index(drop=True)
        return word_agg_df
    # Create sentence level features
    def create_sentence_feats(self):
        df = self.train_essays.copy()
        df['sent'] = df['essay'].apply(lambda x: re.split('\\.|\\?|\\!',x))
        df = df.explode('sent')
        df['sent'] = df['sent'].apply(lambda x: x.replace('\n','').strip())
        # Number of characters in sentences
        df['sent_len'] = df['sent'].apply(lambda x: len(x))
        # Number of words in sentences
        df['sent_word_count'] = df['sent'].apply(lambda x: len(x.split(' ')))
        df = df[df.sent_len!=0].reset_index(drop=True)
        # Aggregate sentence level features
        sent_agg_df = pd.concat([df[['id','sent_len']].groupby(['id']).agg(self.AGGREGATIONS), 
                                 df[['id','sent_word_count']].groupby(['id']).agg(self.AGGREGATIONS)], axis=1)
        sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
        sent_agg_df['id'] = sent_agg_df.index
        sent_agg_df = sent_agg_df.reset_index(drop=True)
        sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
        sent_agg_df = sent_agg_df.rename(columns={"sent_len_count":"sent_count"})
        return sent_agg_df
    # Create paragraph level features
    def create_paragraph_feats(self):
        df = self.train_essays.copy()
        df['paragraph'] = df['essay'].apply(lambda x: x.split('\n'))
        df = df.explode('paragraph')
        # Number of characters in paragraphs
        df['paragraph_len'] = df['paragraph'].apply(lambda x: len(x)) 
        # Number of words in paragraphs
        df['paragraph_word_count'] = df['paragraph'].apply(lambda x: len(x.split(' ')))
        df = df[df.paragraph_len!=0].reset_index(drop=True)
        # Aggregate paragraph level features
        paragraph_agg_df = pd.concat([df[['id','paragraph_len']].groupby(['id']).agg(self.AGGREGATIONS), 
                                      df[['id','paragraph_word_count']].groupby(['id']).agg(self.AGGREGATIONS)], axis=1) 
        paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
        paragraph_agg_df['id'] = paragraph_agg_df.index
        paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
        paragraph_agg_df.drop(columns=["paragraph_word_count_count"], inplace=True)
        paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count":"paragraph_count"})
        return paragraph_agg_df

    
    # Create product to keys features
    def create_product_to_keys_feats(self):
        essays = self.train_essays.copy()
        logs = self.logs.copy()
        essays['product_len'] = essays['essay'].str.len()
        tmp_df = logs[logs['activity'].isin(['Input', 'Remove/Cut'])].groupby(['id']).agg({'activity': 'count'}).reset_index().rename(columns={'activity': 'keys_pressed'})
        essays = essays.merge(tmp_df, on='id', how='left')
        essays['product_to_keys'] = essays['product_len'] / essays['keys_pressed']
        return essays[['id', 'product_to_keys']]

    # Create key pressed features
    def create_keys_pressed_feats(self):
        logs = self.logs.copy()
        temp_df = logs[logs['activity'].isin(['Input', 'Remove/Cut'])].groupby(['id']).agg(keys_pressed=('event_id', 'count')).reset_index()
        temp_df_2 = logs.groupby(['id']).agg(min_down_time=('down_time', 'min'), max_up_time=('up_time', 'max')).reset_index()
        temp_df = temp_df.merge(temp_df_2, on='id', how='left')
        temp_df['keys_per_second'] = temp_df['keys_pressed'] / ((temp_df['max_up_time'] - temp_df['min_down_time']) / 1000)
        return temp_df[['id', 'keys_per_second']]
    
    def create_feats(self, feats):
        feats = feats.merge(self.create_word_feats(), on='id', how='left') # 126 columns in total
#         print(f"{len(feats.columns)}")
        feats = feats.merge(self.create_sentence_feats(), on='id', how='left') # 145 columns
#         print(f"{len(feats.columns)}")
        feats = feats.merge(self.create_paragraph_feats(), on='id', how='left') # 164 columns
#         print(f"{len(feats.columns)}")
        feats = feats.merge(self.create_keys_pressed_feats(), on='id', how='left') # 166 columns
#         print(f"{len(feats.columns)}")
        feats = feats.merge(self.create_product_to_keys_feats(), on='id', how='left') # 165 columns
#         print(f"{len(feats.columns)}")
        return feats

In [17]:

print('< Essay Reconstruction >')
ec = EssayConstructor(train_logs)
train_feats = ec.create_feats(train_feats)
# Writing to csg
train_feats.to_csv("train_feats.csv")
del ec


< Essay Reconstruction >


In [18]:
train_scores  = pd.read_csv(data_path + 'train_scores.csv')
ess = get_essay_df(get_other_moments(get_essay_moments(process_essays(train_logs, train_scores))), True)
# Merge train features and train scores as training data
train_data = train_feats.merge(train_scores, on='id', how='left')

['001519c8' '0022f953' '0042269b' ... 'ffccd6fd' 'ffec5b38' 'fff05981']
001519c8
0022f953
0042269b
0059420b
0075873a
0081af50
0093f095
009e23ab
00e048f1
00e1f05a
00e713bd
00f0737e
00f8e84c
00fc9a6a
0144e4d5
014e7ae9
015aa732
0178a105
0182aa1c
0190ff4c
01963e20
019737b6
01992d32
01c359fc
01d0ba4b
01d602a7
0249a095
0262bf61
026be946
0294b4f5
02a41d1a
02cf6a52
02d3c9fc
02e86b6a
031c0c58
034d61db
035f09fc
036cfd42
0395b217
03971ddf
040c429b
0417d421
0432f117
0445b534
044b274d
044c5c54
048fd254
04b88b3e
04c5f885
04e37b6b
04e63aec
04f83ccb
052a7811
052b25e1
052cf3a6
053be4e5
05624ab0
056c41fc
05b35fa0
05f425a4
05f8ca37
06336d82
064a82e9
0657fcde
0666fb4e
069d668b
06cba5d7
06e59db8
06f33221
073b3535
075f92a5
077ee03e
078a6196
07bb2245
07bb7765
07c36ae1
07e41499
07f3409c
081d84f4
08276ea7
08288a1f
08390d49
088efb89
08dbdc95
08e3c7ae
08e82426
08eb331a
08ed1a7f
08f461ac
08f9b53a
0916cdad
09429577
0949205b
098c4b58
098f7878
09a67581
09bf7971
09ea6d33
09eb3ce5
0a1be27b
0a21bbd6
0a248634
0a31d60e
0

In [19]:
print(train_data)

            id  activity_0_cnt  activity_1_cnt  activity_2_cnt  \
0     001519c8            2010             417             120   
1     0022f953            1938             260             254   
2     0042269b            3515             439             175   
3     0059420b            1304             151              99   
4     0075873a            1942             517              72   
...        ...             ...             ...             ...   
2466  ffb8c745            3588             960             189   
2467  ffbef7e5            2395              60             148   
2468  ffccd6fd            2849              88             126   
2469  ffec5b38            2895             276              71   
2470  fff05981            2452             310             843   

      activity_3_cnt  activity_4_cnt  text_change_0_cnt  text_change_1_cnt  \
0                  7               0               1940                436   
1                  1               1               

In [20]:
train_data_es = pd.concat([train_data, ess], axis=1).drop(['id', 'score'], axis=1)

In [21]:
data_Y = train_data['score'].values
train_data = train_data.drop(['id', 'score'], axis=1)

In [22]:
print(train_data)

      activity_0_cnt  activity_1_cnt  activity_2_cnt  activity_3_cnt  \
0               2010             417             120               7   
1               1938             260             254               1   
2               3515             439             175               7   
3               1304             151              99               1   
4               1942             517              72               0   
...              ...             ...             ...             ...   
2466            3588             960             189               2   
2467            2395              60             148               1   
2468            2849              88             126               0   
2469            2895             276              71               0   
2470            2452             310             843              12   

      activity_4_cnt  text_change_0_cnt  text_change_1_cnt  text_change_2_cnt  \
0                  0               1940               

In [23]:
def outlier_cnt_feat(df):
    cols = df.columns
    mn = {}
    st = {}
    for col in cols:
        mn[col] = df[col].mean()
        st[col] = df[col].std()
    def count_outliers(row):
        cnt = 0
        for col in cols:
            if st[col] == np.nan or col == 'id':
                continue
            if row[col] >= mn[col] + st[col] or row[col] <= mn[col] - st[col]:
                cnt += 1
        return cnt / len(cols)
    return count_outliers

def outlier_dst_feat(df):
    cols = df.columns
    mn = {}
    nrm = {}
    for col in cols:
        mn[col] = df[col].mean()
        mx = abs(df[col].max())
        mni = abs(df[col].min())
        if mx > mni:
            nrm[col] = mx
        else:
            nrm[col] = mni
    def dst_outliers(row):
        dst = 0
        num = 0
        for col in cols:
            if np.isnan(row[col]) or nrm[col] == 0 or col == 'id':
                continue
            d = (mn[col] - row[col]) / nrm[col]
            dst += d**2
            num += 1
        return dst / num
    return dst_outliers

In [24]:
train_data["outlier_cnt"] = train_data.apply(outlier_cnt_feat(train_data), axis=1)
train_data["outlier_dst"] = train_data.apply(outlier_dst_feat(train_data), axis=1)

train_data_es["outlier_cnt"] = train_data_es.apply(outlier_cnt_feat(train_data_es), axis=1)
train_data_es["outlier_dst"] = train_data_es.apply(outlier_dst_feat(train_data_es), axis=1)

In [25]:
print(train_data_es["outlier_dst"])

0       0.003848
1       0.003140
2       0.003081
3       0.005185
4       0.005471
          ...   
2466    0.006782
2467    0.009731
2468    0.039061
2469    0.002055
2470    0.005584
Name: outlier_dst, Length: 2471, dtype: float64


In [47]:
print('< Testing Data >')  # Load test data
test_logs = pl.scan_csv(data_path + 'test_logs.csv')
# Extract statistical features
fe = FeatureExtractor(test_logs)
test_feats = fe.create_feats() 
test_feats = test_feats.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
# Extract essay features
ec = EssayConstructor(test_logs)
test_feats = ec.create_feats(test_feats)
print("Nan columns of test data", test_feats.columns[test_feats.isna().any()].tolist()) # Check if any na in the test data

test_ids = test_feats['id'].values
test_x = test_feats.drop(['id'], axis=1)
#test_x_ess = get_essay_df(get_other_moments(get_essay_moments(process_essays(test_logs, None))), False).drop(['id'], axis=1)
test_x_ess = pd.concat([test_x_ess, test_x], axis=1)

test_x["outlier_cnt"] = test_x.apply(outlier_cnt_feat(test_x), axis=1)
test_x["outlier_dst"] = test_x.apply(outlier_dst_feat(test_x), axis=1)

test_x_ess["outlier_cnt"] = test_x_ess.apply(outlier_cnt_feat(test_x_ess), axis=1)
test_x_ess["outlier_dst"] = test_x_ess.apply(outlier_dst_feat(test_x_ess), axis=1)

< Testing Data >
shape: (3, 52)
┌──────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ id       ┆ activity_ ┆ activity_ ┆ activity_ ┆ … ┆ up_event_ ┆ up_event_ ┆ up_event_ ┆ up_event_ │
│ ---      ┆ 0_cnt     ┆ 1_cnt     ┆ 2_cnt     ┆   ┆ 12_cnt    ┆ 13_cnt    ┆ 14_cnt    ┆ 15_cnt    │
│ str      ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆ u32       ┆ u32       ┆ u32       ┆   ┆ u32       ┆ u32       ┆ u32       ┆ u32       │
╞══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0000aaaa ┆ 2         ┆ 0         ┆ 0         ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0         │
│ 2222bbbb ┆ 2         ┆ 0         ┆ 0         ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0         │
│ 4444cccc ┆ 2         ┆ 0         ┆ 0         ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0         │
└──────────┴───────────┴───────────┴───────────┴───┴───────

In [27]:
def get_outliers_and_bin(ess):
    ud = []
    out = []
    for val in ess:
        if val <= 5 and val >= 3:
            out.append(0)
        else:
            out.append(1)
        if val < 4.0:
            ud.append(0)
        else:
            ud.append(1)
        
    
    return (ud, out)

In [48]:
data_X_ess = pd.concat([train_data, ess], axis=1).drop(['id', 'score'], axis=1)
data_X = train_data

bin, out = get_outliers_and_bin(data_Y)
data_Ybin = np.array(bin)
data_Yout = np.array(out)

In [29]:
for col in data_X.columns:
    print(col)

activity_0_cnt
activity_1_cnt
activity_2_cnt
activity_3_cnt
activity_4_cnt
text_change_0_cnt
text_change_1_cnt
text_change_2_cnt
text_change_3_cnt
text_change_4_cnt
text_change_5_cnt
text_change_6_cnt
text_change_7_cnt
text_change_8_cnt
text_change_9_cnt
text_change_10_cnt
text_change_11_cnt
text_change_12_cnt
text_change_13_cnt
down_event_0_cnt
down_event_1_cnt
down_event_2_cnt
down_event_3_cnt
down_event_4_cnt
down_event_5_cnt
down_event_6_cnt
down_event_7_cnt
down_event_8_cnt
down_event_9_cnt
down_event_10_cnt
down_event_11_cnt
down_event_12_cnt
down_event_13_cnt
down_event_14_cnt
down_event_15_cnt
up_event_0_cnt
up_event_1_cnt
up_event_2_cnt
up_event_3_cnt
up_event_4_cnt
up_event_5_cnt
up_event_6_cnt
up_event_7_cnt
up_event_8_cnt
up_event_9_cnt
up_event_10_cnt
up_event_11_cnt
up_event_12_cnt
up_event_13_cnt
up_event_14_cnt
up_event_15_cnt
input_word_count
input_word_length_mean
input_word_length_max
input_word_length_std
input_word_length_median
input_word_length_skew
action_time_s

In [30]:
class ModelTrainer():
    def __init__(self, m_name, X, Y, **params):
        # Model
        self.model_name = m_name
        self.params = params
        self.create_model()
        
        self.X = X
        self.Y = Y
        print(f'Number of features: {len(self.X.columns)}')
        self.cls = False
        
    
    def make_pipeline(self, model):
        return Pipeline([
            ('remove_infs', FunctionTransformer(lambda x: np.nan_to_num(x, nan=np.nan, posinf=0, neginf=0))),
            ('imputer', SimpleImputer(strategy='mean')),
            ('normalizer', FunctionTransformer(lambda x: np.log1p(np.abs(x)))),
            ('scaler', RobustScaler()),
            ('model', model)
        ])
    
    # Create the model
    def create_model(self):
        
        match self.model_name:
            case "lgbm":
                self.model = LGBMRegressor(**self.params)
            case "lgbm_ess":
                self.model = LGBMRegressor(**self.params)
            case "xgb":
                self.model = XGBRegressor(**self.params)
            case "xgb_ess":
                self.model = XGBRegressor(**self.params)
            case "catboost":
                self.model = CatBoostRegressor(**self.params)
            case 'rfr':
                self.model = self.make_pipeline(RandomForestRegressor(**self.params))
            case 'rfr_ess':
                self.model = self.make_pipeline(RandomForestRegressor(**self.params))
            case "svr":
                self.model = self.make_pipeline(SVR(**self.params))
            case "svr_ess":
                self.model = self.make_pipeline(SVR(**self.params))
            case 'lasso':
                self.model = self.make_pipeline(Lasso(**self.params))
            case 'ridge':
                self.model = self.make_pipeline(Ridge(**self.params))
            case 'ridge_ess':
                self.model = self.make_pipeline(Ridge(**self.params))
            case "lgbmbin":
                self.model = LGBMClassifier(**self.params)
                self.cls = True
            case "lgbmout":
                self.model = LGBMClassifier(**self.params)
                self.cls = True
            case "lgbmout_ess":
                self.model = LGBMClassifier(**self.params)
                self.cls = True
            case "xgbbin":
                self.model = XGBClassifier(**self.params)
                self.cls = True
            case "xgbout":
                self.model = XGBClassifier(**self.params)
                self.cls = True
            case "xgbout_ess":
                self.model = XGBClassifier(**self.params)
                self.cls = True
            case "hgb":
                self.model = HistGradientBoostingRegressor(**self.params)
                self.cls = False
            case "hgb_ess":
                self.model = HistGradientBoostingRegressor(**self.params)
                self.cls = False
            case "hgbbin":
                self.model = HistGradientBoostingClassifier(**self.params)
                self.cls = True
            case "hgbout":
                self.model = HistGradientBoostingClassifier(**self.params)
                self.cls = True
            case "hgbout_ess":
                self.model = HistGradientBoostingClassifier(**self.params)
                self.cls = True
            case "ada":
                self.model = self.make_pipeline(AdaBoostRegressor(**self.params))
                self.cls = False
            case "ada_ess":
                self.model = self.make_pipeline(AdaBoostRegressor(**self.params))
                self.cls = False
            case "adabin":
                self.model = self.make_pipeline(AdaBoostClassifier(**self.params))
                self.cls = True
            case "adaout":
                self.model = self.make_pipeline(AdaBoostClassifier(**self.params))
                self.cls = True
            case "adaout_ess":
                self.model = self.make_pipeline(AdaBoostClassifier(**self.params))
                self.cls = True
            case "etc":
                self.model = self.make_pipeline(ExtraTreesRegressor(**self.params))
                self.cls = False
            case "etc_ess":
                self.model = self.make_pipeline(ExtraTreesRegressor(**self.params))
                self.cls = False
            case "etcbin":
                self.model = self.make_pipeline(ExtraTreesClassifier(**self.params))
                self.cls = True
            case "etcout":
                self.model = self.make_pipeline(ExtraTreesClassifier(**self.params))
                self.cls = True
            case "etcout_ess":
                self.model = self.make_pipeline(ExtraTreesClassifier(**self.params))
                self.cls = True

            case "bagbin":
                self.model = self.make_pipeline(BaggingClassifier(**self.params))
                self.cls = True
            case "bagout":
                self.model = self.make_pipeline(BaggingClassifier(**self.params))
                self.cls = True
            
            case "catboostbin":
                self.model = CatBoostClassifier(**self.params)
                self.cls = True
            case "catboostout":
                self.model = CatBoostClassifier(**self.params)
                self.cls = True
            case "rfrbin":
                self.model = self.make_pipeline(RandomForestClassifier(**self.params))
                self.cls = True
            case "rfrout":
                self.model = self.make_pipeline(RandomForestClassifier(**self.params))
                self.cls = True
            case "rfrout_ess":
                self.model = self.make_pipeline(RandomForestClassifier(**self.params))
                self.cls = True
            case "gnbbin":
                self.model = self.make_pipeline(GaussianNB(**self.params))
                self.cls = True
            case "gnbout":
                self.model = self.make_pipeline(GaussianNB(**self.params))
                self.cls = True
            case "gnbout_ess":
                self.model = self.make_pipeline(GaussianNB(**self.params))
                self.cls = True
            case "svcbin":
                self.model = self.make_pipeline(SVC(**self.params))
                self.cls = True
            case "svcout":
                self.model = self.make_pipeline(SVC(**self.params))
                self.cls = True
            case "svcout_ess":
                self.model = self.make_pipeline(SVC(**self.params))
                self.cls = True
            case "rdgbin":
                self.model = self.make_pipeline(RidgeClassifier(**self.params))
                self.cls = True
            case "rdgout":
                self.model = self.make_pipeline(RidgeClassifier(**self.params))
                self.cls = True
            case "rdgout_ess":
                self.model = self.make_pipeline(RidgeClassifier(**self.params))
                self.cls = True
            
                
            case other:
                print("Not implemented")
                sys.exit(-1)
    
    # Get the trained model        
    def get_model(self):
        return self.model
        
    # Train the model with 5-fold CV
    def train_model(self):        
        early_stopping_callback = lgb.early_stopping(200, first_metric_only=True, verbose=False)
        verbose_callback = lgb.log_evaluation(100)        
        # Split the training data into 5 fold
        skf = StratifiedKFold(n_splits=N_FOLD, random_state=SEED, shuffle=True)
        fold_rmses = []
        for fold, (train_index, valid_index) in enumerate(skf.split(self.X, self.Y.astype(str))):
            train_x = self.X.iloc[train_index]
            train_y = self.Y[train_index]
            valid_x = self.X.iloc[valid_index]
            valid_y = self.Y[valid_index]
            if self.model_name == 'lgbm':
                # Train the model with early stop of 100 
                self.model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
                          callbacks=[
                                lgb.callback.early_stopping(stopping_rounds=100),
                                lgb.callback.log_evaluation(period=100),
                          ])  
            else:
                # Fit the model with train x and train y
                self.model.fit(train_x, train_y)            
            predictions = self.model.predict(valid_x)
            if not self.cls:
                rmse = mean_squared_error(y_true=valid_y, y_pred=predictions, squared=False) # Return RMSE
            else:
                rmse = 1 - accuracy_score(valid_y, predictions)
            fold_rmses.append(rmse)
        avg_rmse = np.mean(fold_rmses)
        print(f"Average rmse: {avg_rmse}") 
        return avg_rmse
    
    # Evaluate the model with entire X data
    def evaluation(self):
        preds = self.predict(self.X)
        if not self.cls:
            rmse = mean_squared_error(y_true=self.Y, y_pred=preds, squared=False)
        else:
            rmse = 1 - accuracy_score(valid_y, predictions)
        return rmse

    def vote(self, preds):
        vt = []
        for ix in range(len(test_x)):
            sm = preds[ix, :].sum()
            if sm > N_FOLD / 2:
                vt.append(1)
            else:
                vt.append(0)
        return vt
        
    # Predict the test data. 
    def predict(self, test_x):
        # Prediction loop
        tests_y = np.zeros((len(test_x), N_FOLD))
        for fold in range(N_FOLD):
            preds = self.model.predict(test_x)
            tests_y[:, fold] = preds
            #print(f"Fold = {fold} Prediction = {preds[:5]}")
        if not self.cls:
            test_y = np.mean(tests_y, axis=1)
        else:
            test_y = self.vote(tests_y)
        return test_y# Average the prediction of each fold model
    
    # Clear the memory
    def clear_memory(self):
        del self.model
        libc.malloc_trim(0)
        torch.cuda.empty_cache()
        gc.collect()

In [31]:
params_dict = {}
params_dict["ridge"] = {'alpha': 0.07788883556218609, 'random_state': 42, 'solver': 'auto'}
params_dict["svr"] = {'kernel': 'rbf', 'C': 1.0, 'epsilon': 0.2668506195115871}
params_dict["hgb"] = {'learning_rate': 0.012153706444287133, 'max_iter': 605, 'max_leaf_nodes': 20, 'l2_regularization': 0.00047211777048909563}
params_dict["lgbm"] = {'n_estimators': 1024, 'learning_rate': 0.004174925348929063, 'metric': 'rmse', 'random_state': 42, 'force_col_wise': True, 'verbosity': -1, 'reg_alpha': 0.15639029398585033, 'reg_lambda': 2.0055427457193318, 'colsample_bytree': 0.7190650020820821, 'subsample': 0.8288906555039297, 'num_leaves': 16, 'min_child_samples': 60}
params_dict["xgb"] = {'max_depth': 3, 'learning_rate': 0.06532993368193832, 'objective': 'reg:squarederror', 'num_estimators': 1000, 'num_boost_round': 1000, 'eval_metric': 'rmse', 'seed': 42}

params_dict["ridge_ess"] = {'alpha': 82.31238966134191}
params_dict["svr_ess"] = {'epsilon': 0.2668506195115871}
params_dict["hgb_ess"]  = {'learning_rate': 0.012153706444287133, 'max_iter': 605, 'max_leaf_nodes': 20, 'l2_regularization': 0.00047211777048909563}
params_dict["lgbm_ess"] = {'n_estimators': 1024, 'learning_rate': 0.004174925348929063, 'metric': 'rmse', 'random_state': 42, 'force_col_wise': True, 'verbosity': -1, 'reg_alpha': 0.15639029398585033, 'reg_lambda': 2.0055427457193318, 'colsample_bytree': 0.7190650020820821, 'subsample': 0.8288906555039297, 'num_leaves': 16, 'min_child_samples': 60}
params_dict["xgb_ess"] = {'learning_rate': 0.06532993368193832, 'max_depth': 3}

params_dict["svcout"] = {'C': 2.211061088762589}
params_dict["lgbmout"] = {'learning_rate': 0.024625577176321033, 'reg_alpha': 5.8978130838530225, 'reg_lambda': 0.03427655135355339, 'colsample_bytree': 0.9063730206067478, 'subsample': 0.9759138794788889, 'num_leaves': 39, 'min_child_samples': 47}
params_dict["rfrout"] = {'n_estimators': 133, 'max_depth': 29, 'min_samples_split': 3, 'min_samples_leaf': 1}
params_dict["rdgout"] = {'alpha': 0.016457562460504893}
params_dict["xgbout"] = {'learning_rate': 0.048758238969011644, 'max_depth': 8}
params_dict["adaout"] = {'n_estimators': 280, 'learning_rate': 0.028933209509402234}
params_dict["hgbout"] = {'learning_rate': 0.1518453444798331, 'max_iter': 373, 'max_leaf_nodes': 458, 'l2_regularization': 496.5783250752291}
params_dict["etcout"] = {'bootstrap': True, 'n_estimators': 549, 'max_leaf_nodes': 371}

params_dict["svcout_ess"] = {'C': 1.0023380549077192}
params_dict["lgbmout_ess"] = {'learning_rate': 0.03295324868964636, 'reg_alpha': 0.006427232379685914, 'reg_lambda': 0.15765270234306666, 'colsample_bytree': 0.5091457731868475, 'subsample': 0.9924438213102255, 'num_leaves': 48, 'min_child_samples': 71}
params_dict["rfrout_ess"] = {'n_estimators': 1128, 'max_depth': 29, 'min_samples_split': 13, 'min_samples_leaf': 1}
params_dict["rdgout_ess"] = {'alpha': 2.0406222227198856}
params_dict["xgbout_ess"] = {'learning_rate': 0.04285879636610555, 'max_depth': 50}
params_dict["adaout_ess"] = {'n_estimators': 255, 'learning_rate': 0.014128177778636274}
params_dict["hgbout_ess"] = {'learning_rate': 0.05383757156885253, 'max_iter': 881, 'max_leaf_nodes': 246, 'l2_regularization': 0.0014656646249894979}
params_dict["etcout_ess"] = {'n_estimators': 284, 'max_leaf_nodes': 392}

In [32]:
best_score = 1.0
# Find the optimal learning rate
def objective(trial, m_name, X, Y):
    global params_dict
    # Parameters
    params = params_dict[m_name] # Load the default parameters
    # set the trial for tunable parameters
    if m_name == 'xgb' or m_name == 'xgbbin' or m_name == 'xgbout' or m_name == 'xgbout_ess' or m_name == 'xgb_ess':
        # Parameters for 'xgb' model
        params['learning_rate'] = trial.suggest_loguniform('learning_rate', 1e-4, 0.5)
        params['max_depth'] = trial.suggest_int('max_depth', 2, 64)
    elif m_name == 'catboost' or m_name == 'catboostbin' or m_name == 'catboostout':
        params['depth'] = trial.suggest_int('depth', 2, 16)
    elif m_name == 'svr' or m_name == 'svr_ess':
        params['epsilon'] = trial.suggest_float('epsilon', 0.01, 1)
    elif m_name == "svcbin" or m_name == "svcout" or m_name == "svcout_ess":
        params["C"] = trial.suggest_loguniform("C", 1e-6, 1e6)
    elif m_name == 'ridge' or m_name == 'rdgbin' or m_name == 'rdgout' or m_name == 'ridge_ess' or m_name == 'rdgout_ess':
        params['alpha'] = trial.suggest_loguniform('alpha', 1e-3, 100.0)
    elif m_name == 'lgbm' or m_name == 'lgbmbin' or m_name == 'lgbmout' or m_name == 'lgbmout_ess' or m_name == 'lgbm_ess':
        params['learning_rate'] = trial.suggest_loguniform('learning_rate', 1e-4, 0.5)
        params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-3, 10.0)
        params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-3, 10.0)
        params['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 1)
        params['subsample'] = trial.suggest_float('subsample', 0.5, 1)
        params['num_leaves'] = trial.suggest_int('num_leaves', 8, 64)
        params['min_child_samples'] = trial.suggest_int('min_child_samples', 1, 100)
    elif m_name == "gnbbin" or m_name == "gnbout" or m_name == "gnbout_ess":
        params["var_smoothing"] = trial.suggest_loguniform("var_smoothing", 1e-9, 1e0)
    elif m_name == "rfr" or m_name == "rfrbin" or m_name == "rfrout"  or m_name == "rfrout_ess"  or m_name == "rfr_ess":
        params["n_estimators"] = trial.suggest_int("n_estimators", 100, 1500)
        params["max_depth"] = trial.suggest_int("max_depth", 3, 30)
        params["min_samples_split"] = trial.suggest_int("min_samples_split", 2, 25)
        params["min_samples_leaf"] = trial.suggest_int("min_samples_leaf", 1, 10)

    elif m_name == "adabin" or m_name == "adaout" or m_name == "ada"  or m_name == "ada_ess"  or m_name == "adaout_ess":
        params["n_estimators"] = trial.suggest_int("n_estimators", 50, 500)
        params["learning_rate"] = trial.suggest_loguniform("learning_rate", 1e-6, 1e6)

    elif m_name == "hgbbin" or m_name == "hgbout" or m_name == "hgb"  or m_name == "hgb_ess"  or m_name == "hgbout_ess":
        params["learning_rate"] = trial.suggest_loguniform("learning_rate", 1e-4, 1)
        params["max_iter"] = trial.suggest_int("max_iter", 50, 1000)
        params["max_leaf_nodes"] = trial.suggest_int("max_leaf_nodes", 10, 500)
        params["l2_regularization"] = trial.suggest_loguniform("l2_regularization", 1e-6, 1e6)

    elif m_name == "etcbin" or m_name == "etcout" or m_name == "etc"  or m_name == "etc_ess"  or m_name == "etcout_ess":
        params["n_estimators"] = trial.suggest_int("n_estimators", 20, 1000)
        params["max_leaf_nodes"] = trial.suggest_int("max_leaf_nodes", 2, 500)

    elif m_name == "bagbin" or m_name == "bagout":
        params["n_estimators"] = trial.suggest_int("n_estimators", 20, 1000)
        
    # Experiment the parameters
    trainer = ModelTrainer(m_name, X, Y, **params)
    avg_score = trainer.train_model()
    # Save the model is the avg score > current best score
    global best_score
    if avg_score < best_score:
        best_score = avg_score
    # Clean up
    trainer.clear_memory()
    del trainer    
    print(f"Average result {avg_score} and the best score {best_score}")
    return avg_score

def run_optuna(m_name, X, Y):
    study_name = f"{m_name}_study"
    study_file_path = f"{study_name}.db"
    if os.path.exists(study_file_path):
        os.remove(study_file_path)
    # # Create a study to find the optimal hyper-parameters    
    study = optuna.create_study(direction="minimize", study_name=study_name,
                                storage="sqlite:///" + f"{study_file_path}", # Storage path of the database keeping the study results
                                load_if_exists=False)  # Resume the existing study
    # Set up the timeout to avoid runing out of quote
    # n_jobs =-1 is CPU bounded
    if m_name[:3] != "xgb":
        study.optimize(lambda trial: objective(trial, m_name, X, Y), 
                   n_jobs=4, n_trials=15,
                   show_progress_bar=True, gc_after_trial=True)
    else:
        study.optimize(lambda trial: objective(trial, m_name, X, Y), 
                   n_jobs=4, n_trials=10,
                   show_progress_bar=True, gc_after_trial=True)
    ## Print the best parameters    
    best_trial = study.best_trial
    best_params = study.best_params
    # Print out the experiment results
    print(f"Best parameters: {best_params}\n\n"
          f"Number of finished trials: {len(study.trials)}\n\n"
          f"Best trial:{best_trial}")    
    return study

In [33]:
def train_model(m_name, X, Y, is_loaded=True, aug=False):
    best_params = params_dict[m_name]
    # If is_loaded is True, load the best parameters.
    # Otherwise, initiate an Optuna study to optimize parameters.
    if is_loaded:  # Loaded the best parameters that are found from previous experiments
        #study_name = f"{model_name}_study"
        #study_file_path = f"{study_name}.db"
        #if os.path.isfile(study_file_path):
        #    loaded_study = optuna.load_study(study_name=study_name,
        #                                 storage="sqlite:///" + f"{study_file_path}")
        #    best_params.update(loaded_study.best_params)
        #    print(f"Best parameters: {best_params}\n\n")
        pass
    else:
        study = run_optuna(m_name, X, Y)
        best_params.update(study.best_params)
        # Print out the experiment results
        print(f"Best parameters: {best_params}\n\n")
    ## Parameters for LGBMRegressor model
    print(best_params)
    print(m_name)
    trainer = ModelTrainer(m_name, X, Y, **best_params)
    trainer.train_model()
    rmse = trainer.evaluation()
    model = trainer.get_model()
    print(f"Complete training {m_name} RMSE = {rmse}")
    return model

# Collect all the models
models_r = []
models_b = []
#model_names = ["ridge", "svr", "hgb", 'catboost', 'lgbm', 'xgb', 'rfr', 'etc'] # 5 models
model_names = ["ridge", "svr", 'hgb', 'lgbm', 'xgb']
model_names_ess = ["ridge_ess", "svr_ess", 'hgb_ess', 'lgbm_ess', 'xgb_ess']
#model_names = ["ridge_ess", "svr_ess", 'catboost', 'lgbm', 'xgb']
model_bin = ["svcbin", "rdgbin", "xgbbin", "adabin", "hgbbin", "etcbin",  "lgbmbin", "rfrbin"]
model_out = ["svcout", "lgbmout", "rfrout", "rdgout", "xgbout", "adaout", "hgbout", "etcout"]
model_out_ess = ["svcout_ess", "lgbmout_ess", "rfrout_ess", "rdgout_ess", "xgbout_ess", "adaout_ess", "hgbout_ess", "etcout_ess"]
out_proc = ["lgbmout",  "rfrout", "svcout", "rdgout", "xgbout", "gnbout"]
bin_proc = ["catboostbin"]
out_proc_r = model_names
# model_names = ['lasso', 'ridge', 'rfr', 'svr', 'catboost', 'lgbm', 'xgb'] # 7 models
preds_y = []
tests_y = []


In [34]:
def init_models(mdls, X, Y, is_loaded=True):
    models = []
    for m_name in mdls:
        print(m_name)
        if m_name == "catboost":
            model = train_model(m_name, X, Y,True)
        else:
            model = train_model(m_name, X, Y, is_loaded)
        models.append((m_name, model))
    return models

In [35]:
out_mdls = init_models(model_out, data_X, data_Yout, is_loaded=True)
#bin_mdls = init_models(model_bin, data_X, data_Ybin)

svcout
{'C': 2.211061088762589}
svcout
Number of features: 167
Average rmse: 0.42379025663552516
Complete training svcout RMSE = 0.3317783777562672
lgbmout
{'learning_rate': 0.024625577176321033, 'reg_alpha': 5.8978130838530225, 'reg_lambda': 0.03427655135355339, 'colsample_bytree': 0.9063730206067478, 'subsample': 0.9759138794788889, 'num_leaves': 39, 'min_child_samples': 47}
lgbmout
Number of features: 167
[LightGBM] [Info] Number of positive: 453, number of negative: 1523
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002084 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26749
[LightGBM] [Info] Number of data points in the train set: 1976, number of used features: 162
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.229251 -> initscore=-1.212545
[LightGBM] [Info] Start training from score -1.212545
[LightGBM] [Info] Number of positive: 454, number of negative: 1523
[LightGBM] [Info] Auto-choos

In [49]:
data_X_ess = data_X_ess
out_mdls_ess = init_models(model_out_ess, data_X_ess, data_Yout, is_loaded=True)
#bin_mdls_ess = init_models(model_bin, data_X_ess, data_Ybin)

svcout_ess
{'C': 1.0023380549077192}
svcout_ess
Number of features: 280
Average rmse: 0.4252860732113371
Complete training svcout_ess RMSE = 0.35647448588277747
lgbmout_ess
{'learning_rate': 0.03295324868964636, 'reg_alpha': 0.006427232379685914, 'reg_lambda': 0.15765270234306666, 'colsample_bytree': 0.5091457731868475, 'subsample': 0.9924438213102255, 'num_leaves': 48, 'min_child_samples': 71}
lgbmout_ess
Number of features: 280
[LightGBM] [Info] Number of positive: 453, number of negative: 1523
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55226
[LightGBM] [Info] Number of data points in the train set: 1976, number of used features: 274
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.229251 -> initscore=-1.212545
[LightGBM] [Info] Start training from score -1.212545
[LightGBM] [Info] Number of positive: 454, number of negative: 1523
[Light

In [38]:
def evaluate_models(models, X, Y, seed, cls=False):
    # split the full train data (data_X and data_Y) into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, Y,
                                                      test_size=0.2, random_state=seed)
    # fit and evaluate the models
    weights = list()
    for name, model in models:
        # fit the model
        model.fit(X_train, y_train)
        # evaluate the model
        y_preds = model.predict(X_val)
        # Calculate the 
        if not cls:
            rmse = mean_squared_error(y_true=y_val, y_pred=y_preds, squared=False)
        else:
            rmse = 1 - accuracy_score(y_val, y_preds)
        # store the performance
        weights.append(rmse)
    # report model performance
    print(f"Weight = {weights}")
    return (weights, X_train, X_val, y_train, y_val)

In [39]:
def voting(models, Xtr, target, cls, seed, Ytr = None, Xts=None, ):
    print(models)
    try:
        for md in models:
            print(md[0])
        
        if target is not None:
            weights, Xtr, Xts, Ytr, Yts = evaluate_models(models, Xtr, target, seed, cls=cls)
        else:
            weights = evaluate_models(models, Xtr, Ytr, seed, cls)[0]
        # Use the weights (scores) as a weighting for the ensemble
        if cls:
            ensemble = VotingClassifier(estimators=models, weights=weights)
        else:
            ensemble = VotingRegressor(estimators=models, weights=weights)
        ensemble.fit(Xtr, Ytr)
        test_y = ensemble.predict(Xts)
        if target is None:
            return test_y
        if not cls:
            sc = mean_squared_error(y_true=Yts, y_pred=test_y, squared=False)
        else:
            sc = accuracy_score(Yts, test_y)
        print(sc)
        return (models, sc, test_y, Yts)
    except Exception as e: 
        print(e)

In [40]:
def get_vote_preds(models, weights, X, Y, Xts):
        
    # Use the weights (scores) as a weighting for the ensemble
    ensemble = VotingClassifier(estimators=models, weights=weights)
    ensemble.fit(X, Y)
    test_y = ensemble.predict(Xts)
    return test_y

In [41]:
def get_out(model, X, Y, Xts):
    model[1].fit(X, Y)
    return model[1].predict(Xts)

In [42]:
def five_fold(models, weights, X, Y):
    outs = []
    ix = 0
    i_preds = []
    while ix < 5:
        print(ix)
        lwr = int(ix * 0.2 * len(X))
        upr = int((ix+1) * 0.2 * len(X))
        print(upr)
        if ix == 0:
            Xtr = X[upr:]
            Ytr = Y[upr:]

            Xts = X[:upr]
            Yts = Y[:upr]
        elif ix == 4:
            Xtr = X[:lwr]
            Ytr = Y[:lwr]

            Xts = X[lwr:]
            Yts = Y[lwr:]

        else:
            Xtr = pd.concat([X[:lwr], X[upr:]])
            Ytr = np.concatenate([Y[:lwr], Y[upr:]])

            Xts = X[lwr:upr]
            Yts = Y[lwr:upr]
        Ypred = get_vote_preds(models, weights, Xtr, Ytr, Xts)
        for m in range(len(models)):
            if ix == 0:
                i_preds.append(np.array(get_out(models[m], Xtr, Ytr, Xts)))
            else:
                i_preds[m] = np.append(i_preds[m], np.array(get_out(models[m], Xtr, Ytr, Xts)))
        sc = accuracy_score(Ypred, Yts)
        print(sc)
        outs.append(Ypred)
        ix += 1
    out_preds = []
    for val in range(len(models)):
        out_preds.append((models[val][0], i_preds[val]))
    return (np.concatenate(outs), out_preds)

In [43]:
def real_mode(models, weights,Xtr, Ytr, Xts):
    i_preds = []
    Ypred = get_vote_preds(models, weights, Xtr, Ytr, Xts)
    for m in range(len(models)):
        i_preds.append(np.array(get_out(models[m], Xtr, Ytr, Xts)))
    out_preds = []
    for val in range(len(models)):
        out_preds.append((models[val][0], i_preds[val]))
    return (Ypred, out_preds)

In [53]:
print(data_X_ess)

      activity_0_cnt  activity_1_cnt  activity_2_cnt  activity_3_cnt  \
0               2010             417             120               7   
1               1938             260             254               1   
2               3515             439             175               7   
3               1304             151              99               1   
4               1942             517              72               0   
...              ...             ...             ...             ...   
2466            3588             960             189               2   
2467            2395              60             148               1   
2468            2849              88             126               0   
2469            2895             276              71               0   
2470            2452             310             843              12   

      activity_4_cnt  text_change_0_cnt  text_change_1_cnt  text_change_2_cnt  \
0                  0               1940               

In [67]:
out_wt = [0.4311131937885755, 0.4168181542799854, 0.4357740124181224, 0.44494920831460977, 0.42876378559573314, 0.4119429204355498, 0.4494665749754947, 0.4168181542799854, 0.4334498677803327]
out1, outs1 = five_fold(out_mdls, out_wt[:-1], data_X, data_Yout)
out2, outs2 = five_fold(out_mdls_ess, out_wt[:-1], data_X_ess, data_Yout)
test_x_ess = test_x_ess[test_x_ess.columns.sort_values()]
data_X_ess = data_X_ess[data_X_ess.columns.sort_values()]

test_x = test_x[test_x.columns.sort_values()]
data_X = data_X[data_X.columns.sort_values()]
if not DEBUG:
    test_yout_ess, outs_y_ess = real_mode(out_mdls_ess, out_wt[:-1], data_X_ess, data_Yout, test_x_ess)
    test_yout, outs_y = real_mode(out_mdls, out_wt[:-1], data_X, data_Yout, test_x)

    out1 = np.append(out1, test_yout)
    out2 = np.append(out2, test_yout_ess)

    outs1 = outs1 + outs_y
    outs2 = outs2 + outs_y_ess

0
494
[LightGBM] [Info] Number of positive: 457, number of negative: 1520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26710
[LightGBM] [Info] Number of data points in the train set: 1977, number of used features: 162
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.231158 -> initscore=-1.201782
[LightGBM] [Info] Start training from score -1.201782
[LightGBM] [Info] Number of positive: 457, number of negative: 1520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002567 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26710
[LightGBM] [Info] Number of data points in the train set: 1977, number of used features: 162
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.231158 -> initscore=-1.201782
[LightGBM] [Info] Start training from score -1.201782
0.84412955

In [66]:
data_X_ess.columns.sort_values()

Index(['10s_backspace_ent', '10s_backspace_kurt', '10s_backspace_mean',
       '10s_backspace_skew', '10s_backspace_std', '10s_ent',
       '10s_input_rat_ent', '10s_input_rat_kurt', '10s_input_rat_mean',
       '10s_input_rat_skew',
       ...
       'word_len_first', 'word_len_last', 'word_len_max', 'word_len_mean',
       'word_len_median', 'word_len_min', 'word_len_q1', 'word_len_q3',
       'word_len_sum', 'word_sentence_rat'],
      dtype='object', length=280)

In [68]:
def majority(votes):
    majority = []
    num = len(votes)
    for ix in range(len(votes[0][1])):
        vt = 0
        for vote in votes:
            vt += vote[1][ix]
        if vt > num / 2:
            majority.append(1)
        else:
            majority.append(0)
    return majority

In [69]:
out3 = outs1 + outs2

In [None]:
def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

In [None]:
pset = powerset(out3)

In [None]:
#opts = []
#for p in pset:
#    if len(p) > 1 and len(p) % 2 == 1:
#        opts.append(p)

In [None]:
#ranks = []
#for opt in opts:
#    s = (1-accuracy_score(majority(opt), data_Yout), opt)
#    ranks.append(s)
#    print(s[0])

In [70]:
def get_probs(preds):
    probs = []
    num = len(preds)
    for ix in range(len(preds[0])):
        prob = 0
        for pred in preds:
            prob += pred[ix]
        probs.append(prob / num)
    return probs

In [71]:
def get_preds(model):
    
    preds = []
    algos = []
    for item in model:
        print(item[1])
        preds.append(item[1])
        algos.append(item[0])
    return (preds, algos)

In [72]:
preds, algos = get_preds(out3)

[0 0 0 ... 0 0 0]
[0 0 0 ... 1 0 0]
[0 0 0 ... 1 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 1 0 0]
[0 0 0 ... 1 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 1 0 0]
[1 1 1]
[1 1 1]
[1 1 1]
[1 1 1]
[1 1 1]
[1 1 1]
[0 1 1]
[1 1 1]
[0 0 0 ... 0 0 0]
[0 0 0 ... 1 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 1 0 0]
[0 0 0 ... 1 0 0]
[0 0 0 ... 1 0 0]
[0 0 0]
[1 1 1]
[1 1 1]
[0 1 1]
[0 1 1]
[0 1 1]
[1 1 1]
[1 1 1]


In [76]:
out_tr = []
out_ts = []
for val in preds:
    if len(val) == len(data_X):
        out_tr.append(val)
    else:
        out_ts.append(val)

In [77]:
print(out_tr)

[array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 1, 0, 0]), array([0, 0, 0, ..., 1, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 1, 0, 0]), array([0, 0, 0, ..., 1, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 1, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 1, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 0, 0, 0]), array([0, 0, 0, ..., 1, 0, 0]), array([0, 0, 0, ..., 1, 0, 0]), array([0, 0, 0, ..., 1, 0, 0])]


In [78]:
prb_tr = get_probs(out_tr)
prb_ts = get_probs(out_ts)
#prb_tr = prb[:len(data_X)]
#prb_ts = prb[len(test_x):]

In [80]:
print(prb_ts)

[0.6875, 0.9375, 0.9375]


In [81]:
def jitter(lst, perc):
    ct = int(perc * 100)
    out = []
    for ix in range(len(lst)):
        rnd = randint(0, 100)
        if rnd > ct:
            out.append((lst[ix] + 1) % 2)
        else:
            out.append(lst[ix])
    return np.array(out)

In [82]:
j1 = jitter(out, 0)
j2 = jitter(out, 0)
j3 = jitter(out, 0)
if not DEBUG:
    j1_ts = jitter([0] * len(test_x), 0)
    j2_ts = jitter([0] * len(test_x), 0)
    j3_ts = jitter([0] * len(test_x), 0)

In [83]:
aug_tr = pd.DataFrame({"j1" : j1, "j2" : j2, "j3" : j3, "prb" : prb_tr})
if not DEBUG:
    aug_ts = pd.DataFrame({"j1" : j1_ts, "j2" : j2_ts, "j3" : j3_ts, "prb" : prb_ts})

In [85]:
data_X_aug = pd.concat([data_X, aug_tr], axis=1)
data_X_ess_aug = pd.concat([data_X_ess, aug_tr], axis=1)

if not DEBUG:
    test_X_aug = pd.concat([test_x, aug_ts], axis=1)
    test_X_ess_aug = pd.concat([test_x_ess, aug_ts], axis=1)

In [86]:
mdls = init_models(model_names, data_X_aug, data_Y, is_loaded=True)

ridge
{'alpha': 0.07788883556218609, 'random_state': 42, 'solver': 'auto'}
ridge
Number of features: 171
Average rmse: 0.6161875690828491
Complete training ridge RMSE = 0.5822724111302379
svr
{'kernel': 'rbf', 'C': 1.0, 'epsilon': 0.2668506195115871}
svr
Number of features: 171
Average rmse: 0.6022858172568097
Complete training svr RMSE = 0.48970625527736966
hgb
{'learning_rate': 0.012153706444287133, 'max_iter': 605, 'max_leaf_nodes': 20, 'l2_regularization': 0.00047211777048909563}
hgb
Number of features: 171
Average rmse: 0.5069572743609967
Complete training hgb RMSE = 0.332996058721254
lgbm
{'n_estimators': 1024, 'learning_rate': 0.004174925348929063, 'metric': 'rmse', 'random_state': 42, 'force_col_wise': True, 'verbosity': -1, 'reg_alpha': 0.15639029398585033, 'reg_lambda': 2.0055427457193318, 'colsample_bytree': 0.7190650020820821, 'subsample': 0.8288906555039297, 'num_leaves': 16, 'min_child_samples': 60}
lgbm
Number of features: 171
Training until validation scores don't impro

In [89]:
print(data_Y)

[3.5 3.5 6.  ... 1.5 5.  4. ]


In [91]:
if DEBUG:
    x = voting(mdls[2:], data_X_aug, data_Y, False, 69)
else:
    x = voting(mdls, data_X_aug, None, False, 91, Ytr = data_Y, Xts = test_X_aug)

[('ridge', Pipeline(steps=[('remove_infs',
                 FunctionTransformer(func=<function ModelTrainer.make_pipeline.<locals>.<lambda> at 0x7f840e88cd60>)),
                ('imputer', SimpleImputer()),
                ('normalizer',
                 FunctionTransformer(func=<function ModelTrainer.make_pipeline.<locals>.<lambda> at 0x7f840e9205e0>)),
                ('scaler', RobustScaler()),
                ('model', Ridge(alpha=0.07788883556218609, random_state=42))])), ('svr', Pipeline(steps=[('remove_infs',
                 FunctionTransformer(func=<function ModelTrainer.make_pipeline.<locals>.<lambda> at 0x7f840e88d4e0>)),
                ('imputer', SimpleImputer()),
                ('normalizer',
                 FunctionTransformer(func=<function ModelTrainer.make_pipeline.<locals>.<lambda> at 0x7f840e88d580>)),
                ('scaler', RobustScaler()),
                ('model', SVR(epsilon=0.2668506195115871))])), ('hgb', HistGradientBoostingRegressor(l2_regularization

In [93]:
submission = pd.DataFrame({'id': test_ids, 'score': x[2]})
submission.to_csv('submission.csv', index=False)
display(submission)

Unnamed: 0,id,score
0,0000aaaa,2.238019
1,2222bbbb,2.238019
2,4444cccc,2.238019


In [92]:
print(x)

[2.72851994 2.28538829 2.23801879]
