## Refer https://www.kaggle.com/code/mcpenguin/writing-processes-to-quality-baseline

In [19]:
#!pip install lightgbm --config-settings=cmake.define.USE_GPU=ON

In [57]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
import re
from itertools import combinations
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from warnings import simplefilter
import joblib
import os
from scipy.stats import skew, kurtosis
from tqdm.autonotebook import tqdm
from collections import Counter
warnings.filterwarnings('ignore')
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
N_Folds = 5

In [4]:
train_file = "/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv"
train_scores_file = "/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv"
test_file = "/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv"

In [5]:
ds_train = pd.read_csv(train_file)
ds_train.head(5)

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


In [6]:
ds_test = pd.read_csv(test_file)
ds_test.head(5)

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0
2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1
3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1
4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0


In [7]:
ds_train_scores = pd.read_csv(train_scores_file)
ds_train_scores.head(5)

Unnamed: 0,id,score
0,001519c8,3.5
1,0022f953,3.5
2,0042269b,6.0
3,0059420b,2.0
4,0075873a,4.0


In [8]:
def inspect_columns(df):
    # A helper function that does a better job than df.info() and df.describe()
    result = pd.DataFrame({
        'unique': df.nunique() == len(df),
        'cardinality': df.nunique(),
        'with_null': df.isna().any(),
        'null_pct': round((df.isnull().sum() / len(df)) * 100, 2),
        '1st_row': df.iloc[0],
        'random_row': df.iloc[np.random.randint(low=0, high=len(df))],
        'last_row': df.iloc[-1],
        'dtype': df.dtypes
    })
    return result

In [9]:
inspect_columns(ds_train)

Unnamed: 0,unique,cardinality,with_null,null_pct,1st_row,random_row,last_row,dtype
id,False,2471,False,0.0,001519c8,361ba892,fff05981,object
event_id,False,12876,False,0.0,1,1624,3619,int64
down_time,False,1836078,False,0.0,4526,1485423,2070065,int64
up_time,False,1835993,False,0.0,4557,1485491,2070133,int64
action_time,False,3509,False,0.0,31,68,68,int64
activity,False,50,False,0.0,Nonproduction,Input,Input,object
down_event,False,131,False,0.0,Leftclick,q,.,object
up_event,False,130,False,0.0,Leftclick,q,.,object
text_change,False,4111,False,0.0,NoChange,q,.,object
cursor_position,False,7803,False,0.0,0,1172,1029,int64


In [10]:
inspect_columns(ds_test)

Unnamed: 0,unique,cardinality,with_null,null_pct,1st_row,random_row,last_row,dtype
id,False,3,False,0.0,0000aaaa,4444cccc,4444cccc,object
event_id,False,2,False,0.0,1,2,2,int64
down_time,True,6,False,0.0,338433,184996,184996,int64
up_time,True,6,False,0.0,338518,185052,185052,int64
action_time,True,6,False,0.0,85,56,56,int64
activity,False,1,False,0.0,Input,Input,Input,object
down_event,False,2,False,0.0,Space,q,q,object
up_event,False,2,False,0.0,Space,q,q,object
text_change,False,2,False,0.0,,q,q,object
cursor_position,False,2,False,0.0,0,1,1,int64


In [11]:
inspect_columns(ds_train_scores)

Unnamed: 0,unique,cardinality,with_null,null_pct,1st_row,random_row,last_row,dtype
id,True,2471,False,0.0,001519c8,a294b044,fff05981,object
score,False,12,False,0.0,3.5,3.0,4.0,float64


In [12]:
class Preprocessor:
    
    def __init__(self, seed):
        self.seed = seed
        
        self.activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
        self.events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 
              'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
        self.text_changes = ['q', ' ', 'NoChange', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']
        self.punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/',
                        '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']
        self.gaps = [1, 2, 3, 5, 10, 20, 50, 100]
#         self.gaps = [1, 2]
    
    def activity_counts(self, df):
        tmp_df = df.groupby('id').agg({'activity': list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df['activity'].values):
            items = list(Counter(li).items())
            di = dict()
            for k in self.activities:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'activity_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols
        return ret


    def event_counts(self, df, colname):
        tmp_df = df.groupby('id').agg({colname: list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df[colname].values):
            items = list(Counter(li).items())
            di = dict()
            for k in self.events:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'{colname}_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols
        return ret


    def text_change_counts(self, df):
        tmp_df = df.groupby('id').agg({'text_change': list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df['text_change'].values):
            items = list(Counter(li).items())
            di = dict()
            for k in self.text_changes:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'text_change_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols
        return ret

    def match_punctuations(self, df):
        tmp_df = df.groupby('id').agg({'down_event': list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df['down_event'].values):
            cnt = 0
            items = list(Counter(li).items())
            for item in items:
                k, v = item[0], item[1]
                if k in self.punctuations:
                    cnt += v
            ret.append(cnt)
        ret = pd.DataFrame({'punct_cnt': ret})
        return ret


    def get_input_words(self, df):
        tmp_df = df[(~df['text_change'].str.contains('=>'))&(df['text_change'] != 'NoChange')].reset_index(drop=True)
        tmp_df = tmp_df.groupby('id').agg({'text_change': list}).reset_index()
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: ''.join(x))
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: re.findall(r'q+', x))
        tmp_df['input_word_count'] = tmp_df['text_change'].apply(len)
        tmp_df['input_word_length_mean'] = tmp_df['text_change'].apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df['input_word_length_max'] = tmp_df['text_change'].apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df['input_word_length_std'] = tmp_df['text_change'].apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df.drop(['text_change'], axis=1, inplace=True)
        return tmp_df
    
    def make_feats(self, df):
        
        print("Starting to engineer features")
        
        # initialize features dataframe
        feats = pd.DataFrame({'id': df['id'].unique().tolist()})
        
        # get shifted features
        # time shift
        print("Engineering time data")
        for gap in self.gaps:
            print(f"> for gap {gap}")
            df[f'up_time_shift{gap}'] = df.groupby('id')['up_time'].shift(gap)
            df[f'action_time_gap{gap}'] = df['down_time'] - df[f'up_time_shift{gap}']
        df.drop(columns=[f'up_time_shift{gap}' for gap in self.gaps], inplace=True)

        # cursor position shift
        print("Engineering cursor position data")
        for gap in self.gaps:
            print(f"> for gap {gap}")
            df[f'cursor_position_shift{gap}'] = df.groupby('id')['cursor_position'].shift(gap)
            df[f'cursor_position_change{gap}'] = df['cursor_position'] - df[f'cursor_position_shift{gap}']
            df[f'cursor_position_abs_change{gap}'] = np.abs(df[f'cursor_position_change{gap}'])
        df.drop(columns=[f'cursor_position_shift{gap}' for gap in self.gaps], inplace=True)

        # word count shift
        print("Engineering word count data")
        for gap in self.gaps:
            print(f"> for gap {gap}")
            df[f'word_count_shift{gap}'] = df.groupby('id')['word_count'].shift(gap)
            df[f'word_count_change{gap}'] = df['word_count'] - df[f'word_count_shift{gap}']
            df[f'word_count_abs_change{gap}'] = np.abs(df[f'word_count_change{gap}'])
        df.drop(columns=[f'word_count_shift{gap}' for gap in self.gaps], inplace=True)
        
        # get aggregate statistical features
        print("Engineering statistical summaries for features")
        # [(feature name, [ stat summaries to add ])]
        feats_stat = [
            ('event_id', ['max']),
            ('up_time', ['max']),
            ('action_time', ['sum', 'max', 'mean', 'std']),
            ('activity', ['nunique']),
            ('down_event', ['nunique']),
            ('up_event', ['nunique']),
            ('text_change', ['nunique']),
            ('cursor_position', ['nunique', 'max', 'mean']),
            ('word_count', ['nunique', 'max', 'mean'])]
        for gap in self.gaps:
            feats_stat.extend([
                (f'action_time_gap{gap}', ['max', 'min', 'mean', 'std', 'sum', skew, kurtosis]),
                (f'cursor_position_change{gap}', ['max', 'mean', 'std', 'sum', skew, kurtosis]),
                (f'word_count_change{gap}', ['max', 'mean', 'std', 'sum', skew, kurtosis])
            ])
        
        pbar = tqdm(feats_stat)
        for item in pbar:
            colname, methods = item[0], item[1]
            for method in methods:
                pbar.set_postfix()
                if isinstance(method, str):
                    method_name = method
                else:
                    method_name = method.__name__
                    
                pbar.set_postfix(column=colname, method=method_name)
                tmp_df = df.groupby(['id']).agg({colname: method}).reset_index().rename(columns={colname: f'{colname}_{method_name}'})
                feats = feats.merge(tmp_df, on='id', how='left')

        # counts
        print("Engineering activity counts data")
        tmp_df = self.activity_counts(df)
        feats = pd.concat([feats, tmp_df], axis=1)
        
        print("Engineering event counts data")
        tmp_df = self.event_counts(df, 'down_event')
        feats = pd.concat([feats, tmp_df], axis=1)
        tmp_df = self.event_counts(df, 'up_event')
        feats = pd.concat([feats, tmp_df], axis=1)
        
        print("Engineering text change counts data")
        tmp_df = self.text_change_counts(df)
        feats = pd.concat([feats, tmp_df], axis=1)
        
        print("Engineering punctuation counts data")
        tmp_df = self.match_punctuations(df)
        feats = pd.concat([feats, tmp_df], axis=1)

        # input words
        print("Engineering input words data")
        tmp_df = self.get_input_words(df)
        feats = pd.merge(feats, tmp_df, on='id', how='left')

        # compare feats
        print("Engineering ratios data")
        feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
        feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
        feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
        feats['idle_time_ratio'] = feats['action_time_gap1_sum'] / feats['up_time_max']
        
        print("Done!")
        return feats

In [59]:
preprocessor = Preprocessor(19750316)
print("Engineering features for training data")
train_ds = preprocessor.make_feats(ds_train)
print()
print("-"*25)
print("Engineering features for test data")
print("-"*25)
X_test = preprocessor.make_feats(ds_test)
X_test = X_test.drop(columns=["id"])
train_ds= train_ds.merge(ds_train_scores, on='id', how='left')
X = train_ds.drop(columns=["id","score"])
Y = train_ds.score
assert (X.columns == X_test.columns).all()

Engineering features for training data
Starting to engineer features
Engineering time data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering cursor position data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering word count data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering statistical summaries for features


  0%|          | 0/33 [00:00<?, ?it/s]

Engineering activity counts data


  0%|          | 0/2471 [00:00<?, ?it/s]

Engineering event counts data


  0%|          | 0/2471 [00:00<?, ?it/s]

  0%|          | 0/2471 [00:00<?, ?it/s]

Engineering text change counts data


  0%|          | 0/2471 [00:00<?, ?it/s]

Engineering punctuation counts data


  0%|          | 0/2471 [00:00<?, ?it/s]

Engineering input words data
Engineering ratios data
Done!

-------------------------
Engineering features for test data
-------------------------
Starting to engineer features
Engineering time data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering cursor position data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering word count data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering statistical summaries for features


  0%|          | 0/33 [00:00<?, ?it/s]

Engineering activity counts data


  0%|          | 0/3 [00:00<?, ?it/s]

Engineering event counts data


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Engineering text change counts data


  0%|          | 0/3 [00:00<?, ?it/s]

Engineering punctuation counts data


  0%|          | 0/3 [00:00<?, ?it/s]

Engineering input words data
Engineering ratios data
Done!


#X = features(ds_train)
#X_test = features(ds_test)
#Y = ds_train_scores.score
#assert (X.columns == X_test.columns).all()

In [60]:
X

Unnamed: 0,event_id_max,up_time_max,action_time_sum,action_time_max,action_time_mean,action_time_std,activity_nunique,down_event_nunique,up_event_nunique,text_change_nunique,...,text_change_14_count,punct_cnt,input_word_count,input_word_length_mean,input_word_length_max,input_word_length_std,word_time_ratio,word_event_ratio,event_time_ratio,idle_time_ratio
0,2557,1801969,297243,2259,116.246774,91.797374,7,12,12,17,...,0,37,366,5.325137,20,3.487804,0.000142,0.100117,0.001419,0.832534
1,2454,1788969,275391,1758,112.221271,55.431189,5,17,17,12,...,0,53,385,4.410390,33,3.199496,0.000181,0.131622,0.001372,0.828944
2,4136,1771669,421201,3005,101.837766,82.383766,4,13,18,19,...,0,47,627,5.446571,25,3.474895,0.000228,0.097679,0.002335,0.759751
3,1556,1404469,189596,806,121.848329,113.768226,5,15,15,10,...,0,18,251,4.609562,19,2.949601,0.000147,0.132391,0.001108,0.835531
4,2531,1662472,313702,701,123.943896,62.082013,3,11,11,9,...,0,66,412,4.766990,18,2.986064,0.000152,0.099565,0.001522,0.764103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,4739,1791649,499670,3323,105.437856,63.622575,4,13,13,13,...,2,88,734,4.915531,20,3.001989,0.000257,0.097278,0.002645,0.708572
2467,2604,1799174,214221,1144,82.266129,36.178818,4,11,11,8,...,0,63,470,4.085106,13,2.231589,0.000243,0.168203,0.001447,0.868855
2468,3063,1959363,231580,564,75.605615,63.494975,3,11,11,6,...,0,7,222,4.644144,15,2.707087,0.000103,0.065622,0.001563,0.869824
2469,3242,1508504,289439,1388,89.277915,54.515788,3,15,15,13,...,0,70,500,5.294000,24,3.541689,0.000274,0.127390,0.002149,0.794947


In [61]:
params = {
'learning_rate': 0.009,#0.009
'max_depth': 13,#7
'n_estimators': 700,
'num_leaves': 500,#440,
'min_child_samples': 66,
'objective': 'mae',
'random_state': 43,
'reg_alpha': 0.01,
'reg_lambda': 0.01,
################################################    
# "device": "gpu",
# "gpu_platform_id": 0,
# "gpu_device_id": 0     
}

In [62]:
kf = KFold(n_splits=N_Folds, shuffle=True, random_state=100)
mae_scores = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, Y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = Y[train_idx], Y[valid_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    m = lgb.train(params, train_data, valid_sets=[train_data, valid_data],verbose_eval=50, early_stopping_rounds=50)
    print(f"Fold {fold+1} Trainning finished.")

    model_filename = f"/kaggle/working/model_fold_{fold+1}.pkl"
    joblib.dump(m, model_filename)
    y_pred_valid = m.predict(X_valid)

    y_pred_valid = np.nan_to_num(y_pred_valid)
    y_valid = np.nan_to_num(y_valid)
    mae = mean_absolute_error(y_valid, y_pred_valid)
    print("############mae##############:",mae)
    mae_scores.append(mae)

# 计算4折平均的MAE
average_mae = np.mean(mae_scores)
print(f"{N_Folds} fold MAE: {average_mae}")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 35340
[LightGBM] [Info] Number of data points in the train set: 1976, number of used features: 179
[LightGBM] [Info] Start training from score 4.000000
Training until validation scores don't improve for 50 rounds
[50]	training's l1: 0.647944	valid_1's l1: 0.689988
[100]	training's l1: 0.557422	valid_1's l1: 0.601054
[150]	training's l1: 0.502254	valid_1's l1: 0.55372
[200]	training's l1: 0.467544	valid_1's l1: 0.528609
[250]	training's l1: 0.442605	valid_1's l1: 0.513554
[300]	training's l1: 0.423794	valid_1's l1: 0.502677
[350]	training's l1: 0.409832	valid_1's l1: 0.49838
[400]	training's l1: 0.398226	valid_1's l1: 0.495148
[450]	training's l1: 0.388514	valid_1's l1: 0.493127
[500]	training's l1: 0.380038	valid_1's l1: 0.490746
[550]	training's l1: 0.372557	valid_1's l1: 0.489427
[600]	training's l1: 0.366045	valid_1's l1: 0.488356
[650]	training's l1: 0.360162	valid_1's l1: 0.487336
[700]	training

In [63]:
fold_prediction = 0
for fold in range(0, N_Folds):
    model_filename = f"/kaggle/working/model_fold_{fold+1}.pkl"
    m = joblib.load(model_filename)
    fold_prediction += m.predict(X_test, predict_disable_shape_check=True)   

fold_prediction /= N_Folds
submission = ds_test.groupby("id")["up_time"].max().reset_index()[["id"]]
submission["score"] = fold_prediction
submission.to_csv("submission.csv",index=False)

In [64]:
pd.read_csv("/kaggle/working/submission.csv")

Unnamed: 0,id,score
0,0000aaaa,1.970663
1,2222bbbb,1.935695
2,4444cccc,1.937107
