<a href="https://www.kaggle.com/code/kaylaippp/copy-of-0-63-submission-writing-quality-kaylaipp?scriptVersionId=191978384" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<font color=blue><h1><b>Imports</b></h1></font>

In [1]:
import math
import re
import time
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from scipy.stats import entropy
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder,power_transform
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from tqdm import tqdm
from xgboost import XGBRegressor

In [2]:
# Folder holding the competition CSV files (a Kaggle input mount, judging by the path).
INPUT_DIR = '/kaggle/input/linking-writing-processes-to-writing-quality'
# Per-event logs for the training set; the feature functions below group these rows by `id`.
trainLogs_df = pd.read_csv(f'{INPUT_DIR}/train_logs.csv')
# Score table for the training set; merged onto the engineered features by `id` further down.
trainScores_df = pd.read_csv(f'{INPUT_DIR}/train_scores.csv')
# Logs read from test_logs.csv; these are the rows the final model predicts for the submission.
validationLogs_df = pd.read_csv(f'{INPUT_DIR}/test_logs.csv')

<font color=blue><h1><b>Feature Engineering</b></h1></font>

In [3]:
# Helper functions for the verbosity features. Each one receives the sequence of
# keystroke counts per time window for a single id.
def calculate_entropy(x):
    """Return the Shannon entropy (natural log) of the counts in ``x``.

    The counts are normalised to proportions first. An empty or all-zero
    input has no defined distribution and yields NaN.
    """
    x = np.asarray(x, dtype=float)
    total = x.sum()
    if x.size == 0 or total == 0:
        return np.nan
    return entropy(x / total)


def calculate_slope(x):
    """Return the least-squares slope of ``x`` against its position index 0..n-1.

    A slope needs at least two points; shorter inputs yield NaN instead of
    being handed to ``np.polyfit``.
    """
    x = np.asarray(x, dtype=float)
    if x.size < 2:
        return np.nan
    return np.polyfit(np.arange(x.size), x, 1)[0]


def calculate_uniformity(x):
    """Return the relative entropy of ``x`` with respect to a uniform distribution.

    ``scipy.stats.entropy(pk, qk)`` normalises both arguments, so 0 means the
    counts are spread perfectly evenly and larger values mean less uniform.
    An empty or all-zero input yields NaN.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0 or x.sum() == 0:
        return np.nan
    uniform_distribution = np.full(x.size, 1.0 / x.size)
    return entropy(x, uniform_distribution)


def _summarise_window_counts(counts):
    """Return the window-based statistics for one id, in output-column order."""
    counts = np.asarray(counts)

    # A local extreme is a change of direction in the count series; flat steps
    # are skipped so a plateau between a rise and a fall counts exactly once.
    steps = np.sign(np.diff(counts))
    steps = steps[steps != 0]
    local_extremes = int(np.count_nonzero(steps[1:] != steps[:-1]))

    # Distance, measured in windows, between consecutive windows that hold
    # more than one keystroke. pandas returns NaN (no warning) when there are
    # too few gaps for a mean / sample standard deviation.
    busy_gaps = pd.Series(np.diff(np.flatnonzero(counts > 1)), dtype=float)

    return (
        pd.Series(counts, dtype=float).std(),
        calculate_slope(counts),
        calculate_entropy(counts),
        calculate_uniformity(counts),
        local_extremes,
        busy_gaps.mean(),
        busy_gaps.std(),
    )


# https://link.springer.com/article/10.1007/s40593-021-00268-w
def create_verbosity_features(df: pd.DataFrame, window_size: float = 30, ticks_per_second: float = 1000):
    """Add per-id verbosity features, broadcast onto every row of a copy of ``df``.

    The window-based features are computed from the number of logged rows that
    fall into consecutive ``window_size``-second windows, measured from each
    id's earliest ``down_time``. Windows without any row count as 0.

    Parameters
    ----------
    df : pd.DataFrame
        Event log with at least the columns ``id``, ``down_time`` and
        ``word_count``. It is not modified.
    window_size : float, default 30
        Window length in seconds.
    ticks_per_second : float, default 1000
        Number of ``down_time`` units per second. The default assumes
        ``down_time`` is in milliseconds -- TODO confirm against the dataset
        description.

    Returns
    -------
    pd.DataFrame
        A deep copy of ``df`` with these columns appended (constant within an id):
        ``total_keystrokes``, ``total_words``, ``sd_keystrokes_per_30s``,
        ``slope_keystrokes_per_30s``, ``entropy_keystrokes_per_30s``,
        ``uniformity_keystrokes_per_30s``, ``local_extreme_count``,
        ``mean_distance_between_windows``, ``sd_distance_between_windows``.
        Statistics that are undefined for an id (for example the SD or slope
        of a single window) are NaN. The ``_30s`` suffix is kept in the names
        even when ``window_size`` is changed.

    Raises
    ------
    ValueError
        If ``window_size`` or ``ticks_per_second`` is not positive.
    """
    if window_size <= 0 or ticks_per_second <= 0:
        raise ValueError("window_size and ticks_per_second must be positive")

    new_df = df.copy(deep=True)
    ids = new_df['id']
    down_time = new_df['down_time']

    # Total number of keystrokes: every logged row counts as one keystroke.
    # NOTE(review): the log may contain non-keyboard events too -- confirm
    # whether those should be filtered out before counting.
    new_df['total_keystrokes'] = ids.map(ids.value_counts())

    # Total number of words: word_count is read as a running total, so the last
    # logged value is used (the same convention as `word_count_last` in
    # create_features).
    new_df['total_words'] = new_df.groupby('id')['word_count'].transform('last')

    # Index of the window each row falls into, relative to the id's first event.
    window_ticks = window_size * ticks_per_second
    elapsed = down_time - down_time.groupby(ids).transform('min')
    window_index = (elapsed // window_ticks).astype('int64')

    window_columns = [
        'sd_keystrokes_per_30s',
        'slope_keystrokes_per_30s',
        'entropy_keystrokes_per_30s',
        'uniformity_keystrokes_per_30s',
        'local_extreme_count',
        'mean_distance_between_windows',
        'sd_distance_between_windows',
    ]

    # np.bincount turns the window indices of one id into keystrokes-per-window,
    # including the empty windows in between.
    per_id = {
        key: _summarise_window_counts(np.bincount(indices.to_numpy()))
        for key, indices in window_index.groupby(ids, sort=False)
    }
    summary = pd.DataFrame(list(per_id.values()), index=list(per_id.keys()), columns=window_columns)

    # Broadcast by id value so every row receives its own id's statistics.
    for column in window_columns:
        new_df[column] = ids.map(summary[column])

    # NOTE(review): create_features aggregates only the columns named in its
    # `agg` dict, so these columns do not reach the model matrix unless they
    # are added there.
    return new_df


# Basic data preparation
def _input_word_length_stats(logs: pd.DataFrame) -> pd.DataFrame:
    """Per-id mean / max / std length of the runs of ``q`` typed outside 'Remove/Cut' rows.

    The ``text_change`` strings of every non-'Remove/Cut' row are concatenated
    per id and split into runs matching ``q+``. An id without any run gets 0
    for all three statistics. The result is indexed by ``id``.
    """
    kept = logs[logs['activity'] != 'Remove/Cut']
    typed_text = kept.groupby('id')['text_change'].agg(list).apply(''.join)
    run_lengths = typed_text.apply(lambda text: [len(run) for run in re.findall(r'q+', text)])
    return pd.DataFrame({
        'input_word_length_mean': run_lengths.apply(lambda v: np.mean(v) if v else 0),
        'input_word_length_max': run_lengths.apply(lambda v: np.max(v) if v else 0),
        'input_word_length_std': run_lengths.apply(lambda v: np.std(v) if v else 0),
    })


def create_features(df: pd.DataFrame):
    """Aggregate the event log into one row of model features per id.

    ``df`` must already carry the columns added by findTotalValidSentences,
    findTotalParagraphs, findNavActions and totalWastedTime
    (``valid_sentences``, ``valid_paras``, ``Nav_action_time``,
    ``Wasted_action_time``, ``Wasted_Action_Events``) next to the raw log
    columns ``id``, ``event_id``, ``down_time``, ``up_time``, ``activity``,
    ``text_change``, ``cursor_position`` and ``word_count``.

    The input frame is left untouched; all work happens on a copy.

    Returns
    -------
    pd.DataFrame
        One row per id. The id column is named ``id_`` because the aggregated
        column names are flattened as ``<column>_<statistic>``.
    """
    # Adapted from "Writing Quality LOFO Feature Importance":
    # https://www.kaggle.com/code/aerdem4/writing-quality-lofo-feature-importance
    new_df = df.copy()

    # Pause before each event: time since the previous event of the same id was
    # released. Overlapping events give a negative gap, which is floored at 0;
    # the first event of an id has no predecessor and stays NaN.
    pause = new_df['down_time'] - new_df.groupby('id')['up_time'].shift()
    new_df['wait_time'] = pause.clip(lower=0)

    # Collapse every activity label starting with 'Move' into plain 'Move'.
    new_df['activity'] = new_df['activity'].apply(lambda x: 'Move' if x.startswith('Move') else x)

    # Taken and modified from https://www.kaggle.com/code/abhranta/lgbm-finetuning-with-optuna
    word_stats = _input_word_length_stats(new_df)

    # Aggregate statistics computed per id.
    agg = { "event_id" : ["last"],
        "wait_time": ["mean", "max", "sum"],
        "word_count": ["last"],
        "up_time": [ "max"],
        "down_time": ["min"],
        "valid_sentences" : ["sum"],
        "valid_paras" : ["sum"],
        "Nav_action_time" : ["sum"],
        "Wasted_action_time" : ["sum"],
        "Wasted_Action_Events" : ["sum"]
        }

    # Lagged differences over 10 / 100 / 200 events, summarised per id.
    shiftedCols = ['cursor_position', 'word_count']
    gaps = [10, 100, 200]

    getTimeGapData(new_df, shiftedCols, gaps)
    getActionGapData(new_df, gaps)
    for gap in gaps:
        agg["action_time_shift_" + str(gap)] = ["mean", "max", "std"]
        for col in shiftedCols:
            agg[str(col) + "_shift_" + str(gap)] = ["mean", "max", "std"]

    features = new_df.groupby(['id']).agg(agg).reset_index()
    # Flatten the (column, statistic) pairs; the id column becomes 'id_'.
    features.columns = [str(col[0] + "_" + col[1]) for col in features.columns]

    # Attach the word-length statistics by id value rather than by row position,
    # so an id that has only 'Remove/Cut' rows cannot shift the other rows.
    for name in word_stats.columns:
        features[name] = features['id_'].map(word_stats[name]).fillna(0)

    # Our own development
    features['total_time'] = features['up_time_max'] - features['down_time_min']
    features = features.drop(columns=['up_time_max', 'down_time_min'])

    # Both counts are shifted by one before they are used as denominators below.
    features["valid_sentences_sum"] = features["valid_sentences_sum"] + 1
    features["valid_paras_sum"] = features["valid_paras_sum"] + 1

    # NOTE(review): the ratios below divide by total_time, word_count_last and
    # event_id_last without guarding against zero.
    features["Nav_time_ratio"] = features["Nav_action_time_sum"] / features['total_time']
    features['Wasted_time_ratio'] = features['Wasted_action_time_sum'] / features['total_time']
    features['Wasted_actions_per_word'] = features['Wasted_Action_Events_sum'] / features['word_count_last']
    features['Idle_time_ratio'] = features['wait_time_sum'] / features['total_time']

    features['Word_rate'] = features['word_count_last'] / features['total_time']
    features['Word_event_rate'] = features['word_count_last'] / features['event_id_last']
    features['event_rate'] = features['event_id_last'] / features['total_time']
    features['Sentences_per_para'] = features['valid_sentences_sum'] / features['valid_paras_sum']
    features['words_per_sentences'] = features['word_count_last'] / features['valid_sentences_sum']

    # These sums were only needed to build the ratios above.
    features = features.drop(
        columns=['Nav_action_time_sum', 'Wasted_action_time_sum', 'Wasted_Action_Events_sum', 'event_id_last']
    )
    return features


def findTotalValidSentences(df:pd.DataFrame):
    """Add a boolean ``valid_sentences`` column that is True where ``down_event`` is '.'.

    The frame is modified in place and also returned.
    """
    is_full_stop = df['down_event'].eq('.')
    df['valid_sentences'] = is_full_stop
    return df

def findTotalParagraphs(df:pd.DataFrame):
    """Add a boolean ``valid_paras`` column marking the first 'Enter' of each run.

    A row is flagged when its ``down_event`` is 'Enter' and the previous row of
    the same ``id`` was not 'Enter' (the first row of an id has no previous row
    and therefore qualifies). The frame is modified in place and also returned.
    """
    # Scratch column holding the previous down_event within the same id.
    df['Shifted .'] = df.groupby('id')['down_event'].shift(1)
    is_enter = df['down_event'] == 'Enter'
    # pop() hands back the scratch column and removes it from the frame.
    follows_enter = df.pop('Shifted .') == 'Enter'
    df['valid_paras'] = is_enter & ~follows_enter
    return df

def findNavActions(df:pd.DataFrame):
    """Add ``Nav_action_time``: ``action_time`` on arrow-key rows, 0 everywhere else.

    A row counts as navigation when its ``up_event`` is one of the four arrow
    keys. Three diagnostics are printed along the way: the mask's value counts,
    the summed action time of the masked rows, and the sum of the new column.
    The frame is modified in place and also returned.
    """
    arrow_keys = ['ArrowDown', 'ArrowUp', 'ArrowLeft', 'ArrowRight']
    is_nav = df['up_event'].isin(arrow_keys)
    print(is_nav.value_counts())

    df['Nav_action_time'] = 0
    nav_durations = df.loc[is_nav, 'action_time']
    print(nav_durations.sum())

    df.loc[is_nav, 'Nav_action_time'] = nav_durations
    print(df['Nav_action_time'].sum())
    return df

def totalWastedTime(df:pd.DataFrame):
    """Flag 'Remove/Cut' rows and record the action time spent on them.

    Adds ``Wasted_action_time`` (``action_time`` on 'Remove/Cut' rows, 0
    elsewhere) and the boolean ``Wasted_Action_Events``, prints the total
    wasted action time, and returns the frame it modified in place.
    """
    is_removal = df['activity'].eq('Remove/Cut')

    df['Wasted_action_time'] = 0
    df['Wasted_Action_Events'] = is_removal

    removal_durations = df.loc[is_removal, 'action_time']
    df.loc[is_removal, 'Wasted_action_time'] = removal_durations

    print(df['Wasted_action_time'].sum())
    return df


def getTimeGapData(df:pd.DataFrame, colList, gapList):
    """Add ``<col>_shift_<gap>`` columns to ``df`` in place.

    Each new column holds the absolute difference between ``col`` and its value
    ``gap`` rows earlier within the same ``id``. Rows that have no such earlier
    row get 0. Nothing is returned.

    The result is assigned as a whole column: calling ``fillna(inplace=True)``
    on a column selection does not reliably write back to the frame (it is a
    no-op under pandas Copy-on-Write), and no scratch column is created that
    could overwrite an existing one.
    """
    for col in colList:
        by_id = df.groupby('id')[col]
        for gap in gapList:
            earlier = by_id.shift(gap)
            df[str(col) + "_shift_" + str(gap)] = (df[col] - earlier).abs().fillna(0)
        
        
def getActionGapData (df:pd.DataFrame, gapList):
    """Add ``action_time_shift_<gap>`` columns to ``df`` in place.

    Each new column is ``down_time`` minus the ``up_time`` recorded ``gap`` rows
    earlier within the same ``id``. Rows that have no such earlier row get 0.
    Nothing is returned.

    The result is assigned as a whole column because ``fillna(inplace=True)``
    on a column selection does not reliably write back to the frame (it is a
    no-op under pandas Copy-on-Write).
    """
    up_time_by_id = df.groupby('id')['up_time']
    for gap in gapList:
        earlier_up = up_time_by_id.shift(gap)
        df["action_time_shift_" + str(gap)] = (df['down_time'] - earlier_up).fillna(0)

<font color=blue><h1><b>Apply feature engineering</b></h1></font>

In [4]:
def apply_feature_engineering(logs_df: pd.DataFrame) -> pd.DataFrame:
    """Run every feature-engineering step, in order, on a deep copy of ``logs_df``.

    The first four steps add per-row flag/time columns, create_verbosity_features
    adds its per-id columns, and create_features collapses the log into one row
    per id. The raw frame passed in is not modified.
    """
    logs = logs_df.copy(deep = True)
    steps = (
        findTotalValidSentences,
        findTotalParagraphs,
        findNavActions,
        totalWastedTime,
        create_verbosity_features,
        create_features,
    )
    for step in steps:
        logs = step(logs)
    return logs


trainlogs_2 = apply_feature_engineering(trainLogs_df)
print('applied features on training logs')

# Only the validation features have their missing values replaced by 0.
validationlogs_2 = apply_feature_engineering(validationLogs_df).fillna(0)
print('applied features on validation logs')

up_event
False    8162280
True      243618
Name: count, dtype: int64
15337923
15337923
72942446
applied features on training logs
up_event
False    6
Name: count, dtype: int64
0
0
0
applied features on validation logs


In [5]:
# Restore the `id` name (create_features emits it as `id_`), attach the scores
# to the training rows, then drop `id` so that only model inputs remain
# (plus `score` on the training side).
trainlogs_2 = (
    trainlogs_2
    .rename(columns={"id_":"id"})
    .merge(trainScores_df, on = 'id')
    .drop(columns='id')
)

validationlogs_2 = validationlogs_2.rename(columns={"id_":"id"})
validation_ids = validationlogs_2['id']  # kept aside for the submission file
validationlogs_2 = validationlogs_2.drop(columns='id')

# Every column except the last one; after the merge above the last column is `score`.
features = trainlogs_2.columns[:-1]
print(features)

Index(['wait_time_mean', 'wait_time_max', 'wait_time_sum', 'word_count_last',
       'valid_sentences_sum', 'valid_paras_sum', 'action_time_shift_10_mean',
       'action_time_shift_10_max', 'action_time_shift_10_std',
       'cursor_position_shift_10_mean', 'cursor_position_shift_10_max',
       'cursor_position_shift_10_std', 'word_count_shift_10_mean',
       'word_count_shift_10_max', 'word_count_shift_10_std',
       'action_time_shift_100_mean', 'action_time_shift_100_max',
       'action_time_shift_100_std', 'cursor_position_shift_100_mean',
       'cursor_position_shift_100_max', 'cursor_position_shift_100_std',
       'word_count_shift_100_mean', 'word_count_shift_100_max',
       'word_count_shift_100_std', 'action_time_shift_200_mean',
       'action_time_shift_200_max', 'action_time_shift_200_std',
       'cursor_position_shift_200_mean', 'cursor_position_shift_200_max',
       'cursor_position_shift_200_std', 'word_count_shift_200_mean',
       'word_count_shift_200_max', 

In [6]:
# Yeo-Johnson power transform with standardisation. The transformer is fitted on
# the training features only and then reused unchanged on the validation
# features. PowerTransformer is imported in the notebook's import cell.
pt = PowerTransformer(method='yeo-johnson', standardize=True)
trainlogs_2[features] = pt.fit_transform(trainlogs_2[features])
validationlogs_2[features] = pt.transform(validationlogs_2[features])

In [7]:
# Split the target off the training frame: pop() returns the `score` column and
# removes it from trainlogs_2 in the same step, leaving only predictor columns.
trainScore = trainlogs_2.pop('score')
print(trainlogs_2.columns)

Index(['wait_time_mean', 'wait_time_max', 'wait_time_sum', 'word_count_last',
       'valid_sentences_sum', 'valid_paras_sum', 'action_time_shift_10_mean',
       'action_time_shift_10_max', 'action_time_shift_10_std',
       'cursor_position_shift_10_mean', 'cursor_position_shift_10_max',
       'cursor_position_shift_10_std', 'word_count_shift_10_mean',
       'word_count_shift_10_max', 'word_count_shift_10_std',
       'action_time_shift_100_mean', 'action_time_shift_100_max',
       'action_time_shift_100_std', 'cursor_position_shift_100_mean',
       'cursor_position_shift_100_max', 'cursor_position_shift_100_std',
       'word_count_shift_100_mean', 'word_count_shift_100_max',
       'word_count_shift_100_std', 'action_time_shift_200_mean',
       'action_time_shift_200_max', 'action_time_shift_200_std',
       'cursor_position_shift_200_mean', 'cursor_position_shift_200_max',
       'cursor_position_shift_200_std', 'word_count_shift_200_mean',
       'word_count_shift_200_max', 

In [8]:
# Backward sequential feature selection driven by a linear regression and scored
# with negative mean absolute error. SequentialFeatureSelector and
# LinearRegression are imported in the notebook's import cell.
lr = LinearRegression()
sfs = SequentialFeatureSelector(
    lr,
    direction="backward",
    scoring = 'neg_mean_absolute_error',
    tol = 0.001,
    n_features_to_select='auto',
)
sfs.fit(trainlogs_2, trainScore)

# Keep the names of the surviving columns: the two names below are rebound to
# the selector's transform output, so this is the record of which features the
# downstream model actually sees.
selected_features = sfs.get_feature_names_out(features)

trainlogs_2 = sfs.transform(trainlogs_2)
validationlogs_2 = sfs.transform(validationlogs_2)

### RMSE scores

In [9]:
def print_mse(model):
    """Print the root-mean-squared error of ``model`` on an 80/20 split of the training data.

    The split is drawn from the module-level ``trainlogs_2`` / ``trainScore``
    with a fixed seed. ``model`` must already be fitted; it is only used for
    prediction here. Predictions on the held-out 20% are capped at 5 before
    scoring; predictions on the other 80% are scored as they are.

    The printed value is the square root of the mean squared error, so it is
    labelled RMSE.

    NOTE(review): if ``model`` was fitted on all rows of ``trainlogs_2``, the
    "validation" figure is measured on rows the model has already seen and is
    not a true hold-out estimate.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        trainlogs_2, trainScore, test_size=0.20, random_state=43
    )
    train_pred = model.predict(x_train)
    # Cap held-out predictions at 5 (values at or below 5 pass through unchanged).
    test_pred = np.minimum(model.predict(x_test), 5)
    print('training RMSE: ', math.sqrt(mean_squared_error(y_train, train_pred)))
    print('validation RMSE: ', math.sqrt(mean_squared_error(y_test, test_pred)))

<font color=blue><h1><b> Stacked model </b></h1></font>

- Stacks three base models (CatBoostRegressor, RandomForestRegressor, and a neural network, MLPRegressor) and uses linear regression as the final estimator.

In [10]:
# Base learners. Each one is seeded so that a re-run gives the same fit.
nn = MLPRegressor(hidden_layer_sizes=(100,), max_iter=5000, activation='logistic', random_state=42)
cat_boost_model = CatBoostRegressor(
    iterations=300,
    learning_rate=0.05,
    l2_leaf_reg= 1,
    depth=5,
    loss_function='RMSE',
    random_seed=42,
    verbose=0
)
random_forest_model = RandomForestRegressor(random_state=42, bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=300)

# Every base learner sits behind its own StandardScaler; a linear regression
# combines their 7-fold cross-validated predictions.
stacked_model = StackingRegressor(
    estimators=[
        ('mlpr', make_pipeline(StandardScaler(), nn)),
        ('rf', make_pipeline(StandardScaler(), random_forest_model)),
        ('cbr', make_pipeline(StandardScaler(), cat_boost_model)),
    ],
    final_estimator=LinearRegression(),
    cv=7
)
stacked_model.fit(trainlogs_2, trainScore)

# Predict the validation rows and pair each prediction with its id.
y_pred = stacked_model.predict(validationlogs_2)
submission_df = pd.DataFrame({
    'id': validation_ids,
    'score': y_pred
})

## output training and validation error (on a subset of the training data)
# NOTE(review): stacked_model was fitted on every row of trainlogs_2 above and
# print_mse re-splits those same rows, so its "validation" figure is not a
# true hold-out estimate.
print_mse(stacked_model)

training MSE:  0.45585646280758346
validation MSE:  0.4395598073972519


In [11]:
# Write the id/score pairs to submission.csv, leaving out the DataFrame index.
submission_df.to_csv('submission.csv', index=False)