In [1]:
# importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from Functionality import Reduce_numerical_columns,Reduce_text_change,Reduce_event,Reduce_activity,getX_Y,getModel,performCrossValidation,makePredictions,perfromGridSearch,performKfoldScore,Aggregation,ConcatAlongId
from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import optuna
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the CSV files, relative to the notebook's working directory.
# Despite the `_directory` suffix, each name holds the path of a single file.
train_logs_directory, train_scores_directory, test_logs_directory = (
    os.path.join("Data", csv_name)
    for csv_name in ("train_logs.csv", "train_scores.csv", "test_logs.csv")
)

# Loading Dataset

In [3]:
# Naming convention: everything derived from the training data is prefixed
# with `train_`, everything derived from the test data with `test_`.
train_logs_df, test_logs_df, train_scores_df = map(
    pd.read_csv,
    (train_logs_directory, test_logs_directory, train_scores_directory),
)

In [4]:
# Inner-join the scores onto the log rows by `id`, so every log row carries
# the score that belongs to its id.
train_df = train_logs_df.merge(train_scores_df, how="inner", on="id")
train_df.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,3.5
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,3.5
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,3.5


# Exploratory Data Analysis

In [5]:
# No null values are present
# train_logs_df.isnull().sum()

In [6]:
# Column dtypes of the merged training frame.
train_df.dtypes

id                  object
event_id             int64
down_time            int64
up_time              int64
action_time          int64
activity            object
down_event          object
up_event            object
text_change         object
cursor_position      int64
word_count           int64
score              float64
dtype: object

In [7]:
# Column labels of the merged training frame.
train_df.columns

Index(['id', 'event_id', 'down_time', 'up_time', 'action_time', 'activity',
       'down_event', 'up_event', 'text_change', 'cursor_position',
       'word_count', 'score'],
      dtype='object')

In [8]:
# Distinct labels recorded in `up_event`. The recorded listing contains
# mis-encoded entries (e.g. '\x96', 'Ä±') next to regular key names.
train_df["up_event"].unique()

array(['Leftclick', 'Shift', 'q', 'Space', 'Backspace', '.', ',', 'Enter',
       'ArrowLeft', "'", ';', 'ArrowRight', '-', '?', 'Tab', '"',
       'ArrowUp', 'ArrowDown', 'm', 'Rightclick', 'i', 'o', 't', '=', 'a',
       'CapsLock', 'Control', 'c', 'v', '/', 'Delete', ':', 'z', '[', '$',
       '(', ')', '+', 'Home', 'End', '\\', 'Meta', '*', '&',
       'AudioVolumeMute', 'x', '!', 'Insert', 'MediaPlayPause', 'w',
       'NumLock', '%', 'V', 'b', '>', 'Alt', 'AudioVolumeUp',
       'ContextMenu', 'AudioVolumeDown', 'n', 'e', '<', 'PageDown', ']',
       'Middleclick', '@', 'F12', 'u', 'j', 's', '\x96', 'Dead', 'y', '{',
       'ScrollLock', '¿', 'p', 'Process', '}', 'MediaTrackPrevious',
       'MediaTrackNext', 'F3', '^', 'Unidentified', 'Cancel', 'h', '2',
       'd', 'r', '`', '\x9b', 'f', 'g', '#', '~', 'PageUp', 'l', 'T', 'A',
       'S', 'ModeChange', '_', 'Escape', 'F11', 'Unknownclick',
       'AltGraph', 'F10', 'F15', 'Clear', 'OS', 'C', 'Ä±', 'M', '|',
       'â\x80\x93', 

In [9]:
# Highly skewed dataset
# Distinct `activity` labels. Every 'Move From [...] To [...]' event embeds its own
# coordinates, so each one appears as a separate label in the listing.
# NOTE(review): unique() does not show frequencies; value_counts() would be needed
# to back the "highly skewed" remark.
train_df["activity"].unique()

array(['Nonproduction', 'Input', 'Remove/Cut', 'Replace',
       'Move From [284, 292] To [282, 290]',
       'Move From [287, 289] To [285, 287]',
       'Move From [460, 461] To [465, 466]', 'Paste',
       'Move From [905, 1314] To [907, 1316]',
       'Move From [565, 743] To [669, 847]',
       'Move From [669, 847] To [565, 743]',
       'Move From [1041, 1121] To [1496, 1576]',
       'Move From [1455, 1557] To [1323, 1425]',
       'Move From [2268, 2275] To [2247, 2254]',
       'Move From [213, 302] To [902, 991]',
       'Move From [0, 158] To [234, 392]',
       'Move From [460, 465] To [925, 930]',
       'Move From [810, 906] To [816, 912]',
       'Move From [186, 187] To [184, 185]',
       'Move From [140, 272] To [299, 431]',
       'Move From [114, 140] To [272, 298]',
       'Move From [1386, 1450] To [1445, 1509]',
       'Move From [442, 524] To [296, 378]',
       'Move From [408, 414] To [390, 396]',
       'Move From [1144, 1147] To [1142, 1145]',
       'Move 

In [10]:
# Preview of the raw `activity` column.
train_df["activity"]

0          Nonproduction
1          Nonproduction
2          Nonproduction
3                  Input
4                  Input
               ...      
8405893    Nonproduction
8405894    Nonproduction
8405895          Replace
8405896    Nonproduction
8405897            Input
Name: activity, Length: 8405898, dtype: object

In [11]:
# Repeats the preview shown right after the merge; no visible cell in between modifies train_df.
train_df.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,3.5
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,3.5
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,3.5


In [12]:
# Preview of the raw `text_change` column (values such as 'NoChange', 'q', 'q => q').
train_df["text_change"]

0          NoChange
1          NoChange
2          NoChange
3                 q
4                 q
             ...   
8405893    NoChange
8405894    NoChange
8405895      q => q
8405896    NoChange
8405897           .
Name: text_change, Length: 8405898, dtype: object

# Data Filtration

In [20]:
# Column groups of the log frame.
# `num_attributes` is the input of the "RemoveId" transformer defined in the next cell.
# NOTE(review): "id" has dtype object yet is listed with the numerical columns —
# presumably Reduce_numerical_columns needs it; confirm.
num_attributes = [
    "id",
    "event_id",
    "down_time",
    "up_time",
    "action_time",
    "cursor_position",
    "word_count",
]
cat_attributes = [
    "activity",
    "down_event",
    "up_event",
    "text_change",
]

In [21]:
# Column-wise preprocessing: each entry is (name, pipeline, input columns).
column_steps = [
    ("RemoveId", make_pipeline(Reduce_numerical_columns()), num_attributes),
    ("ValueSum", make_pipeline(Reduce_text_change()), ["text_change", "id"]),
    (
        "RemoveMove",
        make_pipeline(Reduce_activity(), OneHotEncoder(sparse_output=False)),
        ["activity"],
    ),
    ("ReduceUpEvents", make_pipeline(Reduce_event()), ["up_event"]),
    # Disabled: ("ReduceDownEvents", make_pipeline(Reduce_event()), ["down_event"]),
]
# remainder="passthrough" is not set, so columns that are not listed above are
# dropped (ColumnTransformer's default remainder).
processing = ColumnTransformer(column_steps)

In [22]:
# NOTE(review): the first value returned by getX_Y is bound back to `train_logs_df`,
# replacing the frame read from CSV; re-running this cell passes that returned
# frame to getX_Y again.
train_logs_df, y = getX_Y(
    train_logs_df,
    train_scores_df,
    perform_harmonic_variation=False,
    aggregation=True,
)

# Fit the column-wise preprocessing on the training logs, then apply the
# fitted transformer to the test logs.
train_processed_numpy = processing.fit_transform(train_logs_df)
test_processed_numpy = processing.transform(test_logs_df)

# Both frames are labelled with the fitted transformer's output column names.
processed_feature_names = processing.get_feature_names_out()
train_processed_df = pd.DataFrame(train_processed_numpy, columns=processed_feature_names)
test_processed_df = pd.DataFrame(test_processed_numpy, columns=processed_feature_names)

In [23]:
# Single-step pipeline around the project's Aggregation transformer; it is
# fitted on the training frame and applied to the test frame in the next cell.
post_processing = make_pipeline(Aggregation())

In [24]:
# Concatenating the columns.
# ConcatAlongId is a project helper — presumably it combines the processed
# features with the log frame per id; confirm against Functionality.
train_postprocessed_df = ConcatAlongId(train_processed_df,train_logs_df)
test_postprocessed_df = ConcatAlongId(test_processed_df,test_logs_df)

# Aggregating the columns for both train and test.
# The pipeline is fitted on the training frame only; the test frame goes through transform().
# Each *_postprocessed_df name is rebound from the concatenated frame to its aggregated version.
train_postprocessed_numpy = post_processing.fit_transform(train_postprocessed_df)
train_postprocessed_df = pd.DataFrame(train_postprocessed_numpy,columns=post_processing.get_feature_names_out())
test_postprocessed_numpy = post_processing.transform(test_postprocessed_df)
test_postprocessed_df = pd.DataFrame(test_postprocessed_numpy,columns=post_processing.get_feature_names_out())

  X[f'word_count_change{gap}'] = X['word_count'] - X[f'word_count_shift{gap}']
  X[f'word_count_abs_change{gap}'] = np.abs(X[f'word_count_change{gap}'])
  X[f'word_count_change{gap}'] = X['word_count'] - X[f'word_count_shift{gap}']
  X[f'word_count_abs_change{gap}'] = np.abs(X[f'word_count_change{gap}'])


In [26]:
# Inspect the aggregated test features (columns such as mean_/std_/min_/max_ per source column).
test_postprocessed_df

Unnamed: 0,mean_action_time,std_action_time,min_action_time,max_action_time,first_action_time,last_action_time,sem_action_time,median_action_time,skew_action_time,sum_action_time,...,mean_Numbers_800,std_Numbers_800,min_Numbers_800,max_Numbers_800,first_Numbers_800,last_Numbers_800,sem_Numbers_800,median_Numbers_800,skew_Numbers_800,sum_Numbers_800
0,86.0,1.414214,85.0,87.0,85.0,87.0,1.0,86.0,,172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1,56.5,14.849242,46.0,67.0,67.0,46.0,10.5,56.5,,113.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,75.0,26.870058,56.0,94.0,94.0,56.0,19.0,75.0,,150.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


In [31]:
from sklearn import metrics, model_selection, preprocessing, linear_model, ensemble, decomposition, tree
import lightgbm as lgb
# Repeated K-fold training of LightGBM regressors on the aggregated features.
# For each of N_REPEATS shuffles the training rows are split into N_SPLITS folds.
# Every fold model is early-stopped on its validation fold, fills the
# out-of-fold predictions for that fold and predicts the test rows.
N_REPEATS = 5
N_SPLITS = 10
EARLY_STOPPING_ROUNDS = 200
LOG_EVERY_N_ROUNDS = 100

models_dict = {}        # "<fold>_<repeat>" -> fitted LGBMRegressor
fold_scores = {}        # "<fold>_<repeat>" -> validation RMSE of that fold model
scores = []             # one out-of-fold RMSE per repeat

test_predict_list = []  # one test-set prediction array per fold model
# NOTE(review): presumably the result of an earlier tuning run — record the provenance.
best_params = {'reg_alpha': 0.007678095440286993,
               'reg_lambda': 0.14230534302168353,
               'colsample_bytree': 0.627061253588415,
               'subsample': 0.854942238828458,
               'learning_rate': 0.038697981947473245,
               'num_leaves': 70,
               'max_depth': 50,
               'min_child_samples': 75}

train_feats = train_postprocessed_df
test_feats = test_postprocessed_df
train_cols = train_postprocessed_df.columns

# Loop-invariant values, built once instead of once per fold/repeat.
params = {
    "objective": "regression",
    "metric": "rmse",
    'random_state': 42,
    "n_estimators" : 12001,  # upper bound; early stopping ends training much sooner
    "verbosity": -1,
    **best_params
}
X_test = test_feats[train_cols]
# KFold yields positional indices, so index the targets positionally regardless
# of whether `y` is a NumPy array or a pandas Series with its own index.
y_values = np.asarray(y)

for i in range(N_REPEATS):
    # Only the fold assignment changes between repeats; the model seed stays fixed.
    kf = model_selection.KFold(n_splits=N_SPLITS, random_state=42 + i, shuffle=True)
    oof_valid_preds = np.zeros(train_feats.shape[0])
    for fold, (train_idx, valid_idx) in enumerate(kf.split(train_feats)):

        X_train = train_feats.iloc[train_idx][train_cols]
        X_valid = train_feats.iloc[valid_idx][train_cols]
        y_train = y_values[train_idx]
        y_valid = y_values[valid_idx]

        model = lgb.LGBMRegressor(**params)
        # Fresh callbacks for every fit.
        early_stopping_callback = lgb.early_stopping(EARLY_STOPPING_ROUNDS, first_metric_only=True, verbose=False)
        verbose_callback = lgb.log_evaluation(LOG_EVERY_N_ROUNDS)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                  callbacks=[early_stopping_callback, verbose_callback],
                  )
        valid_predict = model.predict(X_valid)
        oof_valid_preds[valid_idx] = valid_predict
        test_predict = model.predict(X_test)
        test_predict_list.append(test_predict)
        # RMSE via sqrt(MSE): the `squared=False` argument is deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        score = np.sqrt(metrics.mean_squared_error(y_valid, valid_predict))
        fold_scores[f'{fold}_{i}'] = score
        models_dict[f'{fold}_{i}'] = model

    oof_score = np.sqrt(metrics.mean_squared_error(y_values, oof_valid_preds))
    scores.append(oof_score)

[100]	valid_0's rmse: 0.644185
[200]	valid_0's rmse: 0.646571
[100]	valid_0's rmse: 0.562018
[200]	valid_0's rmse: 0.561547
[300]	valid_0's rmse: 0.561316
[100]	valid_0's rmse: 0.727585
[200]	valid_0's rmse: 0.730028
[300]	valid_0's rmse: 0.733917
[100]	valid_0's rmse: 0.697641
[200]	valid_0's rmse: 0.699096
[300]	valid_0's rmse: 0.701697
[100]	valid_0's rmse: 0.676126
[200]	valid_0's rmse: 0.68203
[100]	valid_0's rmse: 0.692979
[200]	valid_0's rmse: 0.676704
[300]	valid_0's rmse: 0.670508
[400]	valid_0's rmse: 0.670738
[100]	valid_0's rmse: 0.725204
[200]	valid_0's rmse: 0.718462
[300]	valid_0's rmse: 0.718684
[100]	valid_0's rmse: 0.663184
[200]	valid_0's rmse: 0.660361
[300]	valid_0's rmse: 0.659827
[400]	valid_0's rmse: 0.660039
[500]	valid_0's rmse: 0.661601
[100]	valid_0's rmse: 0.696593
[200]	valid_0's rmse: 0.701925
[300]	valid_0's rmse: 0.702921
[100]	valid_0's rmse: 0.613314
[200]	valid_0's rmse: 0.613755
[300]	valid_0's rmse: 0.618255
[100]	valid_0's rmse: 0.659481
[200]	val

In [33]:
# Out-of-fold RMSE of each repeat from the LightGBM cell (one value per repeat).
scores

[0.6664242919970254,
 0.6703507086999465,
 0.668650869318866,
 0.670750886559023,
 0.6715081450109638]

In [28]:
# NOTE(review): same expression as the previous cell but with an earlier execution
# count; the values shown come from a different run. Consider removing this cell.
scores

[0.6627099061822196,
 0.6663073820494325,
 0.6669482420397865,
 0.668449212538129,
 0.6713877681965352]

In [33]:
# Gradient-boosting model, with its hyper-parameters gathered in one mapping.
gbr_params = dict(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=10,
    min_samples_split=3,
    min_samples_leaf=5,
    subsample=0.9,
    random_state=42,
)
model = GradientBoostingRegressor(**gbr_params)

# Alternative model (disabled):
# model = RandomForestRegressor(
#     n_estimators=3000,
#     max_features='sqrt',
#     max_depth=None,
#     min_samples_split=3,
#     min_samples_leaf=1,
#     bootstrap=True,
#     random_state=42
# )

In [34]:
# Cross-validate `model` on the aggregated training features with the project helper;
# the returned table (displayed in the next cell) has test_score and train_score columns.
results = performCrossValidation(model,train_postprocessed_df,train_logs_df,y,aggregation=True)

In [35]:
# Cross-validation table. In the recorded output train_score is far below test_score.
# NOTE(review): if these are error values, that gap points to overfitting — confirm the metric.
results

Unnamed: 0,test_score,train_score
0,0.679143,0.16604
1,0.658144,0.164997
2,0.696474,0.163185
3,0.710926,0.165905
4,0.632641,0.173835
5,0.716814,0.163232


In [32]:
# NOTE(review): duplicate display with an earlier execution count; its output was
# produced by a different run than the cell above. Consider removing this cell.
results

Unnamed: 0,test_score,train_score
0,0.670318,0.257587
1,0.653615,0.258039
2,0.688237,0.254832
3,0.699194,0.253688
4,0.632028,0.260467
5,0.704828,0.254325


In [20]:
# NOTE(review): another duplicate display whose output stems from an older run
# (earlier execution count). Consider removing this cell.
results

Unnamed: 0,test_score,train_score
0,0.670786,0.348993
1,0.657357,0.348817
2,0.689015,0.345018
3,0.703558,0.343439
4,0.634287,0.351747
5,0.707085,0.345408


In [None]:
# Fit `model` on all aggregated training features and the targets `y`.
model.fit(train_postprocessed_df,y)

In [None]:
# Predictions for the training data via the project helper; later cells read the
# "y_true" and "y_pred" columns of the frames returned by makePredictions.
dataset_train = makePredictions(model,train_postprocessed_df,train_logs_df,aggregation=True)

In [None]:
# Predictions for the aggregated test features, produced with the same helper and settings.
dataset_test = makePredictions(model,test_postprocessed_df,test_logs_df,aggregation=True) 

In [None]:
# Training-set MSE. Compare the "y_true" and "y_pred" columns of the prediction
# frame (as the model-optimization section does) instead of passing the whole
# multi-column frame as the prediction argument.
mean_squared_error(dataset_train["y_true"],dataset_train["y_pred"])

In [None]:
# For submission: write the test predictions to submission.csv in the working directory.
# NOTE(review): Series.to_csv also writes the index by default — confirm that the
# resulting columns match the expected submission format.
dataset_test["y_pred"].to_csv("submission.csv")

# For Model Optimization

In [None]:
# Ask the project helper for a RandomForestRegressor setup; `params` is handed to
# perfromGridSearch in the next cell (presumably the parameter grid — confirm).
model,params = getModel("RandomForestRegressor")

In [None]:
# scores = performCrossValidation(model,train_processed_df,train_logs_df,y)
# Grid search over `params` on the aggregated training features. `model` and `results`
# are rebound to the two values returned by perfromGridSearch
# (presumably the tuned estimator and the search results — confirm).
model,results = perfromGridSearch(model,params,train_postprocessed_df,train_logs_df,y,results=True,aggregation=True)

In [None]:
# model.fit(train_processed_df,y)
# Training-set predictions from the model returned by the grid search.
dataset_train = makePredictions(model,train_postprocessed_df,train_logs_df,aggregation=True)

In [None]:
# Test-set predictions from the tuned model. Use the aggregated test features so the
# input matches `train_postprocessed_df`, which the grid search used for training.
dataset_test = makePredictions(model,test_postprocessed_df,test_logs_df,aggregation=True)

In [None]:
# Save the training-set prediction frame to train.csv in the working directory.
dataset_train.to_csv("train.csv")

In [None]:
# Training-set MSE between the "y_true" and "y_pred" columns of the prediction frame.
mean_squared_error(dataset_train["y_true"],dataset_train["y_pred"])

In [None]:
# Write the tuned model's test predictions to submission.csv (overwrites any earlier file).
# NOTE(review): Series.to_csv also writes the index by default — confirm the expected columns.
dataset_test["y_pred"].to_csv("submission.csv")

# Optimize with Optuna

In [None]:
def objective(trial):
    """Optuna objective: score one RandomForestRegressor configuration.

    Asks the project's ``getModel`` helper for a model built from ``trial`` and
    returns the value computed by ``performKfoldScore`` with ``k=3``.

    Parameters
    ----------
    trial : the Optuna trial handed in by ``study.optimize``; it is forwarded
        to both project helpers.

    Returns
    -------
    The value returned by ``performKfoldScore``.
    """
    candidate, search_space = getModel("RandomForestRegressor", optuna=True, trial=trial)
    # NOTE(review): scoring uses `train_processed_df`, whereas the cross-validation and
    # grid-search cells use `train_postprocessed_df` with aggregation=True — confirm intended.
    kfold_score = performKfoldScore(candidate, train_processed_df, train_logs_df, y, k=3, optuna=True, trial=trial)
    # NOTE(review): this assignment runs after scoring, so it cannot change the score
    # computed above; if n_jobs was meant to speed up training it has to be set earlier.
    search_space[0]['n_jobs'] = [-1]
    return kfold_score
    

In [None]:
# The study is persisted in a local SQLite file under a fixed name.
# load_if_exists=True makes this cell re-runnable: without it, a second run raises
# DuplicatedStudyError because "RandomForest" already exists in the storage.
# On a re-run the stored study is resumed and the new trials are added to it.
# NOTE(review): MedianPruner waits for 10 reported steps before pruning; if the
# objective reports one step per fold (k=3), pruning never triggers — confirm.
study = optuna.create_study(
    study_name="RandomForest",
    storage="sqlite:///linking-writing-process-to-writing-quality.db",
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
    load_if_exists=True,
)
study.optimize(objective, n_trials=20)