In [1]:
# importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from Functionality import Remove_id,Reduce_text_change,Reduce_event,Reduce_activity,getX_Y,getModel,performCrossValidation,makePredictions,perfromGridSearch,performKfoldScore
from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import optuna

In [3]:
# Relative paths of the input CSV files; all of them live under "Data".
# (The variables keep their `_directory` suffix even though each one is a file path.)
train_logs_directory, train_scores_directory, test_logs_directory = (
    os.path.join("Data", csv_name)
    for csv_name in ("train_logs.csv", "train_scores.csv", "test_logs.csv")
)

# Loading Dataset

In [4]:
# Naming convention: every train-related object is prefixed with `train_`
# and every test-related object is prefixed with `test_`.
train_logs_df, test_logs_df, train_scores_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_logs_directory, test_logs_directory, train_scores_directory)
)

In [5]:
# Inner-join the logs with the scores on `id`, so every log row carries its score.
train_df = train_logs_df.merge(train_scores_df, how="inner", on="id")
train_df.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,3.5
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,3.5
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,3.5


# Exploratory Data Analysis

In [6]:
# No null values are present
# train_logs_df.isnull().sum()

In [7]:
# Column dtypes of the merged train frame.
train_df.dtypes

id                  object
event_id             int64
down_time            int64
up_time              int64
action_time          int64
activity            object
down_event          object
up_event            object
text_change         object
cursor_position      int64
word_count           int64
score              float64
dtype: object

In [8]:
# Column names of the merged train frame.
train_df.columns

Index(['id', 'event_id', 'down_time', 'up_time', 'action_time', 'activity',
       'down_event', 'up_event', 'text_change', 'cursor_position',
       'word_count', 'score'],
      dtype='object')

In [9]:
# Distinct values of `up_event`, in order of first appearance.
train_df["up_event"].unique()

array(['Leftclick', 'Shift', 'q', 'Space', 'Backspace', '.', ',', 'Enter',
       'ArrowLeft', "'", ';', 'ArrowRight', '-', '?', 'Tab', '"',
       'ArrowUp', 'ArrowDown', 'm', 'Rightclick', 'i', 'o', 't', '=', 'a',
       'CapsLock', 'Control', 'c', 'v', '/', 'Delete', ':', 'z', '[', '$',
       '(', ')', '+', 'Home', 'End', '\\', 'Meta', '*', '&',
       'AudioVolumeMute', 'x', '!', 'Insert', 'MediaPlayPause', 'w',
       'NumLock', '%', 'V', 'b', '>', 'Alt', 'AudioVolumeUp',
       'ContextMenu', 'AudioVolumeDown', 'n', 'e', '<', 'PageDown', ']',
       'Middleclick', '@', 'F12', 'u', 'j', 's', '\x96', 'Dead', 'y', '{',
       'ScrollLock', '¿', 'p', 'Process', '}', 'MediaTrackPrevious',
       'MediaTrackNext', 'F3', '^', 'Unidentified', 'Cancel', 'h', '2',
       'd', 'r', '`', '\x9b', 'f', 'g', '#', '~', 'PageUp', 'l', 'T', 'A',
       'S', 'ModeChange', '_', 'Escape', 'F11', 'Unknownclick',
       'AltGraph', 'F10', 'F15', 'Clear', 'OS', 'C', 'Ä±', 'M', '|',
       'â\x80\x93', 

In [10]:
# Highly skewed dataset (author's observation; `unique()` alone does not show
# frequencies, `value_counts()` would).
# Each "Move From [...] To [...]" event embeds its own coordinates, so every move
# appears as a separate label in the output.
train_df["activity"].unique()

array(['Nonproduction', 'Input', 'Remove/Cut', 'Replace',
       'Move From [284, 292] To [282, 290]',
       'Move From [287, 289] To [285, 287]',
       'Move From [460, 461] To [465, 466]', 'Paste',
       'Move From [905, 1314] To [907, 1316]',
       'Move From [565, 743] To [669, 847]',
       'Move From [669, 847] To [565, 743]',
       'Move From [1041, 1121] To [1496, 1576]',
       'Move From [1455, 1557] To [1323, 1425]',
       'Move From [2268, 2275] To [2247, 2254]',
       'Move From [213, 302] To [902, 991]',
       'Move From [0, 158] To [234, 392]',
       'Move From [460, 465] To [925, 930]',
       'Move From [810, 906] To [816, 912]',
       'Move From [186, 187] To [184, 185]',
       'Move From [140, 272] To [299, 431]',
       'Move From [114, 140] To [272, 298]',
       'Move From [1386, 1450] To [1445, 1509]',
       'Move From [442, 524] To [296, 378]',
       'Move From [408, 414] To [390, 396]',
       'Move From [1144, 1147] To [1142, 1145]',
       'Move 

In [11]:
# Peek at the raw `activity` column (pandas truncates the displayed Series).
train_df["activity"]

0          Nonproduction
1          Nonproduction
2          Nonproduction
3                  Input
4                  Input
               ...      
8405893    Nonproduction
8405894    Nonproduction
8405895          Replace
8405896    Nonproduction
8405897            Input
Name: activity, Length: 8405898, dtype: object

In [12]:
# Same preview as the one shown right after the merge; `train_df` has not been
# modified in between.
train_df.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,3.5
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,3.5
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,3.5


In [13]:
# Peek at the raw `text_change` column (pandas truncates the displayed Series).
train_df["text_change"]

0          NoChange
1          NoChange
2          NoChange
3                 q
4                 q
             ...   
8405893    NoChange
8405894    NoChange
8405895      q => q
8405896    NoChange
8405897           .
Name: text_change, Length: 8405898, dtype: object

# Data Filtration

In [14]:
# Column groups used by the preprocessing step: numeric-style columns vs.
# categorical (string-valued) columns.
# `id` is a string column, but it is listed here because this list is what gets
# routed through the pipeline that starts with Remove_id.
num_attributes = [
    "id",
    "event_id",
    "down_time",
    "up_time",
    "action_time",
    "cursor_position",
    "word_count",
]
cat_attributes = [
    "activity",
    "down_event",
    "up_event",
    "text_change",
]

In [15]:
# Per-column-group preprocessing. The first element of each tuple is the
# transformer name, which becomes the prefix of the generated feature names.
processing = ColumnTransformer(
    transformers=[
        ("RemoveId", make_pipeline(Remove_id(), StandardScaler()), num_attributes),
        ("ValueSum", make_pipeline(Reduce_text_change()), ["text_change"]),
        (
            "RemoveMove",
            make_pipeline(Reduce_activity(), OneHotEncoder(sparse_output=False)),
            ["activity"],
        ),
        ("ReduceUpEvents", make_pipeline(Reduce_event()), ["up_event"]),
        # Disabled on purpose: `down_event` is not transformed.
        # ("ReduceDownEvents", make_pipeline(Reduce_event()), ["down_event"]),
    ],
    # `remainder` stays at its default, so columns not listed above are not
    # passed through.
    # remainder="passthrough"
)

In [16]:
# NOTE(review): `train_logs_df` is rebound to the frame returned by getX_Y, so
# re-running this cell feeds the already-converted frame back into getX_Y.
train_logs_df, y = getX_Y(train_logs_df, train_scores_df)

# Fit the transformer on the training logs only, then reuse it on the test logs.
train_processed_numpy = processing.fit_transform(train_logs_df)
test_processed_numpy = processing.transform(test_logs_df)

# Wrap both arrays in DataFrames labelled with the transformer's feature names.
train_processed_df, test_processed_df = (
    pd.DataFrame(processed, columns=processing.get_feature_names_out())
    for processed in (train_processed_numpy, test_processed_numpy)
)

In [17]:
# Preview the processed training features (pandas truncates the display).
train_processed_df

Unnamed: 0,RemoveId__Reduce_down_time,RemoveId__Reduce_up_time,RemoveId__Reduce_action_time,RemoveId__Reduce_cursor_position,RemoveId__Reduce_word_count,ValueSum__text_changed_text_change,RemoveMove__activity_changed_activity_Input,RemoveMove__activity_changed_activity_Move,RemoveMove__activity_changed_activity_Nonproduction,RemoveMove__activity_changed_activity_Paste,RemoveMove__activity_changed_activity_Remove/Cut,RemoveMove__activity_changed_activity_Replace,ReduceUpEvents__activity_changed_Punchuations,ReduceUpEvents__activity_changed_Characters,ReduceUpEvents__activity_changed_Numbers,ReduceUpEvents__activity_changed_Operations,ReduceUpEvents__activity_changed_Unknows
0,-1.532269,-1.532406,-0.264741,-1.289334,-1.315845,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-1.532207,-1.53162,1.207249,-1.289334,-1.315845,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.334102,-1.334299,-0.387078,-1.289334,-1.315845,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-1.333879,-1.333899,-0.02796,-1.28828,-1.31016,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-1.332888,-1.332838,0.114109,-1.287225,-1.31016,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8405893,2.467028,2.467812,1.570314,-0.202382,0.048498,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8405894,2.468102,2.467922,-0.387078,-0.202382,0.048498,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8405895,2.468412,2.468443,0.039128,-0.202382,0.048498,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
8405896,2.477208,2.47717,-0.098994,-0.205545,0.048498,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
def objective(trial):
    """Optuna objective: build a candidate model for `trial` and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial supplied by `study.optimize`; it is forwarded to both
        `getModel` and `performKfoldScore`.

    Returns
    -------
    The value returned by `performKfoldScore` when called with `k=3` on
    `train_processed_df`, `train_logs_df` and `y`.
    """
    # getModel returns a (model, params) pair; params is not needed here.
    model, _ = getModel("RandomForestRegressor", optuna=True, trial=trial)
    return performKfoldScore(
        model, train_processed_df, train_logs_df, y, k=3, optuna=True, trial=trial
    )
    

In [20]:
# The study is persisted in a local SQLite file under a fixed name.
# `load_if_exists=True` makes this cell re-runnable: without it, a second run
# (or a run after an interrupted one) fails because a study named
# "RandomForest" already exists in the database. With it, the stored study is
# resumed and new trials are appended to the existing ones.
#
# NOTE(review): in the recorded run every finished trial scored exactly 0.0
# on all folds, which suggests the score computed by performKfoldScore is
# degenerate. Verify it before trusting this study.
# NOTE(review): the objective uses k=3; if performKfoldScore reports one
# intermediate value per fold, n_warmup_steps=10 means the median pruner can
# never prune. TODO confirm.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
    storage="sqlite:///linking-writing-process-to-writing-quality.db",
    study_name="RandomForest",
    load_if_exists=True,
)
study.optimize(objective, n_trials=20)

[I 2023-11-06 22:55:41,271] A new study created in RDB with name: RandomForest


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 22:56:08,008] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 162, 'max_depth': 29, 'min_samples_split': 233, 'min_samples_leaf': 200}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 22:56:33,911] Trial 1 finished with value: 0.0 and parameters: {'n_estimators': 96, 'max_depth': 9, 'min_samples_split': 64, 'min_samples_leaf': 267}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 22:56:59,645] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 230, 'max_depth': 23, 'min_samples_split': 55, 'min_samples_leaf': 293}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 22:57:27,769] Trial 3 finished with value: 0.0 and parameters: {'n_estimators': 300, 'max_depth': 10, 'min_samples_split': 95, 'min_samples_leaf': 96}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 22:57:53,789] Trial 4 finished with value: 0.0 and parameters: {'n_estimators': 141, 'max_depth': 18, 'min_samples_split': 158, 'min_samples_leaf': 123}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 22:58:19,610] Trial 5 finished with value: 0.0 and parameters: {'n_estimators': 234, 'max_depth': 8, 'min_samples_split': 123, 'min_samples_leaf': 141}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 22:58:44,893] Trial 6 finished with value: 0.0 and parameters: {'n_estimators': 187, 'max_depth': 25, 'min_samples_split': 100, 'min_samples_leaf': 179}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 22:59:10,249] Trial 7 finished with value: 0.0 and parameters: {'n_estimators': 228, 'max_depth': 6, 'min_samples_split': 202, 'min_samples_leaf': 92}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 22:59:35,654] Trial 8 finished with value: 0.0 and parameters: {'n_estimators': 69, 'max_depth': 29, 'min_samples_split': 292, 'min_samples_leaf': 252}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 23:00:00,774] Trial 9 finished with value: 0.0 and parameters: {'n_estimators': 141, 'max_depth': 7, 'min_samples_split': 221, 'min_samples_leaf': 160}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 23:00:27,708] Trial 10 finished with value: 0.0 and parameters: {'n_estimators': 334, 'max_depth': 17, 'min_samples_split': 265, 'min_samples_leaf': 210}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 23:00:53,693] Trial 11 finished with value: 0.0 and parameters: {'n_estimators': 61, 'max_depth': 13, 'min_samples_split': 249, 'min_samples_leaf': 235}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 23:01:20,325] Trial 12 finished with value: 0.0 and parameters: {'n_estimators': 114, 'max_depth': 30, 'min_samples_split': 163, 'min_samples_leaf': 288}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00
Fold 3: 0.00
Mean Score: 0.00
Standard Deviation: 0.00


[I 2023-11-06 23:01:51,717] Trial 13 finished with value: 0.0 and parameters: {'n_estimators': 107, 'max_depth': 15, 'min_samples_split': 202, 'min_samples_leaf': 204}. Best is trial 0 with value: 0.0.


Fold 1: 0.00
Fold 2: 0.00


[W 2023-11-06 23:02:17,144] Trial 14 failed with parameters: {'n_estimators': 178, 'max_depth': 21, 'min_samples_split': 53, 'min_samples_leaf': 251} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/miniconda3/envs/kaggle/lib/python3.12/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/bt/crfk9c5d2lzd9dqtt8nyxy7c0000gn/T/ipykernel_58048/209572108.py", line 3, in objective
    score = performKfoldScore(model,train_processed_df,train_logs_df,y,k=3,optuna = True,trial = trial)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kirtankanani/Desktop/Python/kaggle/linking-writing-processes-to-writing-quality/Functionality/utils.py", line 51, in performKfoldScore
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
                      ^^^^^^
  File "/opt/miniconda3

KeyboardInterrupt: 

In [None]:
# Get the RandomForestRegressor model and its `params` without Optuna;
# both are passed to perfromGridSearch further down.
model,params = getModel("RandomForestRegressor")

In [None]:
# Show `params` as returned by getModel.
params

In [20]:
# Show the object currently bound to `model`.
model

In [21]:
# Alternative kept for reference: plain cross-validation instead of a grid search.
# scores = performCrossValidation(model,train_processed_df,train_logs_df,y)
# Grid search over `params`. `model` is rebound to the second value returned by
# perfromGridSearch (presumably the best estimator; TODO confirm).
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# `scores`, `model` and `results` were not updated by it.
scores,model,results = perfromGridSearch(model,params,train_processed_df,train_logs_df,y,results=True)

KeyboardInterrupt: 

In [None]:
# Show the `scores` value produced by the grid-search cell.
scores

In [24]:
# Show the object currently bound to `model`.
model

In [31]:
# NOTE(review): this estimator is constructed and displayed but never assigned,
# so the cells below keep using whatever `model` is already bound to.
# No random_state is set either, so fitting it would not be reproducible.
RandomForestRegressor(max_depth=30,max_features='sqrt',min_samples_leaf=4,min_samples_split=10,n_estimators=300)

In [25]:
# The explicit fit is disabled; presumably makePredictions fits the model itself
# or expects an already fitted one. TODO confirm.
# model.fit(train_processed_df,y)
# The result is read below through its `y_true` and `y_pred` columns.
dataset_train = makePredictions(model,train_processed_df,y,train_logs_df)

In [26]:
# NOTE(review): `y` is the training target returned by getX_Y, yet it is passed
# here together with the test features and test logs. Confirm that
# makePredictions does not use it as ground truth for the test rows.
dataset_test = makePredictions(model,test_processed_df,y,test_logs_df)

In [33]:
# Persist the training predictions; the DataFrame index is written as the
# first CSV column (to_csv default).
dataset_train.to_csv("train.csv")

In [28]:
# Mean squared error between `y_true` and `y_pred` of the training predictions.
mean_squared_error(dataset_train["y_true"],dataset_train["y_pred"])

0.4108254833487863

In [29]:
# NOTE(review): no test scores are loaded anywhere in this notebook, and
# `dataset_test` was built with the training `y`. This value is therefore
# probably not a held-out error; confirm what `y_true` contains here.
mean_squared_error(dataset_test["y_true"],dataset_test["y_pred"])

0.2553856347662122

In [24]:
# Writes only the `y_pred` column, plus the index, to submission.csv.
# NOTE(review): confirm that the index carries the ids and that the column
# names match what the submission format requires.
dataset_test["y_pred"].to_csv("submission.csv")