In [None]:
# importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from Functionality import Reduce_numerical_columns,Reduce_text_change,Reduce_event,Reduce_activity,getX_Y,getModel,performCrossValidation,makePredictions,perfromGridSearch,performKfoldScore,Aggregation,ConcatAlongId
from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import optuna
from sklearn.model_selection import train_test_split

In [None]:
# Locations of the input CSV files (file paths, all inside the same data folder)
DATA_DIR = "Data"
train_logs_directory = os.path.join(DATA_DIR, "train_logs.csv")
train_scores_directory = os.path.join(DATA_DIR, "train_scores.csv")
test_logs_directory = os.path.join(DATA_DIR, "test_logs.csv")

# Loading Dataset

In [None]:
# Naming convention: everything derived from the training data is prefixed
# with train_, everything derived from the test data is prefixed with test_.
# The three CSV files are read in the order: train logs, test logs, train scores.
train_logs_df, test_logs_df, train_scores_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_logs_directory, test_logs_directory, train_scores_directory)
)

In [None]:
# Attach the scores to every log row that shares the same "id".
# The inner join keeps only ids present in both frames.
train_df = train_logs_df.merge(train_scores_df, how="inner", on="id")
train_df.head()

# Exploratory Data Analysis

In [None]:
# No null values are present
# train_logs_df.isnull().sum()

In [None]:
# Column dtypes of the merged (logs + scores) training frame
train_df.dtypes

In [None]:
# Column names available in the merged training frame
train_df.columns

In [None]:
# Distinct values found in the up_event column
train_df["up_event"].unique()

In [None]:
# Highly skewed dataset
# NOTE(review): unique() only lists the distinct activity labels; it does not
# show how often each occurs. Something like value_counts() would be needed to
# back up the skew claim above - confirm and consider switching.
train_df["activity"].unique()

In [None]:
# Peek at the raw activity column
train_df["activity"]

In [None]:
# Preview the first rows of the merged training frame
train_df.head()

In [None]:
# Peek at the raw text_change column
train_df["text_change"]

# Data Filtering

In [None]:
# Column groups of the log frame: numerical columns first, categorical second.
num_attributes = [
    "id",
    "event_id",
    "down_time",
    "up_time",
    "action_time",
    "cursor_position",
    "word_count",
]
cat_attributes = [
    "activity",
    "down_event",
    "up_event",
    "text_change",
]

In [None]:
# Column-wise preprocessing. Each entry is (step name, pipeline, input columns);
# the step names become part of the output feature names, so they are kept as-is.
# The down_event reduction and remainder="passthrough" are intentionally left
# disabled, so `remainder` stays at its default.
column_steps = [
    ("RemoveId", make_pipeline(Reduce_numerical_columns()), num_attributes),
    ("ValueSum", make_pipeline(Reduce_text_change()), ["text_change"]),
    (
        "RemoveMove",
        make_pipeline(Reduce_activity(), OneHotEncoder(sparse_output=False)),
        ["activity"],
    ),
    ("ReduceUpEvents", make_pipeline(Reduce_event()), ["up_event"]),
]
processing = ColumnTransformer(column_steps)

In [None]:
# NOTE: this rebinds train_logs_df to the frame returned by getX_Y, so running
# the cell a second time without reloading the CSVs feeds the already-processed
# frame back into getX_Y.
train_logs_df, y = getX_Y(
    train_logs_df,
    train_scores_df,
    perform_harmonic_variation=False,
    aggregation=True,
)

# Fit the column-wise preprocessing on train only, then apply it to test.
train_processed_numpy = processing.fit_transform(train_logs_df)
test_processed_numpy = processing.transform(test_logs_df)

# Both frames share the feature names produced by the fitted transformer.
feature_names = processing.get_feature_names_out()
train_processed_df = pd.DataFrame(train_processed_numpy, columns=feature_names)
test_processed_df = pd.DataFrame(test_processed_numpy, columns=feature_names)

In [None]:
# Second-stage pipeline consisting of a single Aggregation step
post_processing = make_pipeline(Aggregation())

In [None]:
# Combine each processed feature frame with the log frame it came from.
# (presumably ConcatAlongId re-attaches the id column - confirm in Functionality)
train_postprocessed_df = ConcatAlongId(train_processed_df,train_logs_df)
test_postprocessed_df = ConcatAlongId(test_processed_df,test_logs_df)

# Aggregating the columns for both train and test.
# The pipeline is fitted on the training data only; the test data goes through
# transform() so that it is processed with the state learned from train rather
# than being re-fitted on itself.
train_postprocessed_numpy = post_processing.fit_transform(train_postprocessed_df)
train_postprocessed_df = pd.DataFrame(train_postprocessed_numpy,columns=post_processing.get_feature_names_out())
test_postprocessed_numpy = post_processing.transform(test_postprocessed_df)
test_postprocessed_df = pd.DataFrame(test_postprocessed_numpy,columns=post_processing.get_feature_names_out())

In [None]:
# Hyper-parameters shared by the candidate random forests; every value except
# random_state is a tuning knob.
rf_params = dict(
    n_estimators=900,
    max_depth=300,
    min_samples_split=10,
    min_samples_leaf=10,
    max_features='sqrt',
    bootstrap=True,
    oob_score=True,          # monitor the out-of-bag score
    random_state=42,
)

# Two separate estimator instances with identical settings.
# NOTE(review): the second candidate is currently an exact copy of the first -
# presumably its parameters were meant to differ; confirm.
models = [RandomForestRegressor(**rf_params) for _ in range(2)]

# `model` stays bound to the last candidate, as later cells reference it.
model = models[-1]

In [None]:
# For every candidate: cross-validate, save the CV results to "<index>.csv",
# refit on the full training features and report the in-sample MSE.
for i,model in enumerate(models):
    results = performCrossValidation(model,train_postprocessed_df,train_logs_df,y,aggregation=True)
    results.to_csv(f"{i}.csv")
    model.fit(train_postprocessed_df,y)
    dataset_train = makePredictions(model,train_postprocessed_df,train_logs_df,aggregation=True)
    # The output of makePredictions is read through its "y_true"/"y_pred"
    # columns elsewhere in this notebook, so score those columns rather than
    # passing the whole frame as y_pred.
    t = mean_squared_error(dataset_train["y_true"],dataset_train["y_pred"])
    print(f"{i} ----------> {t}")
    print()

In [None]:
# Cross-validate `model` on the aggregated training features.
# When the cells run top to bottom, `model` is the last estimator in `models`.
results = performCrossValidation(model,train_postprocessed_df,train_logs_df,y,aggregation=True)

In [None]:
# Display the cross-validation results
results

In [None]:
# Fit the model on the full aggregated training features
model.fit(train_postprocessed_df,y)

In [None]:
# Predictions of the fitted model on the training features
dataset_train = makePredictions(model,train_postprocessed_df,train_logs_df,aggregation=True)

In [None]:
# Predictions of the fitted model on the aggregated test features
dataset_test = makePredictions(model,test_postprocessed_df,test_logs_df,aggregation=True) 

In [None]:
# In-sample MSE. The prediction frame is scored through its "y_true"/"y_pred"
# columns, since passing the whole frame as y_pred does not compare like with like.
mean_squared_error(dataset_train["y_true"],dataset_train["y_pred"])

In [None]:
# For submission: write the predicted column of the test predictions to
# submission.csv in the working directory
dataset_test["y_pred"].to_csv("submission.csv")

# For Model Optimization

In [None]:
# Rebinds `model` and fetches the accompanying `params` for the search below
# (presumably a hyper-parameter grid - confirm in Functionality.getModel)
model,params = getModel("RandomForestRegressor")

In [None]:
# Grid search over `params` on the aggregated training features.
# Rebinds both `model` and `results` to the values returned by the search.
# (A plain cross-validation call used to sit here:
#  scores = performCrossValidation(model,train_processed_df,train_logs_df,y))
model,results = perfromGridSearch(model,params,train_postprocessed_df,train_logs_df,y,results=True,aggregation=True)

In [None]:
# No explicit model.fit(...) here: the refit call is left disabled
# (presumably the model returned by the grid search is already fitted - confirm).
# model.fit(train_processed_df,y)
dataset_train = makePredictions(model,train_postprocessed_df,train_logs_df,aggregation=True)

In [None]:
# Predict on the aggregated test features - the same feature stage
# (post-processed) that the model was tuned and evaluated on for the train split.
dataset_test = makePredictions(model,test_postprocessed_df,test_logs_df,aggregation=True)

In [None]:
# Save the training predictions to train.csv for inspection
dataset_train.to_csv("train.csv")

In [None]:
# In-sample MSE between the true and predicted columns of the training predictions
mean_squared_error(dataset_train["y_true"],dataset_train["y_pred"])

In [None]:
# Writes to the same submission.csv path as the earlier submission cell,
# replacing that file with the tuned model's predictions
dataset_test["y_pred"].to_csv("submission.csv")

# Optimize with Optuna

In [None]:
def objective(trial):
    """Optuna objective: score one trial-configured random forest.

    Asks getModel for a "RandomForestRegressor" set up for the given trial,
    evaluates it with performKfoldScore (k=3) and returns that score, which
    the study then optimises.

    Args:
        trial: the Optuna trial object handed in by study.optimize.

    Returns:
        Whatever performKfoldScore returns for this trial.
    """
    candidate, search_space = getModel("RandomForestRegressor", optuna=True, trial=trial)
    # NOTE(review): this scores on train_processed_df, while the other
    # evaluation cells use train_postprocessed_df with aggregation=True -
    # confirm which feature stage performKfoldScore expects.
    fold_score = performKfoldScore(
        candidate,
        train_processed_df,
        train_logs_df,
        y,
        k=3,
        optuna=True,
        trial=trial,
    )
    # NOTE(review): this runs after scoring and search_space is not passed on
    # anywhere in this function, so it does not appear to affect the trial -
    # confirm whether n_jobs was meant to be set before scoring.
    search_space[0]['n_jobs'] = [-1]
    return fold_score
    

In [None]:
# The study is stored in a local SQLite file under a fixed name, so its trials
# survive kernel restarts.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    # NOTE(review): with n_warmup_steps=10 pruning only starts after step 10 of
    # a trial; confirm performKfoldScore reports that many intermediate steps.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
    storage = "sqlite:///linking-writing-process-to-writing-quality.db",
    study_name = "RandomForest",
    # Reuse the stored study when it already exists. Without this, re-running
    # the cell fails with DuplicatedStudyError because the name is taken.
    load_if_exists=True,
)
# Each run of this cell adds another 20 trials to the (possibly resumed) study.
study.optimize(objective, n_trials=20)