In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv
/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv
/kaggle/input/model-lgb/lgb_opt_11182023
/kaggle/input/cat-dict/cat_dict


In [2]:
from sklearn.metrics import mean_squared_error, r2_score
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
# Sometimes we want to see more than a single output in the cell below command enables that
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

In [4]:
from lightgbm import LGBMRegressor

In [5]:
train = pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv")
test = pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv")
train_scores = pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv")
sample_submission = pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv")

In [6]:
train['id'] = train['id'].astype(str)
train_scores['id'] = train_scores['id'].astype(str)
# Add the score
train=train.merge(train_scores,left_on ="id",right_on="id",how="left")

In [7]:
# Sample to be divided into train and test 
# but we need to be carefull random sampling may lead a to an id being present in both train & test
# number of unique results we need in test
len(train.id.unique())*.33

815.4300000000001

In [8]:
from random import sample
test_id=sample(list(train.id.unique()),100)

In [9]:
# Get train and test based on test id
train_m= train[~train.id.isin(test_id)]
test_m= train[train.id.isin(test_id)]

In [10]:
train_m.shape[0]/train.shape[0]

0.9560797668494193

In [11]:
train_m.loc[:,'text_change_length'] = train_m['text_change'].apply(lambda x:len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_m.loc[:,'text_change_length'] = train_m['text_change'].apply(lambda x:len(x))


In [12]:
train_m.loc[:,'activity_c']=train_m['activity'].apply(lambda x:"move" if "Move" in x else x )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_m.loc[:,'activity_c']=train_m['activity'].apply(lambda x:"move" if "Move" in x else x )


In [13]:
# let's create a dictionary to store the factorisers for categorical variables
cats=['activity_c','down_event','up_event']
# cat_dict={}
# for i in cats:
#     cat_dict[i]={}
#     for j in range(0,len(list(train_m[i].unique()))):
#         cat_dict[i][list(train_m[i].unique())[j]]=j

In [14]:
import joblib
#joblib.dump(cat_dict,"cat_dict")

In [15]:
model_vars=['event_id', 'down_time', 'up_time', 'action_time',
       'down_event', 'up_event', 'cursor_position',
       'word_count','text_change_length', 'activity_c']

In [16]:
cat_dict=joblib.load("/kaggle/input/cat-dict/cat_dict")

In [17]:
X_train=train_m.copy()

In [18]:
for i in cats:
    X_train[i]=X_train[i].map(cat_dict[i])

In [19]:
def pre_pro(df):
    df.loc[:,'text_change_length'] = df['text_change'].apply(lambda x:len(x))
    df.loc[:,'activity_c']=df['activity'].apply(lambda x:"move" if "Move" in x else x )
    for i in ['activity_c', 'down_event', 'up_event']:
        df.loc[:,i]=df[i].map(cat_dict[i])
        
    return df

In [20]:
X_test=pre_pro(test_m)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'text_change_length'] = df['text_change'].apply(lambda x:len(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'activity_c']=df['activity'].apply(lambda x:"move" if "Move" in x else x )


In [21]:
import warnings
warnings.filterwarnings("ignore")

In [22]:
X_test['down_event']=pd.to_numeric(X_test['down_event'])
X_test['up_event']=pd.to_numeric(X_test['up_event'])
X_test['activity_c']=pd.to_numeric(X_test['activity_c'])

In [23]:
shift_vars=['down_time', 'up_time', 'action_time', 'down_event',
       'up_event', 'cursor_position', 'word_count', 'text_change_length',
       'activity_c']

for i in shift_vars:
    #for j in range(0,60):
    X_train[i+"_" +"1"]=X_train[i].shift(1)
    X_train[i+"_" +"2"]=X_train[i].shift(2)
    X_train[i+"_" +"3"]=X_train[i].shift(3)

In [24]:
for i in shift_vars:
    #for j in range(0,60):
    X_test[i+"_" +"1"]=X_test[i].shift(1)
    X_test[i+"_" +"2"]=X_test[i].shift(2)
    X_test[i+"_" +"3"]=X_test[i].shift(3)

In [25]:
model_vars=[ 'event_id', 'down_time', 'up_time', 'action_time',
       'down_event', 'up_event', 'cursor_position',
       'word_count', 'text_change_length', 'activity_c',
       'down_time_1', 'down_time_2', 'down_time_3', 'up_time_1', 'up_time_2',
       'up_time_3', 'action_time_1', 'action_time_2', 'action_time_3',
       'down_event_1', 'down_event_2', 'down_event_3', 'up_event_1',
       'up_event_2', 'up_event_3', 'cursor_position_1', 'cursor_position_2',
       'cursor_position_3', 'word_count_1', 'word_count_2', 'word_count_3',
       'text_change_length_1', 'text_change_length_2', 'text_change_length_3',
       'activity_c_1', 'activity_c_2', 'activity_c_3']

In [26]:
for i in model_vars:
    X_train.loc[:,i]=X_train[i].astype("float")
    X_test.loc[:,i]=X_test[i].astype("float")

In [27]:
cats=['down_event', 'up_event', 'activity_c',
       'down_event_1', 'down_event_2', 'down_event_3', 'up_event_1',
       'up_event_2', 'up_event_3','text_change_length_1', 'text_change_length_2', 
       'text_change_length_3','activity_c_1', 'activity_c_2', 'activity_c_3']
lgb=LGBMRegressor()
lgb.fit(X_train[model_vars], X_train['score'],
        eval_set=[(X_test[model_vars], X_test['score'])],
        categorical_feature=cats,
        verbose=0,
        early_stopping_rounds=8)

In [28]:
mean_squared_error(X_test['score'],lgb.predict(X_test[model_vars]))

0.5062230373943213

In [29]:
X_test_v=pre_pro(test)

for i in shift_vars:
    #for j in range(0,60):
    X_test_v[i+"_" +"1"]=X_test_v[i].shift(1)
    X_test_v[i+"_" +"2"]=X_test_v[i].shift(2)
    X_test_v[i+"_" +"3"]=X_test_v[i].shift(3)

In [30]:
X_test_v['down_event']=pd.to_numeric(X_test_v['down_event'])
X_test_v['down_event_1']=pd.to_numeric(X_test_v['down_event_1'])
X_test_v['down_event_2']=pd.to_numeric(X_test_v['down_event_2'])
X_test_v['down_event_3']=pd.to_numeric(X_test_v['down_event_3'])

X_test_v['up_event']=pd.to_numeric(X_test_v['up_event'])
X_test_v['up_event_1']=pd.to_numeric(X_test_v['up_event_1'])
X_test_v['up_event_2']=pd.to_numeric(X_test_v['up_event_2'])
X_test_v['up_event_3']=pd.to_numeric(X_test_v['up_event_3'])

X_test_v['activity_c']=pd.to_numeric(X_test_v['activity_c'])
X_test_v['activity_c_1']=pd.to_numeric(X_test_v['activity_c_1'])
X_test_v['activity_c_2']=pd.to_numeric(X_test_v['activity_c_2'])
X_test_v['activity_c_3']=pd.to_numeric(X_test_v['activity_c_3'])

for i in model_vars:
    X_test_v.loc[:,i]=X_test_v[i].astype("float")

In [31]:
X_test_v['score']=lgb.predict(X_test_v[model_vars])

In [32]:
# import math
# def fit_lgb(params):
#     lgb=LGBMRegressor(
#         n_estimators=int(params['n_estimators']),
#         num_leaves=int(params['num_leaves']),
#         max_depth=int(params['max_depth']),
#         min_child_samples=int(params['min_child_samples']),
#         subsample=params['subsample'],
#         colsample_bytree=params['colsample_bytree'],
#         scale_pos_weight=params['scale_pos_weight'],
#         learning_rate=params['learning_rate'],
#         reg_alpha=params['reg_alpha'],
#         reg_lambda=params['reg_lambda'],
#         random_state=0
#     )
    
#     lgb.fit(X=X_train[model_vars],
#             y=X_train['score'],
#             categorical_feature=cats)
    
#     y_pred =lgb.predict(X_test[model_vars])
    
    
#     try:
#         mse=mean_squared_error(X_test['score'],y_pred)
#     except:
#         mse=0
    
#     loss=mse
        
#     return {'loss':loss,'params':params,'status':STATUS_OK,'model':lgb}

In [33]:
# space= {
#     'n_estimators':hp.quniform('n_estimators',100,1000,100),
#     'num_leaves':hp.quniform('num_leaves',10,100,1),
#     'max_depth':hp.quniform('max_depth',5,25,1),
#     'min_child_samples':hp.quniform('min_child_samples',10,500,5),
#     'subsample':hp.uniform('subsample',0.0,1.0),
#     'colsample_bytree':hp.uniform('colsample_bytree',0.3,1.0),
#     'scale_pos_weight':hp.quniform('scale_pos_weight',1,40,1),
#     'learning_rate':hp.loguniform('learning_rate',math.log(0.001),math.log(4)),
#     'reg_alpha':hp.uniform('reg_alpha',0.0,1.0),
#     'reg_lambda':hp.uniform('reg_lambda',0.0,1.0)
# }

# max_evals=50

# trials=Trials()

# best_params =fmin(
#      fn=fit_lgb,
#      space=space,
#      algo=tpe.suggest,
#      max_evals=max_evals,
#      trials=trials
# )

# # fit model with the optmized parameters
# results= fit_lgb(best_params)
# mse_opt=results['loss']

# print(f'R2 (test data,,optimized): {mse_opt:.1%}')

In [34]:
#joblib.dump(results['model'],"lgb_opt_11182023")

In [35]:
#lgb_opt_11182023= joblib.load("/kaggle/input/model-lgb/lgb_opt_11182023")

In [36]:
try:
    # Code for generating submission file
    prediction=X_test_v.groupby('id').tail(1)[['id','score']]#X_test_v.groupby('id').agg({'score':"median"}).reset_index()
    print("yaba daba")
    prediction['score'] = np.clip(prediction['score'], a_min=0.5, a_max=6.0)
    print("dooooooo")
    prediction['id']=prediction['id'].astype("str")
    prediction['score']=prediction['score'].astype("float")
    prediction.to_csv('submission.csv', index=False)
except Exception as e:
    print(f"Error during submission: {e}")


yaba daba
dooooooo
