In [5]:
# importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

In [6]:
# Paths of the CSV files; all of them live inside the "Data" folder.
train_logs_directory, train_scores_directory, test_logs_directory = (
    os.path.join("Data", csv_name)
    for csv_name in ("train_logs.csv", "train_scores.csv", "test_logs.csv")
)

# Loading Dataset

In [7]:
# Naming convention: every object derived from the training files is
# prefixed with "train_", every object derived from the test file with "test_".
train_logs_df = pd.read_csv(train_logs_directory)
train_scores_df = pd.read_csv(train_scores_directory)
test_logs_df = pd.read_csv(test_logs_directory)

In [8]:
# Inner join on "id": each log event row gets the score of its essay attached.
train_df = train_logs_df.merge(train_scores_df, how="inner", on="id")
train_df.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,3.5
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,3.5
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,3.5


# Exploratory Data Analysis

In [9]:
# No null values are present
# train_logs_df.isnull().sum()

In [10]:
# Show the dtype of every column of the merged training frame.
train_df.dtypes

id                  object
event_id             int64
down_time            int64
up_time              int64
action_time          int64
activity            object
down_event          object
up_event            object
text_change         object
cursor_position      int64
word_count           int64
score              float64
dtype: object

In [11]:
# List the column names of the merged training frame.
train_df.columns

Index(['id', 'event_id', 'down_time', 'up_time', 'action_time', 'activity',
       'down_event', 'up_event', 'text_change', 'cursor_position',
       'word_count', 'score'],
      dtype='object')

In [12]:
# Frequency of each activity label. The distribution is highly skewed:
# "Input" dominates, and every "Move From [...] To [...]" label embeds its own
# cursor coordinates, so those labels appear only once or twice each.
train_df["activity"].value_counts()

activity
Input                                     6726796
Remove/Cut                                 970158
Nonproduction                              703851
Replace                                      4448
Paste                                         599
Move From [1306, 1371] To [1061, 1126]          2
Move From [13, 65] To [9, 61]                   1
Move From [274, 314] To [299, 339]              1
Move From [624, 625] To [845, 846]              1
Move From [1861, 2063] To [1766, 1968]          1
Move From [1766, 1968] To [1861, 2063]          1
Move From [2091, 2179] To [252, 340]            1
Move From [923, 1077] To [340, 494]             1
Move From [0, 1] To [590, 591]                  1
Move From [999, 1000] To [1000, 1001]           1
Move From [0, 75] To [1, 76]                    1
Move From [1651, 1769] To [1565, 1683]          1
Move From [61, 136] To [0, 75]                  1
Move From [623, 632] To [624, 633]              1
Move From [75, 134] To [304, 363]        

In [13]:
# Look at the raw activity column (pandas truncates the display of long Series).
train_df["activity"]

0          Nonproduction
1          Nonproduction
2          Nonproduction
3                  Input
4                  Input
               ...      
8405893    Nonproduction
8405894    Nonproduction
8405895          Replace
8405896    Nonproduction
8405897            Input
Name: activity, Length: 8405898, dtype: object

In [14]:
# Preview the first five rows of the merged training frame.
train_df.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,3.5
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,3.5
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,3.5


# Data Filtration

In [21]:
# Column groups: numerical columns vs. categorical (string) columns.
# "id" travels with the numerical group so that the Remove_id step can drop it.
num_attributes = [
    "id",
    "event_id",
    "down_time",
    "up_time",
    "action_time",
    "cursor_position",
    "word_count",
]
cat_attributes = [
    "activity",
    "down_event",
    "up_event",
    "text_change",
]


In [22]:
# Empty placeholders for the numerical and categorical pipelines.
num_pipeline = cat_pipeline = ()

In [80]:
class Remove_id(BaseEstimator,TransformerMixin):
    """Drop the ``id`` column from a DataFrame and return the rest as a NumPy array.

    Output feature names are the remaining input column names, each prefixed
    with ``Reduce_``.
    """

    def __init__(self):
        pass

    def fit(self,X,y=None):
        """Record the column to drop and the input column names.

        Nothing is learned from the data values themselves.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; may or may not contain the ``id`` column.
        y : ignored

        Returns
        -------
        self
        """
        self.remove_column = "id"
        # Remember the input columns so get_feature_names_out() can also be
        # called without arguments.
        if hasattr(X, "columns"):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self,X):
        """Return ``X`` without the ``id`` column, converted to a NumPy array."""
        if self.remove_column in X.columns:
            X = X.drop(columns = self.remove_column)
        return X.to_numpy()

    def get_feature_names_out(self, input_features=None):
        """Return the output feature names.

        The dropped column is removed *by name*, so the result matches what
        ``transform`` emits whether ``id`` is first, elsewhere, or absent.

        Parameters
        ----------
        input_features : sequence of str, optional
            Input column names. Defaults to the columns seen during ``fit``.

        Returns
        -------
        list of str

        Raises
        ------
        ValueError
            If ``input_features`` is None and no column names were recorded.
        """
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", None)
            if input_features is None:
                raise ValueError(
                    "input_features must be given when the transformer was "
                    "not fitted on a DataFrame"
                )
        dropped = getattr(self, "remove_column", "id")
        return [f"Reduce_{name}" for name in input_features if name != dropped]

In [81]:
class Reduce_text_change(BaseEstimator,TransformerMixin):
    """Append a signed length-change value derived from the ``text_change`` column.

    The output keeps every input column and adds one extra column holding,
    for each row:

    * ``0`` when the value is ``"NoChange"``;
    * ``len(new) - len(old)`` when the value contains ``"=>"``
      (e.g. ``"q => q"``), split at the first ``"=>"``;
    * ``len(value)`` otherwise.
    """

    def __init__(self):
        pass

    def fit(self,X,y=None):
        """Stateless fit: only records the input column names and returns self."""
        # Remember the input columns so get_feature_names_out() can also be
        # called without arguments.
        if hasattr(X, "columns"):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    @staticmethod
    def _length_delta(element):
        """Return the length change encoded by one ``text_change`` value."""
        if element == "NoChange":
            return 0
        # partition() splits at the first "=>" only, so a value that contains
        # the marker more than once no longer breaks tuple unpacking.
        left, marker, right = element.partition("=>")
        if marker:
            return len(right) - len(left)
        return len(element)

    def transform(self,X):
        """Return ``X`` plus the length-change column as a NumPy array.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``text_change`` column of strings.

        Returns
        -------
        numpy.ndarray
            One row per input row; the last column is the length change.
        """
        # Build the new column on X's own index: pd.concat(axis=1) aligns on
        # index labels, so a default RangeIndex would misalign (and pad with
        # NaN) whenever X has been filtered, shuffled or split.
        deltas = pd.Series(
            [self._length_delta(element) for element in X["text_change"]],
            index=X.index,
        )
        return pd.concat([X, deltas], axis=1).to_numpy()

    def get_feature_names_out(self, input_features=None):
        """Return the output names: prefixed input names plus the added column.

        Parameters
        ----------
        input_features : sequence of str, optional
            Input column names. Defaults to the columns seen during ``fit``,
            or ``["text_change"]`` when none were recorded.

        Returns
        -------
        list of str
        """
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", ["text_change"])
        # "New_dawn" is the existing name of the appended length-change column.
        return [f"text_changed_{i}" for i in [*input_features, "New_dawn"]]

In [82]:
# Column-wise preprocessing:
#   "RemoveId" - passes the numerical columns through after dropping "id".
#   "ValueSum" - adds the length-change value computed from "text_change".
processing = ColumnTransformer(
    transformers=[
        ("RemoveId", make_pipeline(Remove_id()), num_attributes),
        # Categorical one-hot encoding is currently disabled:
        # ("cat", make_pipeline(OneHotEncoder(handle_unknown="ignore",sparse_output=False)),cat_attributes),
        ("ValueSum", make_pipeline(Reduce_text_change()), ["text_change"]),
    ]
)

In [87]:
# Fit the preprocessing on the full training frame and transform it.
# NOTE(review): the execution counts show this cell was last run after the
# display cell below it; re-run both in order so the displayed table is current.
temp = processing.fit_transform(train_df)

In [86]:
# Show the transformed array as a table labelled with the generated feature names.
pd.DataFrame(
    data=temp,
    columns=processing.get_feature_names_out(),
)

Unnamed: 0,RemoveId__Reduce_event_id,RemoveId__Reduce_down_time,RemoveId__Reduce_up_time,RemoveId__Reduce_action_time,RemoveId__Reduce_cursor_position,RemoveId__Reduce_word_count,ValueSum__text_changed_text_change,ValueSum__text_changed_New_dawn
0,1,4526,4557,31,0,0,NoChange,0
1,2,4558,4962,404,0,0,NoChange,0
2,3,106571,106571,0,0,0,NoChange,0
3,4,106686,106777,91,1,1,q,1
4,5,107196,107323,127,2,1,q,1
...,...,...,...,...,...,...,...,...
8405893,3615,2063944,2064440,496,1031,240,NoChange,0
8405894,3616,2064497,2064497,0,1031,240,NoChange,0
8405895,3617,2064657,2064765,108,1031,240,q => q,0
8405896,3618,2069186,2069259,73,1028,240,NoChange,0
