In [49]:
# importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import make_pipeline

In [2]:
# Directories Where the data is present
train_logs_directory = os.path.join("Data","train_logs.csv")
train_scores_directory = os.path.join("Data","train_scores.csv")
test_logs_directory = os.path.join("Data","test_logs.csv")

# Loading Dataset

In [3]:
# following the naming convention that everything related
# to train will be followed by train_ and everything followed by test will be followed by test_
train_logs_df = pd.read_csv(train_logs_directory)
test_logs_df = pd.read_csv(test_logs_directory)
train_scores_df = pd.read_csv(train_scores_directory)

In [9]:
train_df = pd.merge(train_logs_df,train_scores_df,on = "id",how = "inner")
train_df.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,3.5
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,3.5
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,3.5


# Exploratory Data Analysis

In [13]:
# No null values are present
# train_logs_df.isnull().sum()

In [16]:
train_df.dtypes

id                  object
event_id             int64
down_time            int64
up_time              int64
action_time          int64
activity            object
down_event          object
up_event            object
text_change         object
cursor_position      int64
word_count           int64
score              float64
dtype: object

In [17]:
train_df.columns

Index(['id', 'event_id', 'down_time', 'up_time', 'action_time', 'activity',
       'down_event', 'up_event', 'text_change', 'cursor_position',
       'word_count', 'score'],
      dtype='object')

In [18]:
# Highly skewed dataset
train_df["activity"].value_counts()

activity
Input                                     6726796
Remove/Cut                                 970158
Nonproduction                              703851
Replace                                      4448
Paste                                         599
Move From [1306, 1371] To [1061, 1126]          2
Move From [13, 65] To [9, 61]                   1
Move From [274, 314] To [299, 339]              1
Move From [624, 625] To [845, 846]              1
Move From [1861, 2063] To [1766, 1968]          1
Move From [1766, 1968] To [1861, 2063]          1
Move From [2091, 2179] To [252, 340]            1
Move From [923, 1077] To [340, 494]             1
Move From [0, 1] To [590, 591]                  1
Move From [999, 1000] To [1000, 1001]           1
Move From [0, 75] To [1, 76]                    1
Move From [1651, 1769] To [1565, 1683]          1
Move From [61, 136] To [0, 75]                  1
Move From [623, 632] To [624, 633]              1
Move From [75, 134] To [304, 363]        

In [19]:
train_df["activity"]

0          Nonproduction
1          Nonproduction
2          Nonproduction
3                  Input
4                  Input
               ...      
8405893    Nonproduction
8405894    Nonproduction
8405895          Replace
8405896    Nonproduction
8405897            Input
Name: activity, Length: 8405898, dtype: object

In [20]:
train_df.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,3.5
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,3.5
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,3.5


In [26]:
train_df["activity"].unique()

array(['Nonproduction', 'Input', 'Remove/Cut', 'Replace',
       'Move From [284, 292] To [282, 290]',
       'Move From [287, 289] To [285, 287]',
       'Move From [460, 461] To [465, 466]', 'Paste',
       'Move From [905, 1314] To [907, 1316]',
       'Move From [565, 743] To [669, 847]',
       'Move From [669, 847] To [565, 743]',
       'Move From [1041, 1121] To [1496, 1576]',
       'Move From [1455, 1557] To [1323, 1425]',
       'Move From [2268, 2275] To [2247, 2254]',
       'Move From [213, 302] To [902, 991]',
       'Move From [0, 158] To [234, 392]',
       'Move From [460, 465] To [925, 930]',
       'Move From [810, 906] To [816, 912]',
       'Move From [186, 187] To [184, 185]',
       'Move From [140, 272] To [299, 431]',
       'Move From [114, 140] To [272, 298]',
       'Move From [1386, 1450] To [1445, 1509]',
       'Move From [442, 524] To [296, 378]',
       'Move From [408, 414] To [390, 396]',
       'Move From [1144, 1147] To [1142, 1145]',
       'Move 

In [24]:
train_df["up_event"].unique()

array(['Leftclick', 'Shift', 'q', 'Space', 'Backspace', '.', ',', 'Enter',
       'ArrowLeft', "'", ';', 'ArrowRight', '-', '?', 'Tab', '"',
       'ArrowUp', 'ArrowDown', 'm', 'Rightclick', 'i', 'o', 't', '=', 'a',
       'CapsLock', 'Control', 'c', 'v', '/', 'Delete', ':', 'z', '[', '$',
       '(', ')', '+', 'Home', 'End', '\\', 'Meta', '*', '&',
       'AudioVolumeMute', 'x', '!', 'Insert', 'MediaPlayPause', 'w',
       'NumLock', '%', 'V', 'b', '>', 'Alt', 'AudioVolumeUp',
       'ContextMenu', 'AudioVolumeDown', 'n', 'e', '<', 'PageDown', ']',
       'Middleclick', '@', 'F12', 'u', 'j', 's', '\x96', 'Dead', 'y', '{',
       'ScrollLock', '¿', 'p', 'Process', '}', 'MediaTrackPrevious',
       'MediaTrackNext', 'F3', '^', 'Unidentified', 'Cancel', 'h', '2',
       'd', 'r', '`', '\x9b', 'f', 'g', '#', '~', 'PageUp', 'l', 'T', 'A',
       'S', 'ModeChange', '_', 'Escape', 'F11', 'Unknownclick',
       'AltGraph', 'F10', 'F15', 'Clear', 'OS', 'C', 'Ä±', 'M', '|',
       'â\x80\x93', 

# Data Filteration

In [32]:
max(train_df.event_id)

12876

In [31]:
train_df.id

0          001519c8
1          001519c8
2          001519c8
3          001519c8
4          001519c8
             ...   
8405893    fff05981
8405894    fff05981
8405895    fff05981
8405896    fff05981
8405897    fff05981
Name: id, Length: 8405898, dtype: object

In [28]:
train_df.columns # id_event_id -> id * 100000 + event_id

Index(['id', 'event_id', 'down_time', 'up_time', 'action_time', 'activity',
       'down_event', 'up_event', 'text_change', 'cursor_position',
       'word_count', 'score'],
      dtype='object')

In [40]:
# Splitting the dataset between the categorical and numerical values
num_attributes = ["id","event_id","down_time","up_time","action_time","cursor_position","word_count"]
cat_attributes = ["activity","down_event","up_event","text_change"]


In [41]:
num_pipeline = ()
cat_pipeline = ()

In [72]:
class Reduce(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self,X,y=None):
        """Remove the id column"""
        self.remove_column = "id"
        return self
    
    def transform(self,X):
        print("X before entering ", X.columns)
        if self.remove_column in X.columns:
            X = X.drop(columns = self.remove_column)
        print("X after ", X.shape)
        return X
        

In [73]:
temp = make_pipeline(Reduce())

In [74]:
temp.fit_transform(train_df)

X before entering  Index(['id', 'event_id', 'down_time', 'up_time', 'action_time', 'activity',
       'down_event', 'up_event', 'text_change', 'cursor_position',
       'word_count', 'score'],
      dtype='object')
X after  (8405898, 11)


Unnamed: 0,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,score
0,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
1,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,3.5
2,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,3.5
3,4,106686,106777,91,Input,q,q,q,1,1,3.5
4,5,107196,107323,127,Input,q,q,q,2,1,3.5
...,...,...,...,...,...,...,...,...,...,...,...
8405893,3615,2063944,2064440,496,Nonproduction,Leftclick,Leftclick,NoChange,1031,240,4.0
8405894,3616,2064497,2064497,0,Nonproduction,Shift,Shift,NoChange,1031,240,4.0
8405895,3617,2064657,2064765,108,Replace,q,q,q => q,1031,240,4.0
8405896,3618,2069186,2069259,73,Nonproduction,Leftclick,Leftclick,NoChange,1028,240,4.0


In [77]:
processing = ColumnTransformer([
    ("Remove id",make_pipeline(Reduce()), num_attributes)
])

In [78]:
processing.fit_transform(train_df)

X before entering  Index(['id', 'event_id', 'down_time', 'up_time', 'action_time',
       'cursor_position', 'word_count'],
      dtype='object')
X after  (8405898, 6)


array([[      1,    4526,    4557,      31,       0,       0],
       [      2,    4558,    4962,     404,       0,       0],
       [      3,  106571,  106571,       0,       0,       0],
       ...,
       [   3617, 2064657, 2064765,     108,    1031,     240],
       [   3618, 2069186, 2069259,      73,    1028,     240],
       [   3619, 2070065, 2070133,      68,    1029,     240]])

In [None]:
train_df = train_df.drop("id")

In [33]:
train_df.dtypes

id                  object
event_id             int64
down_time            int64
up_time              int64
action_time          int64
activity            object
down_event          object
up_event            object
text_change         object
cursor_position      int64
word_count           int64
score              float64
dtype: object