### Imports

In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Apply matplotlib's built-in 'ggplot' style sheet to all figures in this notebook.
plt.style.use('ggplot')

# Ask the inline backend for high-resolution ('retina') figures and render
# plots directly in the cell output.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

### Data

In [2]:
# Load the raw telemetry export into `data`.
# pandas' default NA handling already treats empty fields and "nan"/"NaN" strings
# as missing, so the previous `na_values=np.nan` argument was a no-op and is dropped.
data = pd.read_csv("telemetry_2021fall.csv")

In [3]:
# Preview the first rows of the raw frame to sanity-check the load.
data.head()

Unnamed: 0,index,childId,date,time,chapter,level,session,serialNumber,attempt,trial,...,solType,repoBefore,repoAfter,repoBeforeEdit,repoAfterEdit,avtxyz,pathDist2opt,step,inxOptPath,stepsOptPath
0,926245,301,2021-09-24,08:11:13,1.0,1.0,2.0,12,0.0,1.0,...,,,,,,,,,,
1,926244,301,2021-09-24,08:11:13,1.0,1.0,2.0,13,1.0,1.0,...,,,,,,,,,,
2,926196,301,2021-09-24,08:11:17,1.0,1.0,2.0,14,1.0,1.0,...,1.0,,['Walk.Right'],,Walk.R,,,,,
3,926169,301,2021-09-24,08:11:19,1.0,1.0,2.0,15,1.0,1.0,...,1.0,['Walk.Right'],,Walk.R,,"0.00,0.02,0.00",,,,
4,926168,301,2021-09-24,08:11:19,1.0,1.0,2.0,17,1.0,1.0,...,,,,,,"0.00,0.02,0.00",0.0,0.0,0.0,2.0


Data dictionary: [Google Sheet](https://docs.google.com/spreadsheets/d/1ettljIuY46tLbEZXc-H_ktKAatYMLsBYH2KKca-FkXM/edit#gid=0)

In [4]:
# Dimensions of the raw frame as (rows, columns).
data.shape

(401940, 33)

In [5]:
# Full list of column names in the raw frame.
data.columns

Index(['index', 'childId', 'date', 'time', 'chapter', 'level', 'session',
       'serialNumber', 'attempt', 'trial', 'pause_s', 'sessionTimestamp',
       'sms', 'eventDescription', 'param', 'avtPos', 'distChange',
       'dist2optMax', 'dist2optBefore', 'dist2optAfter', 'nofCmdsBest',
       'nofCmdsBefore', 'nofCmdsAfter', 'solType', 'repoBefore', 'repoAfter',
       'repoBeforeEdit', 'repoAfterEdit', 'avtxyz', 'pathDist2opt', 'step',
       'inxOptPath', 'stepsOptPath'],
      dtype='object')

In [6]:
# Count the missing (NaN) entries in each column of the raw frame.
data.isna().sum()

index                    0
childId                  0
date                     0
time                     0
chapter                216
level                  216
session                  0
serialNumber             0
attempt                  0
trial                    0
pause_s                747
sessionTimestamp         0
sms                      0
eventDescription         0
param               264916
avtPos               78574
distChange          299980
dist2optMax         299980
dist2optBefore      299980
dist2optAfter       299980
nofCmdsBest         299980
nofCmdsBefore       299980
nofCmdsAfter        299980
solType             284333
repoBefore          272059
repoAfter           303233
repoBeforeEdit      272059
repoAfterEdit       303233
avtxyz              137242
pathDist2opt        280450
step                280450
inxOptPath          280450
stepsOptPath        280450
dtype: int64

In [7]:
# Work on an independent deep copy so the raw `data` frame stays untouched.
df = data.copy(deep=True)

In [8]:
# Columns removed from the working frame: the raw `index` column plus the
# detail columns that show large missing-value counts in the null summary above.
UNUSED_COLUMNS = [
    'index', 'param', 'avtPos', 'distChange',
    'dist2optMax', 'dist2optBefore', 'dist2optAfter', 'nofCmdsBest',
    'nofCmdsBefore', 'nofCmdsAfter', 'solType', 'repoBefore', 'repoAfter',
    'repoBeforeEdit', 'repoAfterEdit', 'avtxyz', 'pathDist2opt', 'step',
    'inxOptPath', 'stepsOptPath',
]

# Derive `df` from the raw frame rather than dropping in place: an in-place drop
# raises KeyError when the cell is re-run (the columns are already gone), whereas
# this form is idempotent and still fails loudly on a misspelled column name.
# The trailing `.copy()` ensures `df` shares no data with `data`.
df = data.drop(columns=UNUSED_COLUMNS).copy()

In [9]:
# Preview the working frame after the unused columns have been dropped.
df.head()

Unnamed: 0,childId,date,time,chapter,level,session,serialNumber,attempt,trial,pause_s,sessionTimestamp,sms,eventDescription
0,301,2021-09-24,08:11:13,1.0,1.0,2.0,12,0.0,1.0,,767832.907,0.0,PuzzleIntroDone
1,301,2021-09-24,08:11:13,1.0,1.0,2.0,13,1.0,1.0,0.1,767841.254,0.0,PuzzleStart
2,301,2021-09-24,08:11:17,1.0,1.0,2.0,14,1.0,1.0,3.8,771640.313,3.8,CommandAdded
3,301,2021-09-24,08:11:19,1.0,1.0,2.0,15,1.0,1.0,2.0,773623.663,5.8,StartExecuteTrigger
4,301,2021-09-24,08:11:19,1.0,1.0,2.0,17,1.0,1.0,0.2,773751.507,5.9,ExecuteTriggerCommand


In [10]:
# NOTE(review): disabled plotting template. None of the columns referenced below
# ('neighbourhood', 'minimum_nights', 'number_of_reviews', ...) exist in `df`,
# so this cell would raise KeyError if uncommented as-is. Adapt it to this
# dataset's columns or delete the cell.
# fig, ax = plt.subplots(3, 3, figsize = (20,15))
# df['neighbourhood'].value_counts().head(30).plot(ax=ax[0][0], kind='bar')
# df['neighbourhood_group'].value_counts().head(10).plot(ax=ax[0][1], kind='bar')
# df['minimum_nights'].value_counts().head(10).plot(ax=ax[0][2], kind='bar')
# df['number_of_reviews'].value_counts().head(10).plot(ax=ax[1][0], kind='bar')
# df['number_of_host_listings'].value_counts().head(10).plot(ax=ax[1][1], kind='bar')
# df['availability'].value_counts().head(10).plot(ax=ax[1][2], kind='bar')
# df['reviews_year'].value_counts().head(10).plot(ax=ax[2][0], kind='bar')
# plt.legend()

In [11]:
# Print how many distinct values each column holds; a missing value (NaN)
# counts as one distinct value, matching len(Series.unique()).
distinct_per_column = df.nunique(dropna=False)
for column_name, distinct_count in distinct_per_column.items():
    print('{} : {} unique values'.format(column_name, distinct_count))

childId : 73 unique values
date : 59 unique values
time : 29306 unique values
chapter : 6 unique values
level : 18 unique values
session : 37 unique values
serialNumber : 3527 unique values
attempt : 31 unique values
trial : 197 unique values
pause_s : 1881 unique values
sessionTimestamp : 401894 unique values
sms : 5014 unique values
eventDescription : 15 unique values
