### Imports and plotting configuration

In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Apply matplotlib's built-in 'ggplot' style sheet to all subsequent figures.
plt.style.use('ggplot')

# Request 'retina' (high-resolution) output from the inline plotting backend.
%config InlineBackend.figure_format = 'retina'
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### Data

In [2]:
# Input files; bare file names are resolved against the kernel's working directory.
TELEMETRY_CSV = "telemetry_2021fall.csv"
CODEBOOK_XLSX = "codeSpark_telemetry_codebook.xlsx"

# Raw telemetry events and the codebook describing their columns.
data = pd.read_csv(TELEMETRY_CSV, na_values=np.nan)
codebook = pd.read_excel(CODEBOOK_XLSX)

# Keep only the rows that have a value in every column, then report how many survive.
df_nonan = data.dropna(how="any")
df_nonan.shape

(0, 33)

In [3]:
# Preview the first five rows of the raw telemetry table.
data.head()

Unnamed: 0,index,childId,date,time,chapter,level,session,serialNumber,attempt,trial,...,solType,repoBefore,repoAfter,repoBeforeEdit,repoAfterEdit,avtxyz,pathDist2opt,step,inxOptPath,stepsOptPath
0,926245,301,2021-09-24,08:11:13,1.0,1.0,2.0,12,0.0,1.0,...,,,,,,,,,,
1,926244,301,2021-09-24,08:11:13,1.0,1.0,2.0,13,1.0,1.0,...,,,,,,,,,,
2,926196,301,2021-09-24,08:11:17,1.0,1.0,2.0,14,1.0,1.0,...,1.0,,['Walk.Right'],,Walk.R,,,,,
3,926169,301,2021-09-24,08:11:19,1.0,1.0,2.0,15,1.0,1.0,...,1.0,['Walk.Right'],,Walk.R,,"0.00,0.02,0.00",,,,
4,926168,301,2021-09-24,08:11:19,1.0,1.0,2.0,17,1.0,1.0,...,,,,,,"0.00,0.02,0.00",0.0,0.0,0.0,2.0


In [4]:
# Preview the first five rows of the codebook that documents the telemetry columns.
codebook.head()

Unnamed: 0,Name,Description,Rule or Formula,Values
0,index,unique row number created in python,,
1,childId,Internal Child ID,internal mapping,100-199: data collection 2019-20\n200-299: dat...
2,chapter,codeSpark chapter,as defined by codeSpark,1: Donut Detective Chapter\n2: ...\n3: ...\n4:...
3,level,codeSpark level,as defined by codeSpark,"1-19, each chapter has a different number of l..."
4,session,codeSpark session,as defined by codeSpark: a session starts at a...,"1, 2, 3 .."


In [5]:
# Show the descriptions untruncated: with the default column width pandas
# shortens long text to "...", which leaves most descriptions unreadable.
# The option is lifted only inside this block, so other cells keep the default.
with pd.option_context("display.max_colwidth", None):
    display(codebook["Description"])

0                   unique row number created in python
1                                     Internal Child ID
2                                     codeSpark chapter
3                                       codeSpark level
4                                     codeSpark session
5     Part of a session, an attempt includes startin...
6     Part of an attempt. Each edit/execute sequence...
7     Timestamps used to link the gameplay telemetry...
8         event time in Pacific Time in format HH:mm:ss
9                       event data in format YYYY-MM-DD
10                                                  NaN
11                                                  NaN
12         avatar position in x,y Cartesian coordinates
13                                                  NaN
14                            number of edits per trial
15    number of edits per trial that don't change Co...
16     number of edits per trial that decrease CodeDist
17     number of edits per trial that increase C

Data dictionary: the full, untruncated codebook is available at https://docs.google.com/spreadsheets/d/1ettljIuY46tLbEZXc-H_ktKAatYMLsBYH2KKca-FkXM/edit#gid=0

In [6]:
# (rows, columns) of the raw telemetry table, before any cleaning.
data.shape

(401940, 33)

In [7]:
# List every column name in the raw telemetry table.
data.columns

Index(['index', 'childId', 'date', 'time', 'chapter', 'level', 'session',
       'serialNumber', 'attempt', 'trial', 'pause_s', 'sessionTimestamp',
       'sms', 'eventDescription', 'param', 'avtPos', 'distChange',
       'dist2optMax', 'dist2optBefore', 'dist2optAfter', 'nofCmdsBest',
       'nofCmdsBefore', 'nofCmdsAfter', 'solType', 'repoBefore', 'repoAfter',
       'repoBeforeEdit', 'repoAfterEdit', 'avtxyz', 'pathDist2opt', 'step',
       'inxOptPath', 'stepsOptPath'],
      dtype='object')

In [8]:
# Count the missing entries in each column of the raw table
data.isna().sum()

index                    0
childId                  0
date                     0
time                     0
chapter                216
level                  216
session                  0
serialNumber             0
attempt                  0
trial                    0
pause_s                747
sessionTimestamp         0
sms                      0
eventDescription         0
param               264916
avtPos               78574
distChange          299980
dist2optMax         299980
dist2optBefore      299980
dist2optAfter       299980
nofCmdsBest         299980
nofCmdsBefore       299980
nofCmdsAfter        299980
solType             284333
repoBefore          272059
repoAfter           303233
repoBeforeEdit      272059
repoAfterEdit       303233
avtxyz              137242
pathDist2opt        280450
step                280450
inxOptPath          280450
stepsOptPath        280450
dtype: int64

In [9]:
# Work on an independent deep copy so that edits to `df` never touch the raw `data`
df = data.copy(deep=True)

In [10]:
# Columns left out of the working frame: the row-number column `index`, plus the
# columns that the null counts show to have large numbers of missing values.
DROPPED_COLUMNS = [
    'index', 'param', 'avtPos', 'distChange',
    'dist2optMax', 'dist2optBefore', 'dist2optAfter', 'nofCmdsBest',
    'nofCmdsBefore', 'nofCmdsAfter', 'solType', 'repoBefore', 'repoAfter',
    'repoBeforeEdit', 'repoAfterEdit', 'avtxyz', 'pathDist2opt', 'step',
    'inxOptPath', 'stepsOptPath',
]

# Build `df` from the raw frame instead of dropping in place: re-running this
# cell then yields the same result rather than raising KeyError because the
# columns have already been removed from `df`.
df = data.drop(columns=DROPPED_COLUMNS)

In [11]:
# Preview the trimmed working frame to confirm which columns remain.
df.head()

Unnamed: 0,childId,date,time,chapter,level,session,serialNumber,attempt,trial,pause_s,sessionTimestamp,sms,eventDescription
0,301,2021-09-24,08:11:13,1.0,1.0,2.0,12,0.0,1.0,,767832.907,0.0,PuzzleIntroDone
1,301,2021-09-24,08:11:13,1.0,1.0,2.0,13,1.0,1.0,0.1,767841.254,0.0,PuzzleStart
2,301,2021-09-24,08:11:17,1.0,1.0,2.0,14,1.0,1.0,3.8,771640.313,3.8,CommandAdded
3,301,2021-09-24,08:11:19,1.0,1.0,2.0,15,1.0,1.0,2.0,773623.663,5.8,StartExecuteTrigger
4,301,2021-09-24,08:11:19,1.0,1.0,2.0,17,1.0,1.0,0.2,773751.507,5.9,ExecuteTriggerCommand


In [None]:
# Number of distinct values in each column, shown as a single Series instead of
# one printed line per column. dropna=False counts NaN as a value of its own,
# so the numbers match len(df[col].unique()).
df.nunique(dropna=False)