In [85]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode
from plotly.subplots import make_subplots

import warnings

import polars as pl

from helpers import *

from IPython.core.display import display, HTML

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting and
# dataframe libraries - consider narrowing the filter.
warnings.simplefilter("ignore")

# Inject CSS so the notebook's `.container` element uses 95% of the page width.
WIDE_CONTAINER_CSS = "<style>.container { width:95% !important; }</style>"
display(HTML(WIDE_CONTAINER_CSS))

In [86]:
# Load the preprocessed metadata table; the path is relative to the notebook's
# working directory. The cell displays the first rows as a quick sanity check.
METADATA_CSV = "../data/preprocessed_big_metadata.csv"

big_metadata = pl.read_csv(METADATA_CSV)
big_metadata.head()

PARTICIPANT_ID,AGE,GENDER,HAS_TAKEN_TYPING_COURSE,COUNTRY,LAYOUT,NATIVE_LANGUAGE,FINGERS,TIME_SPENT_TYPING,KEYBOARD_TYPE,ERROR_RATE,AVG_WPM_15,AVG_IKI,ECPC,KSPC,ROR,SPEED,AVG_KEYPRESS
i64,i64,str,i64,str,str,str,str,i64,str,f64,f64,f64,f64,f64,f64,str,f64
5,27,"""female""",0,"""MY""","""qwerty""","""en""","""7-8""",6,"""laptop""",0.87108,72.8871,150.457375,0.031469,1.101399,0.3675,"""fast""",102.780952
7,13,"""female""",0,"""AU""","""qwerty""","""en""","""7-8""",0,"""laptop""",6.685633,24.1809,386.575303,0.092105,1.292398,0.0667,"""slow""",122.280861
23,21,"""female""",0,"""IN""","""qwerty""","""en""","""3-4""",0,"""full""",2.130493,24.7112,457.947902,0.016624,1.07289,0.0413,"""slow""",128.350417
24,21,"""female""",0,"""PH""","""qwerty""","""tl""","""7-8""",1,"""laptop""",1.893287,45.3364,223.913395,0.045296,1.1777,0.2678,"""slow""",121.406805
25,19,"""male""",0,"""IN""","""qwerty""","""en""","""7-8""",1,"""laptop""",0.747384,54.6831,190.034172,0.055389,1.146707,0.4434,"""slow""",143.912533


**Things to do:**
- WPM vs Fingers
- WPM vs ROR
- KSPC vs Fingers
- KSPC vs ROR

**High-Performers:**
- Compare IKIs with other people
- Compare RORs with other people
- Compare Keypresses with other people

In [80]:
# Remove the rows whose FINGERS bucket is "10+", printing the frame shape
# before and after so the number of dropped rows is visible.
not_ten_plus = pl.col("FINGERS") != "10+"

print(big_metadata.shape)
big_metadata = big_metadata.filter(not_ten_plus)
print(big_metadata.shape)

(159156, 18)
(159155, 18)


In [81]:
# Mean AVG_WPM_15 per FINGERS bucket, ordered from the slowest bucket to the
# fastest. NOTE(review): newer polars releases spell this method `group_by`;
# `groupby` is kept here to match the API the rest of the notebook uses.
mean_wpm = pl.col("AVG_WPM_15").mean()

wpm_fingers = (
    big_metadata
    .groupby("FINGERS")
    .agg(mean_wpm)
    .sort(by="AVG_WPM_15")
)
wpm_fingers

FINGERS,AVG_WPM_15
str,f64
"""1-2""",40.459015
"""3-4""",41.334974
"""5-6""",45.98383
"""7-8""",50.28834
"""9-10""",57.74085


### WPM vs Fingers

In [None]:
# Mean WPM per finger bucket, drawn twice on the same axes: bars for the
# values and a line-plus-marker overlay on the same points.
finger_buckets = wpm_fingers["FINGERS"]
mean_wpm_values = wpm_fingers["AVG_WPM_15"]

fig = make_subplots()
fig.add_trace(go.Bar(x=finger_buckets, y=mean_wpm_values))
fig.add_trace(
    go.Scatter(
        x=finger_buckets,
        y=mean_wpm_values,
        line={"width": 5},
        marker={"size": 10},
    )
)

fig.update_layout(
    template="none",
    width=1600,
    height=1000,
    font={"size": 18},
    showlegend=False,
    xaxis_title="Fingers",
    yaxis_title="Average WPM",
)
# The y-axis is restricted to 30-60, so the bars do not start at zero.
# This call is the cell's last expression; its return value is what gets rendered.
fig.update_yaxes(range=[30, 60])


In [None]:
# Distribution of AVG_WPM_15 for each finger bucket (histogram + density
# curve per group, 2-WPM bins, rug plot disabled).
#
# The per-bucket samples are built in one comprehension instead of five
# copy-pasted lines, and `Series.to_list()` replaces `Series.view().tolist()`
# because `Series.view` is deprecated in newer polars releases.
finger_groups = ["1-2", "3-4", "5-6", "7-8", "9-10"]

fig = ff.create_distplot(
    [
        big_metadata.filter(pl.col("FINGERS") == group)["AVG_WPM_15"].to_list()
        for group in finger_groups
    ],
    group_labels=finger_groups,
    bin_size=2,
    show_rug=False,
)

fig.update_layout(
    template="none",
    width=1600,
    height=1000,
    xaxis=dict(dtick=10),
    font=dict(size=20),
    # Legend anchored by its top-left corner inside the plot area.
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.8,
        font=dict(size=30),
    ),
    xaxis_title="Words per Minute",
    yaxis_title="Density",
)

# Fixed x-window of 0-170 WPM; values outside it are not shown.
fig.update_xaxes(showgrid=True, range=[0, 170])
fig.show()

In [7]:
# Summary statistics of AVG_WPM_15 for a single finger bucket.
# Change `fingers` to inspect another bucket.
fingers = "1-2"

wpm_for_bucket = big_metadata.filter(pl.col("FINGERS") == fingers)["AVG_WPM_15"]

print(wpm_for_bucket.describe())
print("Skewness: ", wpm_for_bucket.skew())
print("Kurtosis: ", wpm_for_bucket.kurtosis())


shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ value     │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 19810.0   │
│ null_count ┆ 0.0       │
│ mean       ┆ 40.459015 │
│ std        ┆ 18.003714 │
│ min        ┆ 3.9074    │
│ max        ┆ 147.4052  │
│ median     ┆ 36.56795  │
│ 25%        ┆ 27.4059   │
│ 75%        ┆ 50.1659   │
└────────────┴───────────┘
Skewness:  1.0070164370586914
Kurtosis:  0.9547055883808655


### WPM vs ROR

In [None]:
# Scatter of rollover rate (ROR) against AVG_WPM_15 with an OLS trendline
# drawn in red. plotly express is given a pandas frame, hence `to_pandas()`.
fig = px.scatter(
    big_metadata.to_pandas(),
    x="ROR",
    y="AVG_WPM_15",
    trendline="ols",
    trendline_color_override="red",
)

fig.update_layout(
    template="none",
    width=1500,
    height=1300,
    font={"size": 20},
    xaxis={"dtick": 0.05},
    yaxis={"dtick": 10},
    showlegend=False,
    xaxis_title="Rollover Rate",
    yaxis_title="Words per Minute",
)
# Start slightly below zero so points at ROR == 0 are not clipped by the axis.
fig.update_xaxes(range=[-0.01, 0.8])

fig.show()

In [None]:
# Pearson correlation between rollover rate and average WPM.
# `pl.corr` (Pearson by default) replaces the deprecated `pl.pearson_corr`.
big_metadata.select(pl.corr("ROR", "AVG_WPM_15", method="pearson"))

### KSPC vs Fingers Used

In [None]:
# Distribution of KSPC (keystrokes per character) for each finger bucket,
# using 0.01-wide bins and no rug plot.
#
# The per-bucket samples are built in one comprehension instead of five
# copy-pasted lines, and `Series.to_list()` replaces `Series.view().tolist()`
# because `Series.view` is deprecated in newer polars releases.
finger_groups = ["1-2", "3-4", "5-6", "7-8", "9-10"]

fig = ff.create_distplot(
    [
        big_metadata.filter(pl.col("FINGERS") == group)["KSPC"].to_list()
        for group in finger_groups
    ],
    group_labels=finger_groups,
    show_rug=False,
    bin_size=0.01,
)

fig.update_layout(
    template="none",
    width=1600,
    height=1000,
    xaxis=dict(dtick=0.05),
    font=dict(size=20),
    # Legend anchored by its top-left corner inside the plot area.
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.8,
        font=dict(size=30),
    ),
    xaxis_title="Keystrokes per Character",
    yaxis_title="Density",
)

# Fixed x-window of 1-1.7; values outside it are not shown.
fig.update_xaxes(showgrid=True, range=[1, 1.7])
fig.show()

In [113]:
# Summary statistics of KSPC for a single finger bucket.
# Change `fingers` to inspect another bucket.
fingers = "9-10"

kspc_for_bucket = big_metadata.filter(pl.col("FINGERS") == fingers)["KSPC"]

print(kspc_for_bucket.describe())
print("Skewness: ", kspc_for_bucket.skew())
print("Kurtosis: ", kspc_for_bucket.kurtosis())


shape: (9, 2)
┌────────────┬──────────┐
│ statistic  ┆ value    │
│ ---        ┆ ---      │
│ str        ┆ f64      │
╞════════════╪══════════╡
│ count      ┆ 76024.0  │
│ null_count ┆ 0.0      │
│ mean       ┆ 1.161901 │
│ std        ┆ 0.08699  │
│ min        ┆ 1.012433 │
│ max        ┆ 2.367868 │
│ median     ┆ 1.143255 │
│ 25%        ┆ 1.101375 │
│ 75%        ┆ 1.201107 │
└────────────┴──────────┘
Skewness:  1.7351412078448063
Kurtosis:  5.9050091490544485


### KSPC vs ROR

In [None]:
# Scatter of rollover rate (ROR) against KSPC with an OLS trendline drawn in
# red. plotly express is given a pandas frame, hence `to_pandas()`.
fig = px.scatter(
    big_metadata.to_pandas(),
    x="ROR",
    y="KSPC",
    trendline="ols",
    trendline_color_override="red",
)

fig.update_layout(
    template="none",
    width=1500,
    height=1300,
    font={"size": 20},
    yaxis={"dtick": 0.25},
    xaxis={"dtick": 0.05},
    showlegend=False,
    xaxis_title="Rollover Rate",
    yaxis_title="Keystrokes per Character",
)

# Only the y-axis is pinned (1-3); the x-axis range is left to plotly.
fig.update_yaxes(range=[1, 3])

fig.show()

In [None]:
# Pearson correlation between rollover rate and keystrokes per character.
# `pl.corr` (Pearson by default) replaces the deprecated `pl.pearson_corr`.
big_metadata.select(pl.corr("ROR", "KSPC", method="pearson"))

In [None]:
# Wide-range view of the KSPC distribution per finger bucket: coarser
# 0.08-wide bins and an x-window of 0.9-5 so the long right tail is visible.
#
# The per-bucket samples are built in one comprehension instead of five
# copy-pasted lines, and `Series.to_list()` replaces `Series.view().tolist()`
# because `Series.view` is deprecated in newer polars releases.
finger_groups = ["1-2", "3-4", "5-6", "7-8", "9-10"]

fig = ff.create_distplot(
    [
        big_metadata.filter(pl.col("FINGERS") == group)["KSPC"].to_list()
        for group in finger_groups
    ],
    group_labels=finger_groups,
    show_rug=False,
    bin_size=0.08,
)

fig.update_layout(
    template="none",
    width=1600,
    height=1300,
    xaxis=dict(dtick=0.5),
    font=dict(size=18),
    # Legend anchored by its top-left corner inside the plot area.
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.8,
        font=dict(size=30),
    ),
    xaxis_title="KSPC",
    yaxis_title="Density",
)

fig.update_xaxes(showgrid=True, range=[0.9, 5])
fig.show()

### High-performers

In [125]:
# Cut-offs used to define a "high performer": the 75th percentile of
# AVG_WPM_15 and the 25th percentile of ERROR_RATE.
wpm_hp = big_metadata.get_column("AVG_WPM_15").quantile(0.75)
error_hp = big_metadata.get_column("ERROR_RATE").quantile(0.25)

(wpm_hp, error_hp)

(64.4577, 0.314465408805031)

In [126]:
# Add a 0/1 HIGH_PERFORMER column instead of filtering the frame, so both
# groups stay available for the comparisons that follow. A row is flagged
# when its WPM is at or above `wpm_hp` AND its error rate is at or below
# `error_hp`.
is_high_performer = (pl.col("AVG_WPM_15") >= wpm_hp) & (pl.col("ERROR_RATE") <= error_hp)

big_metadata_1 = big_metadata.with_columns([
    pl.when(is_high_performer).then(1).otherwise(0).alias("HIGH_PERFORMER"),
])

big_metadata_1["HIGH_PERFORMER"].value_counts()

HIGH_PERFORMER,counts
i32,u32
0,144615
1,14541


### Compare IKIs with other people

In [None]:
# Distribution of AVG_IKI (inter-key interval, ms) for high performers vs.
# everyone else.
#
# The two samples are built from one flag -> label mapping instead of
# duplicated filter lines, and `Series.to_list()` replaces
# `Series.view().tolist()` because `Series.view` is deprecated in newer
# polars releases.
performer_labels = {0: "Not High Performer", 1: "High Performer"}

fig = ff.create_distplot(
    [
        big_metadata_1.filter(pl.col("HIGH_PERFORMER") == flag)["AVG_IKI"].to_list()
        for flag in performer_labels
    ],
    group_labels=list(performer_labels.values()),
    show_rug=False,
)

fig.update_layout(
    template="none",
    width=1600,
    height=1000,
    xaxis=dict(dtick=50),
    font=dict(size=20),
    # Legend anchored by its top-left corner inside the plot area.
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.8,
        font=dict(size=30),
    ),
    xaxis_title="Inter-Key Interval Time (ms)",
    yaxis_title="Density",
)

# Fixed x-window of 0-600 ms; values outside it are not shown.
fig.update_xaxes(showgrid=True, range=[0, 600])
fig.show()

### Compare RORs with other people

In [None]:
# Distribution of ROR (rollover rate) for high performers vs. everyone else,
# using 0.01-wide bins.
#
# The two samples are built from one flag -> label mapping instead of
# duplicated filter lines, and `Series.to_list()` replaces
# `Series.view().tolist()` because `Series.view` is deprecated in newer
# polars releases.
performer_labels = {0: "Not High Performer", 1: "High Performer"}

fig = ff.create_distplot(
    [
        big_metadata_1.filter(pl.col("HIGH_PERFORMER") == flag)["ROR"].to_list()
        for flag in performer_labels
    ],
    group_labels=list(performer_labels.values()),
    show_rug=False,
    bin_size=0.01,
)

fig.update_layout(
    template="none",
    width=1600,
    height=1000,
    xaxis=dict(dtick=0.05),
    font=dict(size=20),
    # Legend anchored by its top-left corner inside the plot area.
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.8,
        font=dict(size=30),
    ),
    xaxis_title="Rollover Rate",
    yaxis_title="Density",
)

fig.update_xaxes(showgrid=True, range=[0, 1])
fig.show()

### Compare Keypresses with other people

In [None]:
# Distribution of AVG_KEYPRESS (keypress duration, ms) for high performers
# vs. everyone else.
#
# The two samples are built from one flag -> label mapping instead of
# duplicated filter lines, and `Series.to_list()` replaces
# `Series.view().tolist()` because `Series.view` is deprecated in newer
# polars releases.
performer_labels = {0: "Not High Performer", 1: "High Performer"}

fig = ff.create_distplot(
    [
        big_metadata_1.filter(pl.col("HIGH_PERFORMER") == flag)["AVG_KEYPRESS"].to_list()
        for flag in performer_labels
    ],
    group_labels=list(performer_labels.values()),
    show_rug=False,
)

fig.update_layout(
    template="none",
    width=1600,
    height=1000,
    xaxis=dict(dtick=25),
    font=dict(size=20),
    # Legend anchored by its top-left corner inside the plot area.
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.8,
        font=dict(size=30),
    ),
    xaxis_title="Keypress duration (ms)",
    yaxis_title="Density",
)

# The x-axis range is intentionally left on autoscale here, so extreme
# keypress durations stretch the axis.
fig.show()

In [166]:
# Keep only rows whose average keypress duration is below the cutoff; `bm`
# feeds the summary statistics computed in the following cells.
KEYPRESS_CUTOFF = 700
bm = big_metadata_1.filter(pl.col("AVG_KEYPRESS") < KEYPRESS_CUTOFF)

In [189]:
# Summary statistics of the chosen metric for the HIGH_PERFORMER == 1 rows of
# the outlier-trimmed frame `bm`. `metric` is also read by the next cell.
metric = "AVG_KEYPRESS"
high_performer_values = bm.filter(pl.col("HIGH_PERFORMER") == 1)[metric]

print("High Performers")
print(high_performer_values.describe())
print("Skewness: ", high_performer_values.skew())
print("Kurtosis: ", high_performer_values.kurtosis())

High Performers
shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ value      │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 14541.0    │
│ null_count ┆ 0.0        │
│ mean       ┆ 108.900055 │
│ std        ┆ 19.86985   │
│ min        ┆ 8.616685   │
│ max        ┆ 226.205656 │
│ median     ┆ 106.970109 │
│ 25%        ┆ 95.204513  │
│ 75%        ┆ 120.478836 │
└────────────┴────────────┘
Skewness:  0.6547444095699388
Kurtosis:  1.062872606199865


In [190]:
# Same summary as for the high performers, but for the HIGH_PERFORMER == 0
# rows; relies on `metric` and `bm` defined in earlier cells.
regular_values = bm.filter(pl.col("HIGH_PERFORMER") == 0)[metric]

print("Regular participants")
print(regular_values.describe())
print("Skewness: ", regular_values.skew())
print("Kurtosis: ", regular_values.kurtosis())

Regular participants
shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ value      │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 144607.0   │
│ null_count ┆ 0.0        │
│ mean       ┆ 120.825241 │
│ std        ┆ 31.837429  │
│ min        ┆ 2.481481   │
│ max        ┆ 687.409371 │
│ median     ┆ 116.333333 │
│ 25%        ┆ 100.74295  │
│ 75%        ┆ 134.74359  │
└────────────┴────────────┘
Skewness:  2.920535782585014
Kurtosis:  23.576127433810097
