In [407]:
import pandas as pd
import numpy as np

In [408]:
# Load the raw interaction log. Column names are supplied explicitly, so the
# first row of the file is read as data rather than as a header.
data = pd.read_csv(
    "../logs/all.csv",
    names=[
        "time", "source", "user", "session",
        "userAgent", "screenWidth", "screenHeight",
        "windowWidth", "windowHeight",
        "resolution", "graph", "position",
    ],
)

## Cleanup

Remove the first part of the source which is unimportant

In [409]:
# Strip the common URL prefix so only the "<scenario>/<story>..." part remains.
# regex=False: the prefix is a literal string. Without it the behaviour depends
# on the pandas version, and older versions read every "." as a regex wildcard.
data["source"] = data["source"].str.replace(
    "https://jonasoesch.ch/content/work/mortality/", "", regex=False
)

Split source into scenario and story

In [410]:
# The scenario is the first "/"-separated segment of the shortened source.
data["scenario"] = data["source"].str.split("/").str[0]

In [411]:
# The story is the second "/"-separated segment of the source, cut at its first "."
# (presumably this drops a file extension — TODO confirm against the logged URLs).
data["story"] = data["source"].str.split("/", expand=True)[1].str.split(".", expand=True)[0]

In [412]:
#data = data.drop(["source"], axis=1)

Remove entries where the graph has been undefined (not drawn yet)

In [413]:
# Keep every row whose graph value is not the literal string "undefined".
data = data.loc[data["graph"].ne("undefined")]

## Plausibility checks

### When was the first recording by scenario

In [414]:
# Earliest logged timestamp per scenario, converted from epoch milliseconds.
data.groupby("scenario")["time"].min().pipe(pd.to_datetime, unit="ms")

scenario
juxtaposed-animated   2019-02-04 10:03:03.177
juxtaposed-static     2019-02-04 19:00:43.956
superposed-animated   2019-02-04 18:59:35.878
superposed-static     2019-02-04 18:51:06.671
Name: time, dtype: datetime64[ns]

### When was the latest recording by scenario

In [415]:
# Latest logged timestamp per scenario, converted from epoch milliseconds.
data.groupby("scenario")["time"].max().pipe(pd.to_datetime, unit="ms")

scenario
juxtaposed-animated   2019-02-19 12:56:21.761
juxtaposed-static     2019-02-19 10:32:45.696
superposed-animated   2019-02-19 10:30:33.884
superposed-static     2019-02-18 18:58:54.963
Name: time, dtype: datetime64[ns]

### There should only be one user-agent string per user

In [416]:
# Count the distinct user-agent strings per user with the built-in group-wise
# nunique instead of a Python lambda. dropna=False keeps a missing value counted
# as its own entry, exactly as len(unique()) did.
uaPerUser = data.groupby("user")["userAgent"].nunique(dropna=False)
# Show only the users that violate the one-user-agent expectation.
uaPerUser[uaPerUser > 1]

user
1549545871217-0.dgx1qfz08iv    2
1549545879946-0.616nkiecbnc    2
1549550007373-0.gy5gegu2zrc    2
Name: userAgent, dtype: int64

### Why are there two user agent strings?

In [417]:
# Select this user's rows by exact id. str.contains would interpret the id as a
# regular expression (its "." matches any character) and would also match substrings.
data.loc[data["user"] == "1549545871217-0.dgx1qfz08iv", "userAgent"].unique()

array(['Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'],
      dtype=object)

In [418]:
# Select this user's rows by exact id. str.contains would interpret the id as a
# regular expression (its "." matches any character) and would also match substrings.
data.loc[data["user"] == "1549545879946-0.616nkiecbnc", "userAgent"].unique()

array(['Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'],
      dtype=object)

In [419]:
# Select this user's rows by exact id. str.contains would interpret the id as a
# regular expression (its "." matches any character) and would also match substrings.
data.loc[data["user"] == "1549550007373-0.gy5gegu2zrc", "userAgent"].unique()

array(['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
       'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'],
      dtype=object)

Answer: a Chrome update (72.0.3626.96 → 72.0.3626.109) that happened on both Windows and Mac during the recording period.

### There should be only one user per session

In [420]:
# Count the distinct users per session with the built-in group-wise nunique
# instead of a Python lambda. dropna=False keeps a missing value counted as its
# own entry, exactly as len(unique()) did.
usersPerSession = data.groupby("session")["user"].nunique(dropna=False)
# Any session listed here is shared by more than one user.
usersPerSession[usersPerSession > 1]

Series([], Name: user, dtype: int64)

### The position should always be between -1 and 1

In [421]:
# Number of rows whose position exceeds the upper bound of 1.
int((data["position"] > 1).sum())

0

In [422]:
# Number of rows whose position falls below the lower bound of -1.
int((data["position"] < -1).sum())

0

### Negative positions should never occur on a regular graph (name without `@`), and positive positions never on an `@` graph

In [423]:
# Rows with a negative position on a graph whose name does not contain "@".
# Both conditions are combined into one mask: indexing the filtered frame with a
# second mask built from the unfiltered frame makes pandas reindex that mask and
# emit a UserWarning.
len(data[(data["position"] < 0) & ~data["graph"].str.contains("@")])

  """Entry point for launching an IPython kernel.


0

In [424]:
# Rows with a positive position on a graph whose name contains "@".
# Both conditions are combined into one mask: indexing the filtered frame with a
# second mask built from the unfiltered frame makes pandas reindex that mask and
# emit a UserWarning.
len(data[(data["position"] > 0) & data["graph"].str.contains("@")])

  """Entry point for launching an IPython kernel.


0

## Exploration

### How many sessions per scenario?

In [425]:
# Distinct sessions per scenario, using the built-in group-wise nunique instead
# of a Python lambda. dropna=False matches what len(unique()) counted.
data.groupby("scenario")["session"].nunique(dropna=False)

scenario
juxtaposed-animated    71
juxtaposed-static      15
superposed-animated    31
superposed-static      24
Name: session, dtype: int64

### How many sessions per story?

In [426]:
# Distinct sessions per story, using the built-in group-wise nunique instead
# of a Python lambda. dropna=False matches what len(unique()) counted.
data.groupby("story")["session"].nunique(dropna=False)

story
absolute        33
causes          18
demographics    62
relative        28
Name: session, dtype: int64

## Sessions over time

In [427]:
# One row per session: its id, its earliest timestamp (converted from epoch
# milliseconds to a datetime) and its scenario.
sessions = (
    data.groupby("session")
    .agg({"session": "first", "time": "min", "scenario": "first"})
    .assign(time=lambda frame: pd.to_datetime(frame["time"], unit="ms"))
)
# Exported for the time-distribution chart.
sessions.to_csv("sessions_ts.csv")

See `timedistribution.vl`

## How long are session durations?

In [437]:
# Per-session summary. Requesting two aggregations for "time" gives the result
# two-level column labels, e.g. ("time", "min") and ("time", "max").
durations = data.groupby("session").agg({"session": "first", "time": ["min", "max"], "scenario": "first"})
# Session length: last minus first timestamp, divided by 1000. Timestamps are
# treated as epoch milliseconds elsewhere in this notebook, so this is in seconds.
durations["duration"] = (durations["time", "max"] - durations["time", "min"]) / 1000
# NOTE(review): the same file is written again, after filtering, by a later cell.
durations.to_csv("session_durations.csv")

In [438]:
# Shortest session duration, in the unit of the "duration" column (seconds).
durations["duration"].min()

0.0

In [439]:
# Longest session duration, converted from seconds to hours (/ 60 / 60).
durations["duration"].max() / 60 / 60

47.87285027777777

## How many zero-duration sessions?

In [441]:
# Number of sessions whose first and last timestamp coincide.
int((durations["duration"] == 0).sum())

41

## How many sessions that are longer than 15 minutes?

In [448]:
# Number of sessions lasting longer than 15 minutes (duration is in seconds).
int((durations["duration"] > 15 * 60).sum())

6

## How many sessions in between?

In [452]:
# Keep the sessions strictly between 0 seconds and 15 minutes, overwrite the
# exported file with this reduced set, and report how many sessions remain.
durations = durations[
    (durations["duration"] > 0) & (durations["duration"] < 60*15)
].copy()
durations.to_csv("session_durations.csv")
len(durations)

94

So the longest session was almost 48 hours. This happens when you leave tabs open forever.
We find that there are many zero-duration sessions and that typical sessions are no longer than 3 minutes.

See `session_durations.vl`

## Timedelta

In [280]:
# NOTE(review): `positions` is used here before any cell above defines it, and
# the execution counts are out of order — this cell presumably relies on state
# from an earlier kernel run and will not survive Restart & Run All as placed.
# First timestamp of each session.
minima = positions.groupby("session")["time"].min()
# Attach each row's session start to the row.
positions["minTime"] = positions['session'].map(minima)
# Time elapsed since the session's first logged row.
positions["timeDelta"] = (positions["time"] - positions["minTime"]) / 1000 # in seconds

In [281]:
# Write the positions table, including the minTime/timeDelta columns added above, to disk.
positions.to_csv("distribution.csv")

## Analysis

How many unique visitors did the experiment have?

In [228]:
# Number of distinct users. nunique counts them directly instead of building a
# groupby object only to take its length; both ignore missing user ids.
data["user"].nunique()

33

In [252]:
# Distinct values of the text between the first and second "/" of each
# user-agent string (in the output this is the platform / engine part).
data["userAgent"].str.split("/", expand=True)[1].unique()

array(['5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko',
       '5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit',
       '5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit',
       '5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko',
       '5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit',
       '5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit',
       '5.0 (Windows NT 10.0; Win64; x64) AppleWebKit',
       '5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko',
       '5.0 (Macintosh; Intel Mac OS X 10.10; rv:65.0) Gecko'],
      dtype=object)

In [229]:
# Drop every row whose graph name contains "@". na=True makes rows with a
# missing graph name count as a match, so they are dropped as well.
data = data[~data["graph"].str.contains("@", na=True)].copy()

In [230]:
# NOTE(review): `sessions` is rebound here to a groupby object; earlier in this
# notebook the same name holds the per-session DataFrame written to sessions_ts.csv.
sessions = data.groupby(['session'])

In [231]:
# Keep only the rows of sessions that logged more than 5 rows.
real_sessions = sessions.filter(lambda group: group.shape[0] > 5)

Remove all the sessions where the timedelta is 3 minutes or more, they are hard to analyze

In [232]:
# Keep sessions whose span between first and last timestamp is under
# 3 minutes (timestamps are in milliseconds).
brief_sessions = real_sessions.groupby("session").filter(
    lambda group: (group["time"].max() - group["time"].min()) < 1000*60*3
)

Remove all the sessions where the timedelta is shorter than 10 seconds

In [233]:
# Keep sessions whose span between first and last timestamp exceeds
# 10 seconds (timestamps are in milliseconds).
# NOTE(review): this filters `real_sessions`, not `brief_sessions`, so the
# upper duration limit applied just before is not carried over — confirm intent.
regular_sessions = real_sessions.groupby("session").filter(
    lambda group: (group["time"].max() - group["time"].min()) > 1000*10
)

Remove everything that is not really a position

In [234]:
#positions = regular_sessions[regular_sessions["graph"].str.contains("@") == False].copy()

Only work on the *demographics* story for now

In [235]:
# Restrict `positions` to rows whose story contains "demographics".
# NOTE(review): `positions` is filtered from itself, yet no visible cell creates
# it (the only assignment from `regular_sessions` is commented out) — presumably
# it survives from an earlier kernel state; confirm its origin before re-running.
positions = positions[positions["story"].str.contains("demographics")].copy()

Calculate a new position value based on the order of the graphs on the page

In [236]:
# Offset of each graph, given by its order on the page. One exact-name lookup
# replaces five copy-pasted, regex-anchored assignments.
# Rows with any other graph name get NaN as their globalPosition.
positions["globalPosition"] = positions["position"] + positions["graph"].map({
    "gender": 0,
    "gender-highlight": 1,
    "highlight": 2,
    "highlight-demographics": 3,
    "demographics": 4,
})

840.833

In [239]:
# Export only the rows logged within the first 30 seconds of their session.
positions.loc[positions["timeDelta"] < 30].to_csv("session.csv")

In [240]:
# Largest elapsed time (in seconds) reached in each session.
positions.groupby("session")["timeDelta"].agg("max")

session
1549274269236-0.yzc7jfhnfej     12.466
1549274965230-0.ak1aliaur6m    840.833
1549279062844-0.453sw2moigx      1.700
1549306266670-0.7gh6fnk0ows     88.585
1549306541856-0.tza67t8rk19      0.000
1549312331726-0.q3qk6cdwddm     37.446
1549315908440-0.c6ywoi0bqu4     12.816
1549317344349-0.n4zmo71rmbm     14.967
1549380220616-0.cspk0t9rbnp     23.255
1549381866349-0.i3cnhn9nwq      12.119
1549545879947-0.m97m1d0puqe      2.169
1549838491091-0.gzqyh5o0mn6    116.108
1549984548917-0.rwf0vzs5za      22.793
1549984621996-0.9xjmsaz4gvc    508.201
1550247468441-0.85qs18bpqfo     81.527
1550516147178-0.ni3ijna7rhc      1.958
1550572077858-0.rahqnuts569    126.432
1550579654685-0.p6uisi4ae9n      7.136
1550579676249-0.cffax86sm8f     11.269
1550579694356-0.js751b7lf9f     15.338
1550579796764-0.2ftknj4huym      4.282
1550579829913-0.geud6t88m5g      8.504
1550579850298-0.bk78iheeno      39.470
1550579908210-0.v4edgqzgzt      38.459
1550580657480-0.8bzojbn8rv2     13.339
Name: timeDelta, 