In [1]:
import pandas as pd
from pdvega import Axes as Vega

# Data preparation

This notebook reads all the CSV files created in the experiment into a single DataFrame. It then derives additional variables from the URL of each log entry, such as the story and the experimental condition.

## Parameters

In [2]:
# Study window, given as date strings.
startOfStudy = "2019-07-11"
endOfStudy = "2019-07-20"

# NOTE(review): the original comment here read "15 minutes", but if the unit is
# seconds, 600 is 10 minutes - TODO confirm the unit and the intended value.
maxDuration = 600

# Log parts to load, in chronological order.
# The names look like "<start>-<end>" Unix timestamps in seconds - TODO confirm.
files = [
    '1562865674-1562871596.part.csv',
    '1562871601-1562878799.part.csv',
    '1562878800-1562884571.part.csv',
    '1562890521-1562896081.part.csv',
    '1562897083-1562903990.part.csv',
    '1562904000-1562905975.part.csv',
    '1562918465-1562923724.part.csv',
    '1562947240-1562962166.part.csv',
    '1563039873-1563045975.part.csv',
    '1563045977-1563048933.part.csv',
    '1563048938-1563057333.part.csv',
    '1563057333-1563177320.part.csv',
    '1563177325-1563181935.part.csv',
    '1563181935-1563185057.part.csv',
    '1563185058-1563202255.part.csv',
    '1563207603-1563215481.part.csv',
    '1563215481-1563222480.part.csv',
    '1563222483-1563238239.part.csv',
    '1563238249-1563264410.part.csv',
    '1563264415-1563274810.part.csv',
    '1563274815-1563285261.part.csv',
    '1563285262-1563287989.part.csv',
    '1563287993-1563292393.part.csv',
    '1563292393-1563305295.part.csv',
    '1563305295-1563313907.part.csv',
    '1563313907-1563321409.part.csv',
    '1563321411-1563331840.part.csv',
    '1563331845-1563334703.part.csv',
]

# Directory prefix that is prepended to every entry of `files` (note the trailing slash).
path_to_data = "../data/"

## Data preparation

Load the data, parse the timestamps, drop duplicate rows and exclude the entries logged by the user `Jonas`:

In [4]:
# Column names assigned to the CSV parts (passed to read_csv via `names`).
log_columns = [
    "time", "source", "user", "session", "userAgent",
    "screenWidth", "screenHeight", "windowWidth", "windowHeight",
    "resolution", "chart", "relativePosition", "absolutePosition", "message",
]

# Read every part lazily and stack them into one frame with a fresh running index.
df_from_each_file = (
    pd.read_csv(path_to_data + file_name, names=log_columns)
    for file_name in files
)
# NOTE: `all` shadows the Python builtin of the same name; the name is kept
# because the later cells of this notebook refer to it.
all = pd.concat(df_from_each_file, ignore_index=True)

# `time` is parsed as epoch milliseconds.
all["time"] = pd.to_datetime(all["time"], unit="ms")

# Remove exact duplicate rows (the index keeps its gaps), then drop everything
# logged by the user "Jonas".
all = all.drop_duplicates()
all = all.query("user != 'Jonas'").copy()
all.head()

Unnamed: 0,time,source,user,session,userAgent,screenWidth,screenHeight,windowWidth,windowHeight,resolution,chart,relativePosition,absolutePosition,message
0,2019-07-11 17:21:14.704,https://www.cs.technik.fhnw.ch/lostintransitio...,1562865674703-0.3xhck28drqb,1562865674704-0.46qaerk672y,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; ...,1920,1200,1477,1103,2.0,@init,-1.0,-1,
2,2019-07-11 17:21:14.706,https://www.cs.technik.fhnw.ch/lostintransitio...,1562865674703-0.3xhck28drqb,1562865674704-0.46qaerk672y,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; ...,1920,1200,1477,1103,2.0,@message: welcome,-1.0,-1,
3,2019-07-11 17:21:56.524,https://www.cs.technik.fhnw.ch/lostintransitio...,1562865716523-0.gy7s40qv4kk,1562865716524-0.9588yccch9p,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,2560,1440,1427,906,1.0,@init,-1.0,-1,
5,2019-07-11 17:21:56.527,https://www.cs.technik.fhnw.ch/lostintransitio...,1562865716523-0.gy7s40qv4kk,1562865716524-0.9588yccch9p,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,2560,1440,1427,906,1.0,@message: welcome,-1.0,-1,
6,2019-07-11 17:22:09.194,https://www.cs.technik.fhnw.ch/lostintransitio...,1562865729192-0.a9jx3nl92o,1562865729193-0.b85m3ku7ze,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,1280,720,1280,648,1.5,@init,-1.0,-1,


## Get different variables from the URL:

* layout
    * superposed
    * juxtaposed
* transition
    * static
    * animated
* story
    * mortality
    * energy
    * envelope
* substory
    * 0
    * 1
    * 2
    * 3
    * 4
    * start, survey, end (envelope pages only)

In [5]:
def _mapping(url, transition, layout, story, substory):
    """Return one mapping record with the keys url, transition, layout, story, substory."""
    return {
        "url": url,
        "transition": transition,
        "layout": layout,
        "story": story,
        "substory": substory,
    }


# (code used in the URL, layout, transition) for the four layout x transition combinations.
_conditions = [
    ("SS", "superposed", "static"),
    ("SA", "superposed", "animated"),
    ("JS", "juxtaposed", "static"),
    ("JA", "juxtaposed", "animated"),
]

# Page names / suffixes in substory order; their 1-based position is the substory number.
_mortality_pages = ["demographics", "absolute", "relative", "causes"]
_energy_suffixes = ["A", "B", "C", "D"]

# Mortality story pages: "mortality/<code>/<page>".
mappings = [
    _mapping("mortality/" + code + "/" + page, transition, layout, "mortality", str(number))
    for code, layout, transition in _conditions
    for number, page in enumerate(_mortality_pages, start=1)
]

# Energy story pages: "energy/<code><suffix>".
mappings += [
    _mapping("energy/" + code + suffix, transition, layout, "energy", str(number))
    for code, layout, transition in _conditions
    for number, suffix in enumerate(_energy_suffixes, start=1)
]

# Introductory pages (substory "0") carry no layout or transition.
# NOTE(review): only the SS variant of the mortality intro page is listed, while
# the energy intro URL has no condition code - confirm no other variants occur.
mappings += [
    _mapping("mortality/SS/initial.html", None, None, "mortality", "0"),
    _mapping("energy/initial", None, None, "energy", "0"),
]

# Envelope pages around the stories; the substory is the page name itself.
mappings += [
    _mapping("envelope/" + page, None, None, "envelope", page)
    for page in ["start", "survey", "end"]
]

In [6]:
# Sanity check of the mapping table: number of URL fragments per value of each
# derived variable, with a separator line between the four summaries.
mapping_table = pd.DataFrame(mappings)
for position, column in enumerate(["story", "substory", "transition", "layout"]):
    if position > 0:
        print("-----------------------------------")
    print(mapping_table.groupby(column).url.count())

story
energy       17
envelope      3
mortality    17
Name: url, dtype: int64
-----------------------------------
substory
0         2
1         8
2         8
3         8
4         8
end       1
start     1
survey    1
Name: url, dtype: int64
-----------------------------------
transition
animated    16
static      16
Name: url, dtype: int64
-----------------------------------
layout
juxtaposed    16
superposed    16
Name: url, dtype: int64


### Map story

In [7]:
# Tag every row whose source URL contains a mapping's URL fragment with that
# mapping's story.
for mapping in mappings:
    # The fragments are literal text, so match them as plain substrings
    # (regex=False) instead of as regular expressions, where "." is a wildcard.
    # na=False makes rows with a missing source count as "no match" rather than
    # producing a NaN mask that .loc cannot use.
    matches = all.source.str.contains(mapping["url"], regex=False, na=False)
    all.loc[matches, "story"] = mapping["story"]

# Rows listed here matched none of the URL fragments.
all.query("story.isna()")

Unnamed: 0,time,source,user,session,userAgent,screenWidth,screenHeight,windowWidth,windowHeight,resolution,chart,relativePosition,absolutePosition,message,story


### Map substory

In [8]:
# Tag every row whose source URL contains a mapping's URL fragment with that
# mapping's substory.
for mapping in mappings:
    # The fragments are literal text, so match them as plain substrings
    # (regex=False) instead of as regular expressions, where "." is a wildcard.
    # na=False makes rows with a missing source count as "no match" rather than
    # producing a NaN mask that .loc cannot use.
    matches = all.source.str.contains(mapping["url"], regex=False, na=False)
    all.loc[matches, "substory"] = mapping["substory"]

# Rows listed here matched none of the URL fragments.
all.query("substory.isna()")

Unnamed: 0,time,source,user,session,userAgent,screenWidth,screenHeight,windowWidth,windowHeight,resolution,chart,relativePosition,absolutePosition,message,story,substory


### Map layout

In [9]:
# Tag every row whose source URL contains a mapping's URL fragment with that
# mapping's layout (None for the intro and envelope pages).
for mapping in mappings:
    # The fragments are literal text, so match them as plain substrings
    # (regex=False) instead of as regular expressions, where "." is a wildcard.
    # na=False makes rows with a missing source count as "no match" rather than
    # producing a NaN mask that .loc cannot use.
    matches = all.source.str.contains(mapping["url"], regex=False, na=False)
    all.loc[matches, "layout"] = mapping["layout"]

# Substories of the rows that ended up without a layout.
all.query("layout.isna()").substory.unique()

array(['start', '0', 'survey', 'end'], dtype=object)

### Map transition

In [10]:
# Tag every row whose source URL contains a mapping's URL fragment with that
# mapping's transition (None for the intro and envelope pages).
for mapping in mappings:
    # The fragments are literal text, so match them as plain substrings
    # (regex=False) instead of as regular expressions, where "." is a wildcard.
    # na=False makes rows with a missing source count as "no match" rather than
    # producing a NaN mask that .loc cannot use.
    matches = all.source.str.contains(mapping["url"], regex=False, na=False)
    all.loc[matches, "transition"] = mapping["transition"]

# Substories of the rows that ended up without a transition.
all.query("transition.isna()").substory.unique()

array(['start', '0', 'survey', 'end'], dtype=object)

### Map condition

In [11]:
# The condition is the run of letters/digits that follows "flow=" in the source
# URL; expand=False returns the single capture group directly as a Series.
all["condition"] = all.source.str.extract(r"flow=([A-Za-z0-9]+)", expand=False)

## Save result

In [12]:
# The pickle is gzip-compressed although its name ends in plain ".pkl", so the
# compression cannot be inferred from the extension when reading it back;
# pass compression="gzip" to the reader as well.
output_path = "outputs/201907110900_data_preparation.pkl"
all.to_pickle(output_path, compression="gzip")