# About descriptives_of_log_data

This notebook computes some basic descriptive statistics of the log data such as determining the fraction of students that use different sim components (record,table,graph,restore,...)

In [1]:
import os
import numpy as np
import pandas as pd
import getpass
import datetime
import matplotlib
import matplotlib.pyplot as plt
#utils.py is where all our custorm functions live is we set an autoreload on it.
%load_ext autoreload
%autoreload 1
%aimport utils_read_parsing 
from utils_read_parsing import *
%matplotlib inline
matplotlib.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = 20, 7
pd.set_option('display.max_columns', None);pd.set_option('display.max_rows', None);pd.set_option('precision', 2)

## Overview
We have log data, survey data, worksheet data and lots of metadata. We have to connect them all! First we must build the right metadata files.
Let's start by connecting log data files to session data, mapping each log data to a student and to a session using dates and times.

### Connecting log data to session
We grab the parsing file reports and the session data

In [2]:
df_beers = get_latest_parsing_report('beers')
df_caps = get_latest_parsing_report('capacitor')
df_sessions = get_session_data()

Captured beers parsing report that was parsed on 2017-10-20_09.45.26
Captured capacitor parsing report that was parsed on 2017-10-20_10.06.46


In [3]:
df_sessions.head()

Unnamed: 0,session number,date,number of students from sign out sheet,start time,end time,wave,ran study,supported
0,1,2017-01-13,5,1,3,1,Jon,Sarah
1,2,2017-01-17,14,11,1,1,Jon,Sarah
2,3,2017-01-17,18,3,5,1,Sarah,Jon
3,4,2017-01-19,5,9,11,1,Sarah,Jon
4,5,2017-01-19,18,1,3,1,Jon,Sarah


In [4]:
ids_exclude = ['12345678','11111111','student1','99999999']

In [5]:
#build me later
sessions_times = {('2017-01-13_13.00.00','2017-01-13_15.00.00'):1,('2017-01-17_11.00.00','2017-01-17_13.00.00'):2}

In [6]:
def convert_from_unix_time(t):
    ''' Take a unix time stamp in milliseconds and convert to date and time'''
    return datetime.datetime.fromtimestamp(int(t)/1000.0).strftime('%Y-%m-%d_%H.%M.%S')

def convert_to_unix_time(date):
    ''' Take a unix time stamp in milliseconds and convert to date and time'''
    return datetime.datetime.strptime(date, '%Y-%m-%d_%H.%M.%S')

def extrapolate_session(row):
    session = 'NA'
    date = row['date']
    timestamp = convert_to_unix_time(date)
    for period,session_number in sessions_times.iteritems():
        print period, date
        t1,t2 = convert_to_unix_time(period[0]),convert_to_unix_time(period[1])
        if timestamp > t1 and timestamp < t2:
            session = session_number   
    return session

df_beers['Session'] = df_beers.apply(extrapolate_session, axis=1)

('2017-01-17_11.00.00', '2017-01-17_13.00.00') 2017-03-22_11.41.17
('2017-01-13_13.00.00', '2017-01-13_15.00.00') 2017-03-22_11.41.17
('2017-01-17_11.00.00', '2017-01-17_13.00.00') 2017-01-27_13.26.59
('2017-01-13_13.00.00', '2017-01-13_15.00.00') 2017-01-27_13.26.59
('2017-01-17_11.00.00', '2017-01-17_13.00.00') 2017-01-20_16.22.58
('2017-01-13_13.00.00', '2017-01-13_15.00.00') 2017-01-20_16.22.58
('2017-01-17_11.00.00', '2017-01-17_13.00.00') 2017-01-20_15.24.45
('2017-01-13_13.00.00', '2017-01-13_15.00.00') 2017-01-20_15.24.45
('2017-01-17_11.00.00', '2017-01-17_13.00.00') 2017-01-17_16.21.15
('2017-01-13_13.00.00', '2017-01-13_15.00.00') 2017-01-17_16.21.15
('2017-01-17_11.00.00', '2017-01-17_13.00.00') 2017-01-20_16.23.11
('2017-01-13_13.00.00', '2017-01-13_15.00.00') 2017-01-20_16.23.11
('2017-01-17_11.00.00', '2017-01-17_13.00.00') 2017-03-21_18.25.09
('2017-01-13_13.00.00', '2017-01-13_15.00.00') 2017-03-21_18.25.09
('2017-01-17_11.00.00', '2017-01-17_13.00.00') 2017-01-19_14.2

In [7]:
df_beers['Session']

0      NA
1      NA
2      NA
3      NA
4      NA
5      NA
6      NA
7      NA
8      NA
9      NA
10     NA
11     NA
12     NA
13     NA
14     NA
15     NA
16     NA
17     NA
18     NA
19     NA
20      1
21     NA
22     NA
23     NA
24      2
25     NA
26     NA
27     NA
28     NA
29      2
30     NA
31     NA
32     NA
33     NA
34     NA
35     NA
36     NA
37     NA
38     NA
39     NA
40     NA
41     NA
42     NA
43     NA
44     NA
45     NA
46     NA
47     NA
48     NA
49     NA
50     NA
51     NA
52     NA
53     NA
54     NA
55     NA
56     NA
57     NA
58     NA
59     NA
60     NA
61     NA
62     NA
63     NA
64     NA
65     NA
66     NA
67     NA
68     NA
69     NA
70     NA
71     NA
72     NA
73     NA
74     NA
75     NA
76     NA
77     NA
78     NA
79     NA
80     NA
81     NA
82     NA
83     NA
84     NA
85     NA
86     NA
87     NA
88     NA
89     NA
90     NA
91     NA
92     NA
93     NA
94     NA
95     NA
96     NA
97     NA
98     NA
99     NA
