# About data_connector

This notebook pulls information from different sources of data (log, session) to connect each student's data across data types.
The end goal is a connection between student id and log files, survey data, and worksheet data.

In [1]:
import os
import numpy as np
import pandas as pd
import getpass
import datetime
import matplotlib
import matplotlib.pyplot as plt
from utils_timeline_viz import find_student_log_file
#utils_read_parsing.py is where all our custom functions live, so we set an autoreload on it.
%load_ext autoreload
%autoreload 1
%aimport utils_read_parsing 
from utils_read_parsing import *
%matplotlib inline
# Notebook-wide plot defaults: ggplot theme and wide 20x7 figures.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (20, 7)
# Display every row and column of a frame, with floats shown to 2 decimals.
# Fully qualified option names are used on purpose: the bare 'precision' pattern is
# rejected as ambiguous on pandas versions that define more than one '*.precision' option.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.precision', 2)

# Connect to worksheet data
## beers

In [2]:
# Load the worksheet metadata for the 'beers' activity.
# NOTE(review): %reload_ext targets IPython extensions; the %autoreload/%aimport setup in the
# first cell may already cover reloading utils_read_parsing -- confirm this line is needed.
%reload_ext utils_read_parsing
worksheets = get_worksheet_metadata('beers')

In [3]:
# Preview the metadata ordered by session.
# Session 13 comes first because session 12 was removed from analysis and sessions 10-11 were coded together.
worksheets.sort_values('Session').head()

Unnamed: 0,Student ID,Topic,Type,other id,Session,Technical Flags,Comment,use analysis
289,13578154,ABSORBANCE,p,13578154,13,,,True
311,19618321,ABSORBANCE,p,19618321,13,,,True
310,19618321,ABSORBANCE,m,19618321,13,,,True
309,19470169,ABSORBANCE,p,19470169,13,,,True
308,19470169,ABSORBANCE,m,19470169,13,,,True


In [4]:
# Ids returned by get_students_to_analyze_log(), stored as a set so the cells below
# can compare them against the worksheet ids with set operations.
ids = set(get_students_to_analyze_log())

First we check that all ids in the worksheet metadata have two entries: one for pre and one for post (main).

In [5]:
import collections
ids_worksheets = list(worksheets[worksheets['use analysis']==True]['Student ID'])
ids_other = list(worksheets[worksheets['use analysis']==True]['other id'])
print len(ids_worksheets), len(set(ids_worksheets))
pres = set(worksheets[worksheets['Type']=='p']['Student ID'])
posts = set(worksheets[worksheets['Type']=='m']['Student ID'])
print pres - posts
print posts - pres
print [item for item, count in collections.Counter(ids_worksheets).items() if count != 2]
print "All ids appear twice, one for pre and one for main in Absorbance"

298 149
set([])
set([])
[]
All ids appear twice, one for pre and one for main in Absorbance


Now we check that, for every id for which we have logs, we also have worksheet data (according to the metadata file).

In [6]:
print len(ids&set(ids_worksheets)),"ids in worksheets and analyzable list"
print  len(ids - set(ids_worksheets)),"ids not in worksheets"
print len(set(ids_worksheets)-ids),"ids not in only analyzable ids"
print set(ids_worksheets)-ids #15554169 is the id with the missing pre caps

148 ids in worksheets and analyzable list
0 ids not in worksheets
1 ids not in only analyzable ids
set([15554169L])


In [7]:
meta = get_student_metadata()
print "Ids not in worksheets:"
ids_to_sort = set(ids - set(ids_worksheets))
for idd in ids_to_sort:
    print idd, meta.loc[idd,'session'], idd in list(worksheets['other id'])
print len(ids_to_sort)
print "These worksheets have yet to be coded"

Ids not in worksheets:
0
These worksheets have yet to be coded


In [8]:
print len(set(list(worksheets[worksheets['use analysis']==True]['Student ID']))&set(ids))
print "is the number of students for which we have beers worksheet + logs"

148
is the number of students for which we have beers worksheet + logs


Now we check that we indeed have the pre and post data promised by the worksheet metadata file.

In [9]:
# Load the pre and main "highest understanding" tables for the 'beers' activity.
# NOTE: the names `pre` and `main` are reassigned further down (for caps, and again
# when the beers and caps tables are concatenated).
pre = get_pre_worksheet_highest_understanding('beers')
main = get_main_worksheet_highest_understanding('beers')

In [10]:
# Preview the first rows of the beers pre-worksheet table loaded above.
pre.head()

Unnamed: 0,Student ID,Concentration,Wavelength,Width
0,10127163,1.0,1.0,1.0
1,10232160,1.0,1.0,1.0
2,10375160,1.0,1.0,0.0
3,10375163,0.0,1.0,1.0
4,10420167,1.0,1.0,0.0


In [11]:
for i in list(worksheets[worksheets['Type']=='p']['other id']):
    if i not in list(pre['Student ID']):
        print i, "not in pre"
        
for i in list(worksheets[worksheets['Type']=='m']['other id']):
    if i not in list(main['Student ID']):
        print i, "not in main"

print " If nothing prints then we are golden!"

 If nothing prints then we are golden!


## caps

Now we do the same for caps as we did for beers

In [12]:
%reload_ext utils_read_parsing
worksheets = get_worksheet_metadata('caps')

import collections
ids_worksheets = list(worksheets[worksheets['use analysis']==True]['Student ID'])
ids_other = list(worksheets[worksheets['use analysis']==True]['other id'])
print len(ids_worksheets), len(set(ids_worksheets))
pres = set(worksheets[worksheets['Type']=='p']['Student ID'])
posts = set(worksheets[worksheets['Type']=='m']['Student ID'])
print pres - posts
print posts - pres
# print [item for item, count in collections.Counter(ids_worksheets).items() if count != 2]
# print "All ids appear twice, one for pre and one for main in Capacitance"

312 156
set([])
set([15554169L])


In [13]:
print len(ids&set(ids_worksheets)),"ids in worksheets and analyzable list"
print  len(ids - set(ids_worksheets)),"ids not in worksheets"
print len(set(ids_worksheets)-ids),"ids not in only analyzable ids"

148 ids in worksheets and analyzable list
0 ids not in worksheets
8 ids not in only analyzable ids


In [14]:
meta = get_student_metadata()
print "Ids not in worksheets:"
ids_to_sort = set(ids - set(ids_worksheets))
for idd in ids_to_sort:
    print idd, meta.loc[idd,'session'], idd in list(worksheets['other id'])
print len(ids_to_sort)
print "These worksheets have yet to be coded"

Ids not in worksheets:
0
These worksheets have yet to be coded


In [15]:
print "Ids not in analyzeable logs"
ids_to_sort = set(ids_worksheets)-ids
for idd in ids_to_sort:
    try:
        print idd,meta.loc[idd,'session']
    except:
        print idd, 'Not found'
print "These students were either removed from analysis, have typos in their ids consistent for caps pre and post, or are the two collided students"

Ids not in analyzeable logs
19036162 19036162 Not found
52193156 1
11200165 1
11669161 4
12818156 1
23784336 23784336    3
23784336    3
Name: session, dtype: int64
27630167 1
83459165 1
These students were either removed from analysis, have typos in their ids consistent for caps pre and post, or are the two collided students


In [16]:
# Load the pre and main "highest understanding" tables for the 'caps' activity.
# NOTE: this reassigns `pre` and `main`, which previously held the beers tables.
pre = get_pre_worksheet_highest_understanding('caps')
main = get_main_worksheet_highest_understanding('caps')

In [17]:
for i in list(worksheets[worksheets['Type']=='p']['other id']):
    if i not in list(pre['Student ID']):
        print i, "not in pre"
        
for i in list(worksheets[worksheets['Type']=='m']['other id']):
    if i not in list(main['Student ID']):
        print i, "not in main"

print " If nothing prints then we are golden!"

 If nothing prints then we are golden!


# Packaging the data for analysis

In [18]:
# Load everything needed for packaging. Suffix _L is used for the 'beers' activity and
# _C for the 'caps' activity.
metadata_L = get_worksheet_metadata('beers')
metadata_C = get_worksheet_metadata('caps')
# Per-sim worksheet data is currently not loaded (calls left disabled):
# worksheets_L = get_worksheet_data_per_sim('beers')
# worksheets_C = get_worksheet_data_per_sim('caps')
# Pre and main "highest understanding" tables for each activity.
pre_L = get_pre_worksheet_highest_understanding('beers')
main_L = get_main_worksheet_highest_understanding('beers')
pre_C = get_pre_worksheet_highest_understanding('caps')
main_C = get_main_worksheet_highest_understanding('caps')

In [19]:
def _keep_analyzable_with_sid(scores, metadata, sheet_type, analyzable_ids):
    """Keep the score rows of students we want to analyze and attach their log id.

    scores:         pre or main table; its 'Student ID' column matches the metadata's 'other id'.
    metadata:       worksheet metadata with 'Type', 'Student ID' and 'other id' columns.
    sheet_type:     'p' for pre worksheets, 'm' for main worksheets.
    analyzable_ids: collection of log ids ('Student ID' in the metadata) to keep.

    Returns a new frame (a copy, so the input is not modified) with an extra 'sid'
    column holding the metadata 'Student ID' for each row.
    """
    wanted = metadata[(metadata['Type']==sheet_type)&metadata['Student ID'].isin(analyzable_ids)]
    # .copy() makes the result independent of `scores`, so adding 'sid' below does not
    # write onto a filtered slice (SettingWithCopyWarning).
    kept = scores[scores['Student ID'].isin(wanted['other id'].values)].copy()
    # ids in pre/main match "other id" in the worksheet metadata, so map each one to the id
    # used in the logs. The first metadata row per 'other id' wins, as in the previous
    # per-row lookup, but the mapping is built once instead of rescanning for every row.
    other_to_sid = metadata.drop_duplicates(subset='other id').set_index('other id')['Student ID']
    kept['sid'] = kept['Student ID'].map(other_to_sid)
    return kept

#keep only students we actually want to analyze, and assign the correct log id to each entry
pre_L = _keep_analyzable_with_sid(pre_L, metadata_L, 'p', ids)
main_L = _keep_analyzable_with_sid(main_L, metadata_L, 'm', ids)
pre_C = _keep_analyzable_with_sid(pre_C, metadata_C, 'p', ids)
main_C = _keep_analyzable_with_sid(main_C, metadata_C, 'm', ids)

In [20]:
print set(pre_L['sid'])==ids
print set(pre_C['sid'])==ids
print set(main_L['sid'])==ids
print set(main_C['sid'])==ids

True
True
True
True


In [21]:
# Tag every table with its sim code: 'L' for the beers tables, 'C' for the caps tables.
for scores, sim_code in ((pre_L, 'L'), (main_L, 'L'), (pre_C, 'C'), (main_C, 'C')):
    scores['sim'] = sim_code

In [22]:
# Reshape each table to long format: one row per (sid, variable).
# Only the pre tables carry the 'sim' column; the main tables are joined to them later.
beers_vars = ['Concentration','Wavelength','Width']
caps_vars = ['Area','Separation','Battery voltage']

pre_L_new = pd.melt(pre_L, id_vars=['sid'], value_vars=beers_vars, value_name='pre_highest').assign(sim='L')
main_L_new = pd.melt(main_L, id_vars=['sid'], value_vars=beers_vars, value_name='main_highest')

pre_C_new = pd.melt(pre_C, id_vars=['sid'], value_vars=caps_vars, value_name='pre_highest').assign(sim='C')
main_C_new = pd.melt(main_C, id_vars=['sid'], value_vars=caps_vars, value_name='main_highest')

In [23]:
# Preview the long-format caps main table produced by the melt above.
main_C_new.head()

Unnamed: 0,sid,variable,main_highest
0,10127163,Area,3
1,10232160,Area,3
2,10537160,Area,0
3,10375163,Area,3
4,10420167,Area,3


In [24]:
# Shape check on the long-format beers pre table (rows, columns).
pre_L_new.shape

(444, 4)

In [25]:
# Stack the beers and caps long tables row-wise (reuses the names `pre` and `main`).
pre = pd.concat([pre_L_new, pre_C_new])
main = pd.concat([main_L_new, main_C_new])

In [26]:
# Join pre and main scores per student and variable; only pairs present in both are kept.
worksheet_data = pd.merge(pre, main, how="inner", on=['sid','variable'])

In [27]:
# Shape check on the merged table (rows, columns).
worksheet_data.shape

(888, 5)

In [28]:
# Preview the merged table ordered by student id, so one student's variables appear together.
worksheet_data.sort_values("sid").head()

Unnamed: 0,sid,variable,pre_highest,sim,main_highest
0,10127163,Concentration,1.0,L,3.0
148,10127163,Wavelength,1.0,L,1.0
296,10127163,Width,1.0,L,3.0
444,10127163,Area,3.0,C,3.0
592,10127163,Separation,3.0,C,3.0


In [30]:
# Write the merged pre/main table as a tab-separated file without the index column.
# Path components are passed separately so os.path.join picks the separator for the
# current platform; the previous hard-coded '\\' only produced a valid path on Windows.
# BIG_FOLDER is not defined in this notebook -- presumably it comes from the
# utils_read_parsing star import; verify.
output_path = os.path.join(BIG_FOLDER, 'all_massaged_data', 'worksheets_highest_understanding.txt')
worksheet_data.to_csv(output_path, sep='\t', index=False)