This notebook organizes all the results in the "analysis" folder within a concrete storyline

In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di

# This line will hide code by default when the notebook is converted to HTML.
# The injected script toggles the ".input_area" and ".prompt" elements only when the page has
# no "body.notebook_app" element (presumably that element marks the live notebook UI -- TODO confirm).
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

In [4]:
from utils_read_parsing import *
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns

%matplotlib inline
matplotlib.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = 7, 4
pd.set_option('precision',3)
pd.set_option("display.width", 100)
np.set_printoptions(precision=3,suppress=True)

# Preparing the data
We wish to have a table that looks like:

In [5]:
t = [['sid','variable','has_pre_id', 'has_pre_qual', 'has_pre_quant', 'has_main_id', 'has_main_qual', 'has_main_quant']]
t.append(['12345678','Width',1,0,0,0,1,0])
t.append(['12345678','Concentration',0,0,1,0,0,1])
print tabulate(t)
t = [['...','pre_id_corr', 'pre_qual_corr', 'pre_quant_corr', 'main_id_corr', 'main_qual_corr', 'main_quant_corr','index','sim','CVS_context']]
t.append(['...',1,'NA','NA','NA',0,'NA',2,'L','table'])
t.append(['...','NA',0,'NA','NA','NA',0,2,'L','graph'])
print tabulate(t)
t = [['...','tansfer_qual','tansfer_quant','student attributes']]
t.append(['...',1,1,'...'])
t.append(['...',1,1,'...'])
print tabulate(t)


--------  -------------  ----------  ------------  -------------  -----------  -------------  --------------
sid       variable       has_pre_id  has_pre_qual  has_pre_quant  has_main_id  has_main_qual  has_main_quant
12345678  Width          1           0             0              0            1              0
12345678  Concentration  0           0             1              0            0              1
--------  -------------  ----------  ------------  -------------  -----------  -------------  --------------
---  -----------  -------------  --------------  ------------  --------------  ---------------  -----  ---  -----------
...  pre_id_corr  pre_qual_corr  pre_quant_corr  main_id_corr  main_qual_corr  main_quant_corr  index  sim  CVS_context
...  1            NA             NA              NA            0               NA               2      L    table
...  NA           0              NA              NA            NA              0                2      L    graph
---  --------

## We load all student data files

In [7]:
# Survey answers and per-student metadata (loaders come from utils_read_parsing).
pre_survey_df = get_massaged_pre_survey()
post_survey_df = get_massaged_post_survey()
meta_df = get_student_metadata()
# One worksheet-metadata frame per sim. The caps frame used to be assigned to the
# *_L_* name as well, which silently overwrote the beers metadata.
meta_worksheets_L_df = get_worksheet_metadata('beers')
meta_worksheets_C_df = get_worksheet_metadata('caps')
# NOTE(review): the following cells read table_cvs_df and graph_cvs_df, but the only visible
# loads are the two commented-out lines below -- confirm where these frames come from
# (e.g. the star import) so the notebook survives Restart & Run All.
#     table_cvs_df = pd.read_csv('table_cvs_results.txt', sep='\t')
#     graph_cvs_df = pd.read_csv('graph_cvs_results.txt', sep='\t')

## Grabbing the CVS data...

In [None]:
# Map each student id (the index of meta_df) to that student's 'activity order' value,
# then attach it to both CVS result frames. A student id missing from meta_df raises KeyError.
order = dict(zip(meta_df.index, meta_df['activity order']))
for cvs_df in (graph_cvs_df, table_cvs_df):
    cvs_df['activity order'] = cvs_df['studentid'].apply(order.__getitem__)

In [None]:
# Columns of the CVS result frames that get binarized against each point threshold below.
variables = ["Area","Separation","Width","Concentration","Wavelength","Battery voltage"]

def binarize(threshold,number):
    """Return 1 when ``number`` is at least ``threshold`` (inclusive), otherwise 0."""
    return 1 if number >= threshold else 0
    
# Given that we want to compare the stringency of CVS, we created different definitions
# with 2, 3, 4 or 5 points needed.

def _binarize_cvs(cvs_df, threshold):
    """Return a copy of ``cvs_df`` whose ``variables`` columns are 1 when the original
    value is >= ``threshold`` and 0 otherwise; all other columns are copied unchanged."""
    binarized = cvs_df.copy()
    for v in variables:
        # Element-wise over the column: same values as the former row-wise apply,
        # without building a Series for every row.
        binarized[v] = cvs_df[v].apply(lambda number: binarize(threshold, number))
    return binarized

graph_cvs_2_df = _binarize_cvs(graph_cvs_df, 2)
graph_cvs_3_df = _binarize_cvs(graph_cvs_df, 3)
graph_cvs_4_df = _binarize_cvs(graph_cvs_df, 4)
graph_cvs_5_df = _binarize_cvs(graph_cvs_df, 5)

table_cvs_2_df = _binarize_cvs(table_cvs_df, 2)
table_cvs_3_df = _binarize_cvs(table_cvs_df, 3)
table_cvs_4_df = _binarize_cvs(table_cvs_df, 4)
table_cvs_5_df = _binarize_cvs(table_cvs_df, 5)

In [None]:
# table_intervals_df = pd.read_csv('table_intervals_results.txt', sep='\t')

## Grabbing the worksheet data

In [None]:
%reload_ext utils_read_parsing
#grab worksheet data
worksheets = get_worksheet_metadata('beers')
pre = get_pre_worksheet(sim='beers')
main = get_main_worksheet(sim='beers')

ids = get_students_to_analyze_log_worksheets('beers')
pre= pre[pre['Student ID'].isin(worksheets[(worksheets['Type']=='p')&worksheets['Student ID'].isin(ids)]['other id'].values)]
main= main[main['Student ID'].isin(worksheets[(worksheets['Type']=='m')&worksheets['Student ID'].isin(ids)]['other id'].values)]

#ids in pre/post match "other id" in worksheet metdata so we need to assign the correct id in logs for each entry in pre/post
pre['sid'] = pre['Student ID'].apply(lambda row: worksheets.loc[worksheets[(worksheets['other id']==row)].index[0],'Student ID'])
main['sid'] = main['Student ID'].apply(lambda row: worksheets.loc[worksheets[worksheets['other id']==row].index[0],'Student ID'])

# print len(ids),len(pre),len(main)

In [None]:
## make desired table for beers
# Long format: one row per (student, variable) with the pre and the main worksheet score.
beers_variables = ['Concentration','Width']
melted_pre = pd.melt(pre, id_vars=['sid'], value_vars=beers_variables, var_name='variable',value_name='pre')
melted_main = pd.melt(main, id_vars=['sid'], value_vars=beers_variables, var_name='variable',value_name='main')

L_scores = melted_pre.merge(melted_main, on=['sid','variable'], how='outer')
L_scores['sim'] = 'L'

# (new column, source frame) pairs, listed in the column order of the resulting table.
# Each new column takes, from the first source row of the student, the value stored under
# the row's variable name.
cvs_sources = [
    ('CVS_table_2', table_cvs_2_df), ('CVS_graph_2', graph_cvs_2_df),
    ('CVS_table_3', table_cvs_3_df), ('CVS_graph_3', graph_cvs_3_df),
    ('CVS_table_4', table_cvs_4_df), ('CVS_graph_4', graph_cvs_4_df),
    ('CVS_table_5', table_cvs_5_df), ('CVS_graph_5', graph_cvs_5_df),
    ('number_points_table', table_cvs_df), ('number_points_graph', graph_cvs_df),
]
for new_col, cvs_df in cvs_sources:
    L_scores[new_col] = L_scores.apply(lambda row, cvs_df=cvs_df: cvs_df.loc[cvs_df[cvs_df['studentid']==row['sid']].index[0],row['variable']],axis=1)

# 1-based position of this sim's letter inside the student's 'activity order' value.
L_scores['sim_index'] = L_scores.apply(
    lambda row: table_cvs_2_df.loc[table_cvs_2_df[table_cvs_2_df['studentid']==row['sid']].index[0],'activity order'].index(row['sim'])+1,
    axis=1)

In [None]:
# L_scores[L_scores['variable']=='Width'].describe()

In [None]:
%reload_ext utils_read_parsing
#grab worksheet data
worksheets = get_worksheet_metadata('caps')
pre = get_pre_worksheet(sim='caps')
main = get_main_worksheet(sim='caps')

ids = get_students_to_analyze_log_worksheets('caps')
pre= pre[pre['Student ID'].isin(worksheets[(worksheets['Type']=='p')&worksheets['Student ID'].isin(ids)]['other id'].values)]
main= main[main['Student ID'].isin(worksheets[(worksheets['Type']=='m')&worksheets['Student ID'].isin(ids)]['other id'].values)]

#ids in pre/post match "other id" in worksheet metdata so we need to assign the correct id in logs for each entry in pre/post
pre['sid'] = pre['Student ID'].apply(lambda row: worksheets.loc[worksheets[(worksheets['other id']==row)].index[0],'Student ID'])
main['sid'] = main['Student ID'].apply(lambda row: worksheets.loc[worksheets[worksheets['other id']==row].index[0],'Student ID'])

# print len(ids),len(pre),len(main)

In [None]:
## make desired table for caps
# Long format: one row per (student, variable) with the pre and the main worksheet score.
caps_variables = ['Area','Separation']
melted_pre = pd.melt(pre, id_vars=['sid'], value_vars=caps_variables, var_name='variable',value_name='pre')
melted_main = pd.melt(main, id_vars=['sid'], value_vars=caps_variables, var_name='variable',value_name='main')

C_scores = melted_pre.merge(melted_main, on=['sid','variable'], how='outer')
C_scores['sim'] = 'C'

# (new column, source frame) pairs, listed in the column order of the resulting table.
# Each new column takes, from the first source row of the student, the value stored under
# the row's variable name.
cvs_sources = [
    ('CVS_table_2', table_cvs_2_df), ('CVS_graph_2', graph_cvs_2_df),
    ('CVS_table_3', table_cvs_3_df), ('CVS_graph_3', graph_cvs_3_df),
    ('CVS_table_4', table_cvs_4_df), ('CVS_graph_4', graph_cvs_4_df),
    ('CVS_table_5', table_cvs_5_df), ('CVS_graph_5', graph_cvs_5_df),
    ('number_points_table', table_cvs_df), ('number_points_graph', graph_cvs_df),
]
for new_col, cvs_df in cvs_sources:
    C_scores[new_col] = C_scores.apply(lambda row, cvs_df=cvs_df: cvs_df.loc[cvs_df[cvs_df['studentid']==row['sid']].index[0],row['variable']],axis=1)

# 1-based position of this sim's letter inside the student's 'activity order' value.
C_scores['sim_index'] = C_scores.apply(
    lambda row: table_cvs_2_df.loc[table_cvs_2_df[table_cvs_2_df['studentid']==row['sid']].index[0],'activity order'].index(row['sim'])+1,
    axis=1)

## Putting it all together

In [None]:
# Stack the two per-sim tables and give the result a fresh 0..n-1 index.
data = pd.concat([L_scores,C_scores], ignore_index=True)
# data['intervals_in_table'] = data.apply(lambda row: table_intervals_df.loc[table_intervals_df[table_intervals_df['studentid']==row['sid']].index[0],row['variable']],axis=1)
# data['CVS_table_only'] = data.apply(lambda row: row['CVS_table']*(1-row['CVS_graph']), axis = 1)

## Adding wrapper use data

In [None]:
use_wrapper_df = pd.read_csv('use_wrapper_results.txt', sep='\t')
# Copy each student's use_table / use_graph value (first matching row of the results file)
# onto every row of that student in data.
for use_col in ['use_table','use_graph']:
    data[use_col] = data.apply(lambda row, use_col=use_col: use_wrapper_df.loc[use_wrapper_df[use_wrapper_df['studentid']==row['sid']].index[0],use_col], axis=1)
data[['use_table','use_graph']].describe()

Everyone uses the table once and 85% use the graph once. We may want to do the analysis only with students who used the graph

## Adding pre-survey data

In [None]:
# Show the available pre-survey columns (the next cell looks several of them up by name).
pre_survey_df.columns

In [None]:
# Each lookup takes the value from the first pre-survey row whose 'sid' matches the row's student.
data['level_experience_sims'] = data.apply(lambda row: pre_survey_df.loc[pre_survey_df[pre_survey_df['sid']==row['sid']].index[0],'prior_number_virtual_labs'], axis=1)
# data['experience_undergrad_labs'] = data.apply(lambda row: pre_survey_df.loc[pre_survey_df[pre_survey_df['sid']==row['sid']].index[0],'experience_undergrad_labs'], axis=1)
# 'similar_L' is only looked up for rows of sim L (and 'similar_C' for sim C); rows of the
# other sim get NaN from the index alignment and are then set to 0.
# The fillna result is assigned back: calling fillna(inplace=True) on data[col] may act on
# a temporary copy and leave data unchanged.
data['used_similar_sim_L'] = data[data['sim']=='L'].apply(lambda row: pre_survey_df.loc[pre_survey_df[pre_survey_df['sid']==row['sid']].index[0],'similar_L'],axis=1)
data['used_similar_sim_L'] = data['used_similar_sim_L'].fillna(0)
data['used_similar_sim_C'] = data[data['sim']=='C'].apply(lambda row: pre_survey_df.loc[pre_survey_df[pre_survey_df['sid']==row['sid']].index[0],'similar_C'],axis=1)
data['used_similar_sim_C'] = data['used_similar_sim_C'].fillna(0)
# Exactly one of the two columns can be non-zero on a given row, so the sum is the flag
# for the row's own sim.
data['used_similar_sim'] = data['used_similar_sim_L'] + data['used_similar_sim_C']

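We add a "CVS_levels" column to our data, computed as CVS_table + CVS_graph (2 = table and graph, 1 = only one of the two, 0 = none). The shapes printed below count the rows that have CVS in the graph but not in the table.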

In [None]:
data['CVS_levels_2']=0
data['CVS_levels_3']=0
data['CVS_levels_4']=0
data['CVS_levels_5']=0
# data['CVS_table_only'] = data.apply(lambda row: row['CVS_table']*(1-row['CVS_graph']), axis = 1)
data['CVS_levels_2']=data['CVS_table_2']+data['CVS_graph_2']
data['CVS_levels_3']=data['CVS_table_3']+data['CVS_graph_3']
data['CVS_levels_4']=data['CVS_table_4']+data['CVS_graph_4']
data['CVS_levels_5']=data['CVS_table_5']+data['CVS_graph_5']
print data[(data['CVS_table_2']==0)&(data['CVS_graph_2']==1)].shape
print data[(data['CVS_table_3']==0)&(data['CVS_graph_3']==1)].shape
print data[(data['CVS_table_4']==0)&(data['CVS_graph_4']==1)].shape
print data[(data['CVS_table_5']==0)&(data['CVS_graph_5']==1)].shape

In [None]:
# Eyeball the first 12 rows once sorted by student id.
data.sort_values('sid').head(12)

In [None]:
# Quick-look histograms of the columns of data.
data.hist()

## Exporting the data

In [None]:
# export_data = data[["sid","variable","pre","main","sim","CVS_graph","sim_index"]]
# Export every column of data (the commented line above would keep only a subset).
export_data = data.copy()
# Relative path: the file lands in the current working directory.
export_data.to_csv('dataframe_all_factors_by_student_x_variable.csv')

## Stats tools and multicollinearity check

In [None]:
def eta_squared(aov):
    """Add an ``eta_sq`` column to the ANOVA table ``aov`` in place and return it.

    For every row except the last one, eta squared is the row's ``sum_sq`` divided by
    the total of the ``sum_sq`` column (last row included). The last row gets NaN.
    """
    total_sum_sq = sum(aov['sum_sq'])
    # Only the first n-1 rows are computed; index alignment leaves the last row as NaN.
    aov['eta_sq'] = aov[:-1]['sum_sq'] / total_sum_sq
    return aov
 
def omega_squared(aov):
    """Add an ``omega_sq`` column to the ANOVA table ``aov`` in place and return it.

    The last row of ``aov`` is used as the error term: mse = sum_sq / df of that row.
    For every other row, omega squared is
    (sum_sq - df * mse) / (total sum_sq + mse); the last row gets NaN.
    """
    # Positional access with .iloc: the table is indexed by term labels, so [-1]
    # would be treated as a label lookup by current pandas.
    mse = aov['sum_sq'].iloc[-1] / aov['df'].iloc[-1]
    effects = aov.iloc[:-1]
    # Only the first n-1 rows are computed; index alignment leaves the last row as NaN.
    aov['omega_sq'] = (effects['sum_sq'] - (effects['df'] * mse)) / (sum(aov['sum_sq']) + mse)
    return aov

def clean_summary(model):
    """Return the text of ``model.summary()`` without the lines that mention ``C(sid)``,
    i.e. without the per-student terms."""
    summary_text = model.summary().as_text()
    kept_lines = []
    for line in summary_text.split('\n'):
        if 'C(sid)' in line:
            continue
        kept_lines.append(line)
    return '\n'.join(kept_lines)

# Results to present in methods
## Student population

In [None]:
# Number of distinct student ids in data; later cells divide by N to get percentages.
N = len(set(data['sid']))
print "The study includes {0} students".format(N)

In [None]:
posts = get_all_posts_surveys()
genders = ['gender','[gender] To which gender do you most identify? [Man]','[gender] To which gender do you most identify? [Gender non conforming/non-binary]','[gender] To which gender do you most identify? [Trans*]','[gender] To which gender do you most identify? [Rather specify:]','[gender] To which gender do you most identify? [Rather specify:] [text]','[gender] To which gender do you most identify? [Prefer not to answer]']
for g in genders:
    if '?' in g:
        gender = g.split('?')[1]
    else:
        gender = ' [Woman]'
    print gender, len(set(posts[posts[g]==1]['sid'])), round(len(set(posts[posts[g]==1]['sid']))/float(N)*100,1)

## Student prior experience

In [None]:
pd.pivot_table(data, values=['level_experience_sims'], index=['sid'],aggfunc=sum)[['level_experience_sims']].hist()
print '''where:
 0 -> None
 1 -> 1-2 (roughly)
 2 -> 3-5 (roughly)
 3 -> 6+ (roughly)'''

In [None]:
# print  'experience in virtual labs teaching light absorbance or capacitors before'
# Row-level summary statistics of the two flags.
print data[['used_similar_sim_C','used_similar_sim_L']].describe()
# Per-student histograms. NOTE(review): the flags are summed over all rows of a student,
# so a student can contribute more than 1 -- confirm this is the intended scale.
pd.pivot_table(data, values=['used_similar_sim_C','used_similar_sim_L'], index=['sid'],aggfunc=np.sum)[['used_similar_sim_C','used_similar_sim_L']].hist(sharey=True)

In [None]:
# Number of rows for each (sim_index, variable) combination.
pd.pivot_table(data, values=['sid'], index=['sim_index','variable'],aggfunc=len)

# Learning through inquiry
## Overall

Let's look at pre to post to see if students learn.

-> repeat stacked bar chart from worksheet data connector


### Descriptives
\# of statements per type if student does CVS
(pivot table)

In [None]:
# Row count, mean and standard deviation of the pre and main scores
# for each (variable, CVS_levels_3) group.
d = pd.pivot_table(data, values=['pre','main'], index=['variable','CVS_levels_3'],aggfunc=(len,np.mean, np.std))
# e= pd.pivot_table(data, values=['pre','main'], index=['variable','CVS_graph_3'],aggfunc=(np.mean, np.std))
d

# Does using strategy help student make model?

## Using different levels of CVS and model statements

We have 10 independent variables:
* sim_index
* sim
* variable
* level_experience_sims
* experience_undergrad_labs
* used_similar_sim
* CVS_context
* has_pre_id
* has_pre_qual
* has_pre_quant

And 3 dependent:
* has_main_id
* has_main_qual
* has_main_quant


*NOTE:* we don't include variable and sim in the same model

## Using different levels of CVS and model correctness

We have 10 control variables:
* sim_index
* sim
* variable
* level_experience_sims
* experience_undergrad_labs
* used_similar_sim
* CVS_context
* pre_id_corr
* pre_qual_corr
* pre_quant_corr

and 3 dependent:
* main_id_corr
* main_qual_corr
* main_quant_corr


*NOTE:* we don't include variable and sim in the same model

We first remove students with max pre for a certain variable

In [None]:
print data.shape
filtered_maxpre_data = data.copy()
filtered_maxpre_data = filtered_maxpre_data[filtered_maxpre_data['pre']<3]
#remove student who didn't do CVS-explore for 2 variables
print filtered_maxpre_data.shape
# print filtered_maxpre_data.describe()
filtered_maxpre_data = filtered_maxpre_data[(filtered_maxpre_data['sid']!= 11384795)|(filtered_maxpre_data['variable'].isin(['Width','Concentration']))]
# print filtered_maxpre_data[filtered_maxpre_data['sid']==11384795]
# filtered_maxpre_data = filtered_maxpre_data[(filtered_maxpre_data['sid']!= 11384795)&(filtered_maxpre_data['variable']=='Separation')]
print filtered_maxpre_data.shape
# print filtered_maxpre_data.describe()

In [None]:
# covariates_for_interaction_model = " + sim_index + C(sid) + level_experience_sims + experience_undergrad_labs + used_similar_sim"
# Right-hand-side terms appended to the model formulas below; C(sid) adds one categorical term per student.
covariates_for_parsimonious_model = "+ variable + pre + sim_index + C(sid) + level_experience_sims + used_similar_sim"

In [None]:
formula = 'main ~ C(CVS_levels_3) '+ covariates_for_parsimonious_model
model = ols(formula, filtered_maxpre_data).fit()
print "\n\nModel: ",formula
aov_table = anova_lm(model, typ=3)
eta_squared(aov_table)
omega_squared(aov_table)
print "\nAnova table using type 3\n"
print(aov_table)
print "\nHere is the linear model with coefficients and confidence intervals (removed stats for individual student ids):\n"
print clean_summary(model)

In [None]:
# aov_table = aov_table.round({'F': 2,'eta_sq':2})
# print tabulate(aov_table[['F','PR(>F)','eta_sq']].sort_values('F',ascending =False),tablefmt='latex')
# Round F, p-value and eta squared to 2 decimals, then print them as a LaTeX table
# sorted by decreasing F. Note that aov_table itself is replaced by its rounded version.
aov_table = aov_table.round({'F': 2, 'PR(>F)': 2,'eta_sq':2})
print tabulate(aov_table[['F','PR(>F)','eta_sq']].sort_values('F',ascending =False),tablefmt='latex')

In [None]:
# print(model.summary().as_latex())

## Sensitivity analysis on CVS criteria using BIC

In [None]:
# for CVS in ['CVS_levels_2','CVS_levels_3','CVS_levels_4','CVS_levels_5']:
#     formula = 'main ~ C('+CVS+')' + covariates_for_parsimonious_model
#     model = ols(formula, filtered_maxpre_data).fit()
#     print "\n\nModel: ",formula
#     aov_table = anova_lm(model, typ=3)
#     eta_squared(aov_table)
#     omega_squared(aov_table)
#     print "\nAnova table using type 3\n"
#     print(aov_table)
#     print "\nHere is the linear model with coefficients and confidence intervals (removed stats for individual student ids):\n"
#     print clean_summary(model)

These results were produced using AOV of linear regression with Type III SS

We think a threshold of 3 pts strikes a balance: 1) it measures more deliberateness than 2 pts, and 2) it is less stringent than using more points.

Here is the $R^2$ of each model (higher means more variance explained). AIC and BIC (lower means better model controlling for number of parameters) are not listed in the table below.


    no interactions
    #pts	R^2
    2		0.449
    3		0.447
    4		0.443
    5		0.428

None of these models are drastically different from each other, though 5pts is definitely the worst

We pick #pts = 3 and call it a day

In [None]:
results = [['Threshold','Concentration ','Width ','Area ','Separation','$R^2$']]
for t,r in zip([2,3,4,5],[0.449,0.447,0.443,0.428]):
    res = [t]
    res.extend([round(sum(data[data['variable']==v]['CVS_graph_'+str(t)])/float(N),2) for v in ['Concentration','Width','Area','Separation']])
    res.append(r)
    results.append(res)
print tabulate(results,tablefmt='latex')

### Models per variable

In [None]:
# Mean main score for each (variable, CVS_levels_3) group, drawn as a bar chart.
per_variable = pd.pivot_table(filtered_maxpre_data, values=['main'], index=['variable','CVS_levels_3'],aggfunc=(np.mean))
per_variable.plot(kind='bar')
print "This plot needs to be redone with standard deviation bars and organized by sim, colored by level"

In [None]:
# Covariates for the models fitted separately per variable: unlike the parsimonious list,
# there is no 'variable' and no C(sid) term, and use_graph is added.
covariates_for_variable_models = " + pre + sim_index + level_experience_sims + used_similar_sim + use_graph"

In [None]:
for variable in ['Width', 'Concentration','Area','Separation']:
    print "__________________________________\nFOR VARIABLE ", variable, '\n__________________________________'
    formula = 'main ~ C(CVS_levels_3)' + covariates_for_variable_models
    model = ols(formula, filtered_maxpre_data[filtered_maxpre_data['variable']==variable]).fit()
    print "\n\nModel: ",formula
    aov_table = anova_lm(model, typ=3)
    eta_squared(aov_table)
    omega_squared(aov_table)
    print "\nAnova table using type 3 errors\n"
    print(aov_table)
#     print "\nHere is the linear model with coefficients and confidence intervals:\n"
    print clean_summary(model)

### A closer look at CVS graph with inverse scale for Separation

In [None]:
graph_inverse_cvs_df = pd.read_csv('graph_inverse_cvs_df.txt', sep='\t')
# Flag is True when the 'Separation' value in this file is above 3.
graph_inverse_cvs_df['used_inverse_separation'] = graph_inverse_cvs_df['Separation'].gt(3)
# Duplicate the student id under the column name used as merge key in the next cell.
graph_inverse_cvs_df['sid'] = graph_inverse_cvs_df['studentid']

In [None]:
variable ='Separation'
temp = filtered_maxpre_data[filtered_maxpre_data['variable']==variable]
separation_data = temp.merge(graph_inverse_cvs_df, on=['sid'], how='inner')
print "__________________________________\nFOR VARIABLE ", variable, '\n__________________________________'
formula = 'main ~ C(CVS_levels_3)*used_inverse_separation' + covariates_for_variable_models
model = ols(formula, separation_data).fit()
print "\n\nModel: ",formula
aov_table = anova_lm(model, typ=3)
eta_squared(aov_table)
omega_squared(aov_table)
print "\nAnova table using type 3 errors\n"
print(aov_table)
#     print "\nHere is the linear model with coefficients and confidence intervals:\n"
print clean_summary(model)

In [None]:
# Distinct main scores among level-2 rows that did not use the inverse scale.
separation_data[(separation_data['CVS_levels_3']==2)&(separation_data['used_inverse_separation']==False)]['main'].unique()

In [None]:
fig,ax = plt.subplots(nrows=1,ncols=3,figsize=(15,5))
sns.countplot(data=separation_data[(separation_data['main']!=0)&(separation_data['CVS_levels_3']!=2)],y='main',ax=ax[0],label='main score',color='blue',alpha=0.7);
sns.countplot(data=separation_data[(separation_data['main']!=0)&(separation_data['CVS_levels_3']==2)&(separation_data['used_inverse_separation']==False)],y='main',ax=ax[1],label='main score',color='blue',alpha=0.7);
sns.countplot(data=separation_data[(separation_data['main']!=0)&(separation_data['CVS_levels_3']==2)&(separation_data['used_inverse_separation']==True)],y='main',ax=ax[2],label='main score',color='blue',alpha=0.7);
ax[0].set(xlabel='Number of students')
ax[1].set(xlabel='Number of students')
ax[2].set(xlabel='Number of students')
ax[0].set(ylabel='main score if didn\'t do CVS graph')
ax[1].set(ylabel='main score if did CVS graph')
ax[2].set(ylabel='main score if did CVS graph with inverse scale')
ax[0].set(yticklabels = ['Identify','Qualitative','Quantitative'])
ax[1].set(yticklabels = ['','','',''])
ax[2].set(yticklabels = ['','','',''])
ax[0].set(xlim=(0,60))
ax[1].set(xlim=(0,60))
ax[2].set(xlim=(0,60))
print "I removed the students with 'all incorrect/None' main score"

## Post-hoc analysis on levels using 3pts as threshold

In [None]:
for ignore in [0,1,2]:
    formula = 'main ~ C(CVS_levels_3)' + covariates_for_parsimonious_model
    model = ols(formula, filtered_maxpre_data[filtered_maxpre_data[CVS]!=ignore]).fit()
    print "\n\nModel: ",formula
    print "Comparing CVS levels ignoring level ",ignore
    aov_table = anova_lm(model, typ=3)
    eta_squared(aov_table)
    omega_squared(aov_table)
    print "\nAnova table using type 3 errors\n"
    print(aov_table)
    print clean_summary(model)

In [None]:
# Mean main score per CVS level, drawn as horizontal bars.
levels = pd.pivot_table(filtered_maxpre_data, values=['main'], index=['CVS_levels_3'],aggfunc=np.mean)
levels.plot(kind='barh')
print "This plot needs to be redone with standard deviation bars and colored by level and stars given the tests above"
# Hand-written significance markers for the three pairwise comparisons.
print '0-1 => -'
print '1-2 => *'
print '0-2 => ***'

In [None]:
fig,ax = plt.subplots(nrows=1,ncols=3,figsize=(12,5))
# One panel per CVS level. The loop variable is called `level`, not N: N holds the number
# of students and is used as a denominator by other cells, so it must not be overwritten.
for level in [0,1,2]:
    sns.countplot(data=filtered_maxpre_data[(filtered_maxpre_data['CVS_levels_3']==level)],y='main',ax=ax[level],label=str(level),color='blue',alpha=0.7);
    ax[level].set(ylabel='')
    ax[level].set(xlabel='')
    ax[level].set(yticklabels = ['','','',''])
    ax[level].set(xlim=(0,180))
# Only the left panel carries the y label and tick names; the middle one carries the x label.
ax[0].set(ylabel='main score')
ax[1].set(xlabel='Number of students')
ax[0].set(yticklabels = ['None-all incorrect','Identify','Qualitative','Quantitative']);

In [None]:
fig,ax = plt.subplots(nrows=1,ncols=4,figsize=(12,5))
# One panel per variable, restricted to CVS level 2 rows with a main score above 1.
# The panel index is called `col`, not N: N holds the number of students and is used as a
# denominator by other cells, so it must not be overwritten.
for col,v in enumerate(['Separation',"Area",'Concentration','Width']):
    sns.countplot(data=filtered_maxpre_data[(filtered_maxpre_data['main']>1)&(filtered_maxpre_data['CVS_levels_3']==2)&(filtered_maxpre_data['variable']==v)],y='main',ax=ax[col],label=str(col),color='blue',alpha=0.7);
    ax[col].set(ylabel='')
    ax[col].set(xlabel=v)
    ax[col].set(yticklabels = ['','','',''])
    ax[col].set(xlim=(0,50))
ax[0].set(ylabel='main score')
# ax[1].set(xlabel='Number of students')
ax[0].set(yticklabels = ['Qualitative','Quantitative']);
# ["Area",'Concentration','Separation','Width']

### Post-hoc analysis on experience in physics undergraduate labs

In [None]:
# posthoc_exp_physics_labs = pd.pivot_table(data, values=['main'], index=['experience_undergrad_labs'],aggfunc=(np.mean))
# posthoc_exp_physics_labs.plot(kind='bar')
# print "This plot needs to be redone with standard deviation bars and organized by sim, colored by level"

### Post-hoc analysis on order

In [None]:
# posthoc_sim_index = pd.pivot_table(data, values=['main'], index=['sim_index'],aggfunc=(np.mean))
# posthoc_sim_index.plot(kind='bar')
# print "This plot needs to be redone with standard deviation bars and organized by sim, colored by level"

### Post-hoc analysis on variable

In [None]:
# posthoc_variable = pd.pivot_table(data, values=['main'], index=['variable'],aggfunc=(np.mean))
# posthoc_variable.plot(kind='bar')
# print "This plot needs to be redone with standard deviation bars and organized by sim, colored by level"

Findings:
* CVS_levels=2 (graph) matters for all except Separation
* Pre matters for all except Width

## Repeat model for only the students who use the graph (85%)

In [None]:
formula = 'main ~ C(CVS_levels_3)' + covariates_for_parsimonious_model
model = ols(formula, filtered_maxpre_data[(filtered_maxpre_data[CVS]!=0)&(filtered_maxpre_data['use_graph']==1)]).fit()
print "\n\nModel: ",formula
aov_table = anova_lm(model, typ=3)
eta_squared(aov_table)
omega_squared(aov_table)
print "\nAnova table using type 3 errors\n"
print(aov_table)
print clean_summary(model)

# What affects use CVS-like inquiry strategies?

For some reason, adding students to the matrix turns it into a singular matrix and the analysis fails. Probably because two students are exactly identical?

## Overall, how much do they use it?

In [None]:
# From here on CVS_table / CVS_graph refer to the 3-point threshold.
data['CVS_table'] = data['CVS_table_3']
data['CVS_graph'] = data['CVS_graph_3']
# Per student: number of rows with CVS in the table and with CVS in the graph.
sums = pd.pivot_table(data, values=['CVS_table','CVS_graph'], index=['sid'], aggfunc=np.sum)

In [None]:
# (rows, columns) of the full table.
print data.shape

In [None]:
for threshold in ['2','3','4','5']:
    print '\nFor CVs with {0} pts as a threshold'.format(threshold)
    sums = pd.pivot_table(data, values=['CVS_table_'+threshold,'CVS_graph_'+threshold], index=['sid'], aggfunc=np.sum)
    once = len(sums[sums['CVS_table_'+threshold]>0])/float(N)*100
    all4 = len(sums[sums['CVS_table_'+threshold]==4])/float(N)*100
    print "   {0}% of students use CVS table once and {1}% do it for all variables.".format(int(once),int(all4))
    once = len(sums[sums['CVS_graph_'+threshold]>0])/float(N)*100
    all4 = len(sums[sums['CVS_graph_'+threshold]==4])/float(N)*100
    print "   {0}% of students use CVS graph once and {1}% do it for all variables.".format(int(once),int(all4))

## Are they consistent in their usage of CVS graph?

In [None]:
# Per (student, sim): number of rows with CVS in the graph, with both index levels
# turned back into ordinary columns.
sums2 = pd.pivot_table(data, values=['CVS_graph'], index=['sid','sim'], aggfunc=np.sum).reset_index()
# sums2.head()

In [None]:
# cvs_sim[i, j] = number of students who did CVS in the graph for i of the 2 caps
# variables and for j of the 2 light absorbance variables.
cvs_sim = np.zeros((3,3),dtype=int)
for i in range(3):
    caps_sids = set(sums2[(sums2['sim']=='C')&(sums2['CVS_graph']==i)]['sid'])
    for j in range(3):
        light_sids = set(sums2[(sums2['sim']=='L')&(sums2['CVS_graph']==j)]['sid'])
        cvs_sim[i,j] = len(caps_sids & light_sids)
# Attach the labels to the data instead of setting tick labels by hand: whether the first
# tick is the top or the bottom row depends on the seaborn version, so hand-set labels can
# end up reversed. Rows are listed from 2/2 down to 0/2, as in the flipped array used before.
fractions = ['0/2','1/2','2/2']
cvs_sim_labelled = pd.DataFrame(cvs_sim[::-1], index=fractions[::-1], columns=fractions)
ax = sns.heatmap(cvs_sim_labelled, annot=True, fmt="d")
plt.xlabel('Number of light absorbance variables')
plt.ylabel('Number of Caps variables')
plt.title('Map of student use of CVS in their graph for variables per simulation')

High usage is consistent. If they do CVS with 1 variable in one sim, they do it with 2 in the other (probably the second sim). We'll see how order makes a difference later.

## What affects use of CVS
### Model with interaction (killed)

In [None]:
# formula = 'CVS_graph ~ pre*variable + sim_index*variable + pre*sim_index + level_experience_sims + experience_undergrad_labs + used_similar_sim'
# print 'model: ', formula,'\n'
# from patsy import dmatrices
# Y, X = dmatrices(formula, data, return_type = 'dataframe')
# # print X.columns
# logit = Logit(Y, X)
# model = logit.fit()
# print model.summary()
# # note that stats model has no module for running an anova on a logistic regression model

### Parsimonious model

In [None]:
# Logistic regression of CVS_graph on the covariates below.
# NOTE(review): the formula uses experience_undergrad_labs, but the only visible line that
# adds this column to data is commented out in the pre-survey cell -- confirm the column
# exists, otherwise building the design matrices will fail on a fresh kernel.
formula = 'CVS_graph ~ pre + variable + sim_index +level_experience_sims + experience_undergrad_labs + used_similar_sim'
print 'model: ', formula,'\n'
from patsy import dmatrices
# Y is the outcome matrix and X the design matrix built from the formula.
Y, X = dmatrices(formula, data, return_type = 'dataframe')
# print X.columns
logit = Logit(Y, X)
model = logit.fit()
print model.summary()
# aov_table = anova_lm(model)
# eta_squared(aov_table)
# omega_squared(aov_table)
# # print "\nAnova table using type 2 errors\n"
# print(aov_table)

In order of what affects CVS_graph most:
* experience in undergraduate labs
* order
* prior knowledge

Having used a similar sim doesn't matter, variable doesn't matter and sim doesn't matter.

Experience in sims mattered before we included both physics and chem experience in labs

In [None]:
# Rich display of the summary of the most recently fitted model.
model.summary()

In [None]:
# Same summary as LaTeX source.
print model.summary().as_latex()

### post hoc of experience in undergraduate labs

In [None]:
# cvs_exp = pd.pivot_table(data, values=['CVS_graph'], index=['experience_undergrad_labs'],aggfunc=(np.mean,np.std))
# print "Here is prob that they do CVS graph depending on the experience_undergrad_labs"
# cvs_exp['CVS_graph']['mean'].plot.bar(yerr=cvs_exp['CVS_graph']['std'])

### post hoc of activity order

In [None]:
# cvs_exp = pd.pivot_table(data, values=['CVS_graph'], index=['sim_index'],aggfunc=(np.mean,np.std))
# print "Here is prob that they do CVS graph depending on sim_index"
# cvs_exp['CVS_graph']['mean'].plot.bar(yerr=cvs_exp['CVS_graph']['std'])

### post hoc of experience with sims

In [None]:
# cvs_exp = pd.pivot_table(data, values=['CVS_graph'], index=['level_experience_sims'],aggfunc=(np.mean,np.std))
# print "Here is prob that they do CVS graph depending on level_experience_sims"
# cvs_exp['CVS_graph']['mean'].plot.bar(yerr=cvs_exp['CVS_graph']['std'])
# print '''where:
#  0 -> None
#  1 -> 1-2 (roughly)
#  2 -> 3-5 (roughly)
#  3 -> 6+ (roughly)'''

### post hoc of pre

In [None]:
# cvs_pre = pd.pivot_table(data, values=['CVS_graph'], index=['pre'],aggfunc=(np.mean,np.std))
# print "Here is the avg pre score of students depending if they do CVS graph or not"
# cvs_pre['CVS_graph']['mean'].plot.bar(yerr=cvs_pre['CVS_graph']['std'])