In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [492]:
# Read the tab-delimited coded-data export, then report the dtype of every
# column and the (rows, columns) shape of the frame.
csvfile0 = 'All coded data 2.csv'
df0 = pd.read_csv(csvfile0, delimiter='\t')
print(df0.dtypes)
print(df0.shape)

Student ID                                 int64
Sim                                        int64
Sim Topic                                 object
Q                                          int64
Statement                                 object
Task Definition (TD)                      object
Task Affect (TA)                          object
Virtual Lab Feedback (TF)                 object
Generate global ideas (GI)                object
Planning (P)*                             object
Exploration (E)                           object
Experimentation (EX)                      object
Collect data (EX-C)                       object
CVS (EX-CVS)                              object
Interpretation                            object
Feedback (F)                              object
Identifying relevant variables (IV)       object
Plot data (EX-P)                          object
Modelling (M)                             object
Qualitative data modelling                object
Quantitative data mo

In [493]:
# Turn every column into plain strings so that numeric columns and coded
# columns can be handled uniformly by the cells that follow.
#
# Non-object columns are formatted value by value: a present value is written
# as its integer text (1.0 -> '1') and a missing value as the literal 'nan'.
# This replaces a round trip through a -88 sentinel and
# replace('-88', np.nan), which turned a genuine -88 into NaN and could leave
# float-looking strings such as '1.0' in columns with missing cells
# (NOTE(review): presumably because replace re-infers a numeric dtype --
# confirm against the pandas version in use).
def _as_code_string(value):
    """Return 'nan' for a missing cell, otherwise the integer value as text."""
    if pd.isnull(value):
        return 'nan'
    return str(int(value))

for col in df0.columns:
    if df0[col].dtypes != 'object':
        df0[col] = df0[col].map(_as_code_string)

# Object columns (including their missing cells) are stringified here.
df0 = df0.astype(str)

In [494]:
# Preview the first rows of df0 after the string conversion in the previous cell.
df0.head()

Unnamed: 0,Student ID,Sim,Sim Topic,Q,Statement,Task Definition (TD),Task Affect (TA),Virtual Lab Feedback (TF),Generate global ideas (GI),Planning (P)*,...,Feedback (F),Identifying relevant variables (IV),Plot data (EX-P),Modelling (M),Qualitative data modelling,Quantitative data modelling,Drawing conclusions (C),Evaluation of domain knowledge (EV-K),Judgement of learning (JoL),Awareness of knowledge gaps (KG)
0,4356596,1,Absorbance,1,I identified the important factors in the lab ...,,,,,,...,1,1,,,,,,,,
1,4356596,1,Absorbance,2,I think it was good to be able to test and exp...,,,,,,...,1+,1+,0+,1+,,,,,,
2,7868168,1,Absorbance,1,I tested out the different variables to see wh...,1-,,,,,...,1,,1+,,,,,,,
3,7868168,1,Absorbance,2,I would spend more time taking specific trials...,,,,0+,,...,,,,,,,0+,,,
4,10105157,1,Absorbance,1,I tried different tests where I doubled certai...,,,,,,...,1,1,1,,,1.0,,,,


In [495]:
# Split the column index around position 4: the first four columns form the
# student-info headers and every column from position 5 onwards is a code
# header (the column at position 4 belongs to neither group).
student_info_headers = df0.columns[:4]
code_headers = df0.columns[5:]
print('%s %s' % (student_info_headers, code_headers))

Index([u'Student ID', u'Sim', u'Sim Topic', u'Q'], dtype='object') Index([u'Task Definition (TD)', u'Task Affect (TA)',
       u'Virtual Lab Feedback (TF)', u'Generate global ideas (GI)',
       u'Planning (P)*', u'Exploration (E)', u'Experimentation (EX)',
       u'Collect data (EX-C)', u'CVS (EX-CVS)', u'Interpretation',
       u'Feedback (F)', u'Identifying relevant variables (IV)',
       u'Plot data (EX-P)', u'Modelling (M)', u'Qualitative data modelling',
       u'Quantitative data modelling', u'Drawing conclusions  (C)',
       u'Evaluation of domain knowledge (EV-K)',
       u'Judgement of learning (JoL)', u'Awareness of knowledge gaps (KG)'],
      dtype='object')


### Clean data by removing students that don't match worksheet data and matching typos in IDs

In [496]:
# The expected number of rows per student is 4 (2 statements per sim).
# Print every student ID whose row count differs from 4, with that count.
rows_per_student = df0['Student ID'].value_counts()
students = df0['Student ID'].unique()
for student in students:
    n_rows = rows_per_student[student]
    if n_rows != 4:
        print('%s %s' % (student, n_rows))

7868168 2
11691167 6
13493169 2
17595160 2
17931169 8
23784336 8
31607164 2
13052167 6
13615168 2
84135167 2
13165168 2
83145167 2
17597160 2
17868168 2
36107164 2


In [497]:
# Show the student-info columns plus the statement text for every row whose ID is '17931169'.
df0[df0['Student ID']=='17931169'][list(student_info_headers) +['Statement']]

Unnamed: 0,Student ID,Sim,Sim Topic,Q,Statement
138,17931169,1,Absorbance,1,I tested different variables randomly trying t...
139,17931169,1,Absorbance,2,I liked that the lab was sort of like a game. ...
348,17931169,2,Absorbance,1,I think I learned more in the 2nd lab because ...
349,17931169,2,Absorbance,2,Testing out different solution concentrations ...
546,17931169,1,Capacitance,1,I'm not very good at physics but the virtual l...
547,17931169,1,Capacitance,2,Testing different factors and being able to se...
746,17931169,2,Capacitance,1,I played around with the plate separation and ...
747,17931169,2,Capacitance,2,Being able to test different variables and imm...


In [498]:
# Load the tab-delimited worksheet dataset. The 'sid' column is cast to str so
# it can be compared with the string IDs in df0['Student ID'].
csvfile_ws = 'dataframe_all_factors_for_analysis.txt'
dfws = pd.read_csv(csvfile_ws, delimiter='\t')
dfws['sid'] = dfws['sid'].astype(str)

# Number of distinct students in the worksheet dataset
print(dfws['sid'].nunique())

# Worksheet students that have no row in df0
print(set(dfws['sid']) - set(df0['Student ID']))

147
set(['10192168', '10561164'])


#### The following students are the same:
* discovered these matches from within the reflection dataset (double checked with IP in raw data), map to new ID that is consistent with ID in worksheet dataset

{7868168:17868168, 84135167:83145167, 17595160:17597160, 31607164:36107164, 13615168:13165168}

* discovered these matches across worksheet and reflection dataset

{'561164':'10561164', '192168':'10192168'}

#### The following describes the resolutions for the other problematic ids
* 11691167 delete blank one with 99 and capacitor becomes order 1 determined by timestamps and reflection content
* 13493169 missing data for second sim, not sure why, checked IP and other conditions for that date
* 17931169 and 23784336 duplicates but the two members in each pair were in different condition so can easily distinguish (17931169 that became 17931168 was part of the March session for which we lost log data)
* 13052167 somehow duplicated while coding: delete one of duplicate entries for absorbance
* 19196162 somehow not coded but was in original dataset for coding

In [499]:
# Re-label four of the rows stored under 17931169 as 17931168, as described
# in the notes above. Rows are chosen by their index labels.
rows_for_17931168 = [138, 139, 746, 747]
df0.loc[rows_for_17931168, 'Student ID'] = '17931168'

In [500]:
# Remove the empty rows for student 11691167 (index labels 262 and 263), in place.
df0.drop(labels=[262, 263], axis=0, inplace=True)

In [501]:
# Mistyped student IDs (from the worksheet comparison and from within the
# survey data) mapped onto the ID each should carry.
SIDerrorMapper = {
    '561164': '10561164',
    '192168': '10192168',
    '7868168': '17868168',
    '84135167': '83145167',
    '17595160': '17597160',
    '31607164': '36107164',
    '13615168': '13165168',
}

for wrong_id, right_id in SIDerrorMapper.items():
    df0.loc[df0['Student ID'] == wrong_id, 'Student ID'] = right_id

# Re-check: prints the worksheet IDs that still have no row in df0
print(set(dfws['sid']) - set(df0['Student ID']))

set([])


In [502]:
# Keep only the rows whose student ID also occurs in the worksheet dataset
worksheet_ids = list(dfws['sid'])
in_worksheet = df0['Student ID'].isin(worksheet_ids)
df1 = df0[in_worksheet]
# Number of distinct students left, for comparison with the worksheet count
print(df1['Student ID'].nunique())

147


In [503]:
# dfmerged = df1.groupby(['Student ID', 'Sim', 'Sim Topic']).agg(lambda x: ';'.join(x)).reset_index()
# dfmerged.head()

In [504]:
# dfmerged = dfmerged[list(student_info_headers)+list(code_headers)]
# dfmerged.to_csv(path_or_buf='all_coded_data_merged.csv', sep=',',index=None)

In [505]:
# Build the sign-only view of the code columns: every '0' and '1' character is
# removed from each cell ('1+' -> '+'), and a cell that ends up empty becomes
# the literal 'nan'.
#
# The characters are filtered in plain Python because the former
# .str.translate(None, deletechars='01') call depends on the Python 2
# str.translate signature and on a pandas argument that later releases dropped.
def _drop_digit_flags(code):
    """Return `code` without '0'/'1' characters, or 'nan' if nothing is left."""
    if pd.isnull(code):
        # Missing cells are passed through untouched.
        return code
    remaining = ''.join(ch for ch in code if ch not in '01')
    return remaining if remaining else 'nan'

dfvals = df1.copy()
dfvals[code_headers] = df1[code_headers].apply(
    lambda column: column.map(_drop_digit_flags))
# dfvals.head()

In [506]:
# One row per (Student ID, Sim, Sim Topic): within each group the strings of
# every remaining column are joined with ';'.
_vals_by_sim = dfvals.groupby(['Student ID', 'Sim', 'Sim Topic'])
dfvals_merged = _vals_by_sim.agg(lambda cells: ';'.join(cells))
# dfvals_merged.head()

In [507]:
# Collapse each joined pair of signs to a single value: equal signs are kept,
# a sign paired with 'nan' wins, opposite signs become 'm', and a pair of
# 'nan's becomes a real NaN.
mergemap = {
    'nan;nan': np.nan,
    '+;+': '+',
    '-;-': '-',
    '-;+': 'm',
    '+;-': 'm',
    'nan;+': '+',
    '+;nan': '+',
    'nan;-': '-',
    '-;nan': '-',
}
dfvals_merged.replace(to_replace=mergemap, inplace=True)
dfvals_merged.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Q,Statement,Task Definition (TD),Task Affect (TA),Virtual Lab Feedback (TF),Generate global ideas (GI),Planning (P)*,Exploration (E),Experimentation (EX),Collect data (EX-C),...,Feedback (F),Identifying relevant variables (IV),Plot data (EX-P),Modelling (M),Qualitative data modelling,Quantitative data modelling,Drawing conclusions (C),Evaluation of domain knowledge (EV-K),Judgement of learning (JoL),Awareness of knowledge gaps (KG)
Student ID,Sim,Sim Topic,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
10127163,1,Absorbance,1;2,I initially tried messing with the different f...,,,,,,+,,,...,,,+,,,,,,,
10127163,2,Capacitance,1;2,"Other than the topics explored, I do not think...",,,,,,,+,+,...,,,+,,,,,,,
10192168,1,Capacitance,1;2,I looked for patterns and made a graph\;I noti...,,,,,,,,,...,,,,+,,,,,,
10192168,2,Absorbance,1;2,"I liked the first lab more, but this lab was i...",,+,,,,,+,,...,,,+,,,,,,,
10232160,1,Absorbance,1;2,I learned about physics through changing vario...,,,,,,,+,,...,,,,,,,,,,


In [508]:
# Build the number-only view of the code columns: every '+' and '-' character
# is removed from each cell ('1+' -> '1').
#
# The characters are filtered in plain Python because the former
# .str.translate(None, deletechars='+-') call depends on the Python 2
# str.translate signature and on a pandas argument that later releases dropped.
def _drop_signs(code):
    """Return `code` without any '+' or '-' characters."""
    if pd.isnull(code):
        # Missing cells are passed through untouched.
        return code
    return ''.join(ch for ch in code if ch not in '+-')

dfnums = df1.copy()
dfnums[code_headers] = df1[code_headers].apply(
    lambda column: column.map(_drop_signs))
# dfnums.head()

In [509]:
# One row per (Student ID, Sim, Sim Topic): within each group the strings of
# every remaining column are joined with ';'.
_nums_by_sim = dfnums.groupby(['Student ID', 'Sim', 'Sim Topic'])
dfnums_merged = _nums_by_sim.agg(lambda cells: ';'.join(cells))
# dfnums_merged.head()

In [510]:
# Collapse each joined pair of 0/1 flags to a single number: the result is 1
# if either side is '1', 0 if only '0's are present, and a real NaN when both
# sides are 'nan'.
mergemap = {
    'nan;nan': np.nan,
    '1;1': 1,
    '0;1': 1,
    '1;0': 1,
    'nan;1': 1,
    '1;nan': 1,
    '0;0': 0,
    'nan;0': 0,
    '0;nan': 0,
}
dfnums_merged.replace(to_replace=mergemap, inplace=True)
dfnums_merged.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Q,Statement,Task Definition (TD),Task Affect (TA),Virtual Lab Feedback (TF),Generate global ideas (GI),Planning (P)*,Exploration (E),Experimentation (EX),Collect data (EX-C),...,Feedback (F),Identifying relevant variables (IV),Plot data (EX-P),Modelling (M),Qualitative data modelling,Quantitative data modelling,Drawing conclusions (C),Evaluation of domain knowledge (EV-K),Judgement of learning (JoL),Awareness of knowledge gaps (KG)
Student ID,Sim,Sim Topic,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
10127163,1,Absorbance,1;2,I initially tried messing with the different f...,,,,,,1.0,,1.0,...,,1.0,1.0,,1.0,1.0,,,,
10127163,2,Capacitance,1;2,"Other than the topics explored, I do not think...",1.0,,,,,,0.0,1.0,...,,1.0,1.0,1.0,,,,,1.0,1.0
10192168,1,Capacitance,1;2,I looked for patterns and made a graph\;I noti...,,,,,,,,,...,,,1.0,0.0,1.0,,,,,
10192168,2,Absorbance,1;2,"I liked the first lab more, but this lab was i...",,1.0,,,,,0.0,,...,,,1.0,,1.0,,,,1.0,
10232160,1,Absorbance,1;2,I learned about physics through changing vario...,1.0,,,,,1.0,1.0,,...,,1.0,,,,1.0,,,1.0,


In [511]:
def mergecolumns(dfm, dataname, newcol, colname1, colname2):
    """Combine two coded columns of `dfm` into a new column `newcol`.

    For every row the two cells are written as text (missing cells as 'nan',
    whole-number floats without their trailing '.0'), joined as
    '<first>;<second>', and looked up in a merge map. Combinations that are
    not in the map are stored unchanged as the joined string.

    Parameters
    ----------
    dfm : pandas.DataFrame
        Frame holding `colname1` and `colname2`. It is modified in place
        (the column `newcol` is added or overwritten).
    dataname : str
        'numbers' for 0/1 flags (result is 1 if either side is 1, else 0) or
        'values' for '+'/'-'/'m' signs (disagreeing signs, or any 'm',
        give 'm').
    newcol : str
        Name of the column that receives the merged result.
    colname1, colname2 : str
        Names of the two columns to combine.

    Returns
    -------
    pandas.DataFrame
        The same `dfm` object, for call chaining.

    Raises
    ------
    ValueError
        If `dataname` is neither 'numbers' nor 'values'. The check runs before
        `dfm` is touched, so a failed call leaves the frame unchanged.
    """
    if dataname == 'numbers':
        mergemap = {'nan;nan': np.nan, '1;1': 1, '0;1': 1, '1;0': 1, '0;0': 0,
                    'nan;1': 1, '1;nan': 1, 'nan;0': 0, '0;nan': 0}
    elif dataname == 'values':
        mergemap = {'nan;nan': np.nan, '+;+': '+', '-;+': 'm', '+;-': 'm', '-;-': '-',
                    'nan;+': '+', '+;nan': '+', 'nan;-': '-', '-;nan': '-',
                    'm;+': 'm', '+;m': 'm', 'm;-': 'm', '-;m': 'm',
                    'm;nan': 'm', 'nan;m': 'm', 'm;m': 'm'}
    else:
        # The previous code called an undefined `error` helper here, which
        # surfaced as a NameError instead of a meaningful exception.
        raise ValueError('unrecognized df identifier name: must be numbers or values')

    def _as_token(cell):
        # Missing cells use the same 'nan' spelling as the merge-map keys.
        if pd.isnull(cell):
            return 'nan'
        text = str(cell)
        # Only a trailing '.0' is dropped, so 1.0 -> '1' while '1.05' is kept.
        return text[:-2] if text.endswith('.0') else text

    merged = []
    for first, second in zip(dfm[colname1], dfm[colname2]):
        token = _as_token(first) + ';' + _as_token(second)
        merged.append(mergemap.get(token, token))

    # Only the new column is written; other columns of dfm are never rewritten.
    dfm[newcol] = merged
    return dfm

In [512]:
# Combine related number-coded columns, as (new column, first input, second
# input). The order matters: 'EX-C' and 'Ql-Qt' are created before the merges
# that take them as inputs.
number_merges = [
    ('TD-TA', 'Task Definition (TD)', 'Task Affect (TA)'),
    ('EX-C', 'Experimentation (EX)', 'Collect data (EX-C)'),
    ('EX-C-CVS', 'EX-C', 'CVS (EX-CVS)'),
    ('Ql-Qt', 'Qualitative data modelling', 'Quantitative data modelling'),
    ('M-Ql-Qt', 'Ql-Qt', 'Modelling (M)'),
    ('F-P', 'Feedback (F)', 'Plot data (EX-P)'),
]
for merged_name, first_col, second_col in number_merges:
    dfnums_merged = mergecolumns(dfnums_merged, 'numbers', merged_name, first_col, second_col)

In [513]:
# Combine related sign-coded columns, as (new column, first input, second
# input). The order matters: 'EX-C' and 'Ql-Qt' are created before the merges
# that take them as inputs.
value_merges = [
    ('TD-TA', 'Task Definition (TD)', 'Task Affect (TA)'),
    ('EX-C', 'Experimentation (EX)', 'Collect data (EX-C)'),
    ('EX-C-CVS', 'EX-C', 'CVS (EX-CVS)'),
    ('Ql-Qt', 'Qualitative data modelling', 'Quantitative data modelling'),
    ('M-Ql-Qt', 'Ql-Qt', 'Modelling (M)'),
    ('F-P', 'Feedback (F)', 'Plot data (EX-P)'),
]
for merged_name, first_col, second_col in value_merges:
    dfvals_merged = mergecolumns(dfvals_merged, 'values', merged_name, first_col, second_col)

In [514]:
# Rename the worksheet key columns to 'Student ID' / 'Sim Topic' and spell out
# the topic codes ('L' -> 'Absorbance', 'C' -> 'Capacitance').
#
# The replaced column is assigned back explicitly instead of calling
# replace(..., inplace=True) on the dfws['sim'] selection, which is chained
# assignment and is not guaranteed to write through to dfws. Renaming first
# also keeps the cell safe to re-run: rename ignores labels that no longer
# exist, and replacing already spelled-out values changes nothing.
dfws = dfws.rename(columns={'sid':'Student ID', 'sim':'Sim Topic'})
dfws['Sim Topic'] = dfws['Sim Topic'].replace({'L':'Absorbance','C':'Capacitance'})

In [515]:
# Show all worksheet rows for student '10420167' as a spot check of the renamed columns.
dfws[dfws['Student ID']=='10420167']

Unnamed: 0,Student ID,Sim Topic,variable,pre,main,cvs_graph,cvs_table,qual_score,quant_score,activity_order,...,pre_with_ident,main_with_ident,CVS_context,use_table,use_graph,use_concentration,use_width,use_area,use_separation,use_all_vars
16,10420167,Absorbance,Concentration,0.0,2.0,1,1,1.0,1.0,CL,...,1.0,3.0,2,1,1,1,1,1,1,4
17,10420167,Absorbance,Width,0.0,2.0,1,1,0.0,0.0,CL,...,0.0,3.0,2,1,1,1,1,1,1,4
18,10420167,Capacitance,Area,1.0,2.0,1,1,1.0,1.0,CL,...,2.0,3.0,2,1,1,1,1,1,1,4
19,10420167,Capacitance,Separation,1.0,1.0,1,1,1.0,0.0,CL,...,2.0,2.0,2,1,1,1,1,1,1,4


In [516]:
# Average the 'pre' and 'main' scores over the rows of each student and sim topic.
#
# Columns are picked by name rather than with iloc[:, 0:5]. The positional
# slice also pulled in the text column 'variable', which only worked where
# pandas silently drops non-numeric columns from mean(), and it would select
# the wrong columns if the file layout changed.
score_columns = ['Student ID', 'Sim Topic', 'pre', 'main']
model_scores = dfws[score_columns].groupby(['Student ID','Sim Topic']).mean().reset_index()

In [517]:
# Preview the per-student, per-topic mean scores.
model_scores.head()

Unnamed: 0,Student ID,Sim Topic,pre,main
0,10127163,Absorbance,0.0,2.0
1,10127163,Capacitance,2.0,2.0
2,10192168,Absorbance,0.0,1.0
3,10192168,Capacitance,0.0,1.0
4,10232160,Absorbance,0.0,0.0


In [524]:
# Attach the averaged model scores to each merged coding table. The inner
# join on student and sim topic keeps only rows found on both sides.
merge_keys = ['Student ID', 'Sim Topic']
dfvals_merged2 = dfvals_merged.reset_index().merge(model_scores, how='inner', on=merge_keys)
dfnums_merged2 = dfnums_merged.reset_index().merge(model_scores, how='inner', on=merge_keys)
# print test2.head()

In [525]:
# Write both merged tables to comma-separated files without the row index.
for merged_frame, out_path in [
    (dfvals_merged2, 'all_coded_data_VALUES_merged_withModelScores.csv'),
    (dfnums_merged2, 'all_coded_data_NUMBERS_merged_withModelScores.csv'),
]:
    merged_frame.to_csv(path_or_buf=out_path, sep=',', index=None)