In [1]:
import pandas as pd
import numpy as np
# Raise the display limits so long and wide frames are shown without truncation.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

# Codebook worksheets that the report cells below iterate over.
SHEET_NAMES = ["Pre & Post", "Weekly", "Daily"]

In [2]:
# Load the class data export; the file is read as Latin-1.
df = pd.read_csv("data/class_data.csv", encoding = 'latin-1')
# Subset of rows whose `pre_studyinterest` column equals "Yes".
df_interested = df.loc[df["pre_studyinterest"].eq("Yes")]

In [60]:
# Construct the possible variable names for each Name Stem within its Construct group.
# NOTE: the caller in this notebook passes the lower-cased first letter of the
# sheet name as `flag` ('p', 'w' or 'd'); `flag` is ignored for the pre/post layout.
def build_counstruct_variable_dict(codebook, flag):
    """Map every Construct to its Name Stems and their candidate variable names.

    The codebook columns strictly between the first column and "Construct" are
    treated as occasion markers: a cell equal to "x" means the stem is expected
    on that occasion.

    * If one of the marker columns is named "pre", candidates are built as
      "<marker column>_<stem>" and `flag` is not used.
    * Otherwise candidates are built as "<stem><flag><marker column>".

    Parameters
    ----------
    codebook : pandas.DataFrame
        Must contain "Name Stem" and "Construct" columns, with the marker
        columns located between the first column and "Construct".
    flag : str
        Text inserted between the stem and the marker column name in the
        non-pre/post layout.

    Returns
    -------
    dict
        {construct: {name stem: [candidate variable names]}}. Rows without a
        Construct are left out (groupby drops missing keys), rows without a
        Name Stem are skipped, and for a stem repeated inside one construct the
        first row decides its markers.
    """
    columns = list(codebook.columns)
    appending = columns[1:columns.index("Construct")]

    # Pick the naming rule once instead of duplicating the whole traversal.
    if "pre" in appending:
        def make_name(stem, occasion):
            return occasion + "_" + stem
    else:
        def make_name(stem, occasion):
            return stem + flag + str(occasion)

    construct_var_dict = {}
    for construct, group in codebook.groupby("Construct"):
        # One pass over the rows; no per-stem re-filtering of the group.
        marks_per_row = group[appending].to_numpy().tolist()
        stem_to_vars = {}
        for stem, marks in zip(group["Name Stem"], marks_per_row):
            # A blank stem cannot name a variable; a repeated stem keeps its first row.
            if pd.isna(stem) or stem in stem_to_vars:
                continue
            stem_to_vars[stem] = [make_name(stem, occasion)
                                  for occasion, mark in zip(appending, marks) if mark == "x"]
        construct_var_dict[construct] = stem_to_vars
    return construct_var_dict

# Verify the possible variable names by keeping only those present in df's columns
def find_construct_variable(df, codebook, flag):
    """Return {construct: {name stem: [variable names found in df.columns]}}.

    For every stem the candidates produced by `build_counstruct_variable_dict`
    are checked first, followed by the bare stem itself; names that are not
    columns of `df` are dropped, so a stem with no match maps to an empty list.
    """
    candidates_by_construct = build_counstruct_variable_dict(codebook, flag)
    found_by_construct = {}
    for construct, stems in candidates_by_construct.items():
        found_for_stems = {}
        for var_root, candidates in stems.items():
            found_for_stems[var_root] = [name for name in candidates + [var_root]
                                         if name in df.columns]
        found_by_construct[construct] = found_for_stems
    return found_by_construct

def print_non_exist_variable(df, to_ret):
    """Print each [construct, name stem] pair whose list of found variables is empty.

    `to_ret` is a nested dict of the form {construct: {name stem: [variables]}}.
    `df` is accepted but not read by this function.
    """
    print("Name Stems that are not in df:  ")
    missing_pairs = [[construct, var_root]
                     for construct, stems in to_ret.items()
                     for var_root, found_vars in stems.items()
                     if len(found_vars) == 0]
    for pair in missing_pairs:
        print(pair)
    # Blank lines separate this listing from whatever is printed next.
    print('\n')

def build_var_null_percentage_df(df, sheet_name, verbal = False, overwrite = True, student_interested = False, writer = None,
                                 codebook_path = "data/codebook.xlsx"):
    """Count non-null responses for every codebook variable of one codebook sheet.

    Reads worksheet `sheet_name` from the codebook workbook, resolves each
    Name Stem to the variables that exist in `df` (via
    `find_construct_variable`) and counts the non-null values of each of them.
    Despite the function name, the result holds counts, not percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Response data whose columns are matched against the codebook.
    sheet_name : str
        Codebook worksheet to read. Its lower-cased first character is passed
        to `find_construct_variable` as the variable-name flag, and the value
        "Pre & Post" selects the multi-sheet report layout when writing.
    verbal : bool
        When True, print the sheet name and the Name Stems without a matching
        variable, and display the head of the count table (relies on `display`
        being available, as it is inside a notebook).
    overwrite : bool
        When True, write the report sheet(s) to `writer`.
    student_interested : bool
        Not used by this function; kept so existing calls keep working.
    writer : pandas.ExcelWriter or None
        Open Excel writer that receives the sheets; required when `overwrite`
        is True.
    codebook_path : str
        Path of the codebook workbook.

    Returns
    -------
    pandas.DataFrame
        Pivot table indexed by "Name Stem" with ("count", flag) columns, where
        flag is what remains of the variable name after removing the stem and
        underscores ("original" when nothing remains).

    Raises
    ------
    ValueError
        If `overwrite` is True but no `writer` was supplied.
    """
    # Fail early with a clear message instead of deep inside DataFrame.to_excel.
    if overwrite and writer is None:
        raise ValueError("overwrite=True requires an open pandas.ExcelWriter passed as `writer`")

    # Codebook columns attached to the counts in every report; defined before
    # the helpers below because they read it through the closure.
    codebook_vars = ["Name Stem", "Construct", "Item", "Response Values", "Label"]

    def construct_percentage_df(construct_var_dict):
        # One count pass over df, then plain records: avoids growing a frame
        # with pd.concat inside the loop (and seeding it with an empty frame).
        non_null_counts = df.count()
        records = []
        for construct, stems in construct_var_dict.items():
            for var_root, found_vars in stems.items():
                for var in found_vars:
                    # NOTE(review): str.replace removes every occurrence of the
                    # stem, so a stem that also occurs inside the added
                    # prefix/suffix would shorten the flag - confirm none do.
                    affix = var.replace(var_root, '').replace('_', '')
                    records.append({"var": var,
                                    "count": non_null_counts[var],
                                    "Name Stem": var_root,
                                    "Construct": construct,
                                    "flag": affix if affix else "original"})
        df_percentage_master = pd.DataFrame(records, columns=["var", "count", "Name Stem", "Construct", "flag"])
        # pivot raises ValueError if the same (Name Stem, flag) pair occurs twice,
        # e.g. when one stem is listed under two constructs.
        return df_percentage_master.pivot(index='Name Stem', columns="flag", values=["count"])

    def write_to_excel(df_percentage, df_codebook, sheet_name = sheet_name, writer = writer):
        df_percentage_level = df_percentage["count"].reset_index()
        df_percentage_level = df_percentage_level.merge(df_codebook[codebook_vars],
                                                       left_on = "Name Stem", right_on = "Name Stem", how = "left")
        if sheet_name == "Pre & Post":
            # One sheet per flag, each keeping only the stems counted under that flag.
            df_percentage_level[df_percentage_level["original"].notna()].drop(["post", "pre"], axis=1).to_excel(writer, sheet_name = "Pre&Post Original")
            df_percentage_level[df_percentage_level["post"].notna()].drop(["original", "pre"], axis=1).to_excel(writer, sheet_name = "Post")
            df_percentage_level[df_percentage_level["pre"].notna()].drop(["original", "post"], axis=1).to_excel(writer, sheet_name = "Pre")
            # Stems counted under both pre and post get a post/pre ratio.
            in_both = (df_percentage_level["pre"].notna()) & (df_percentage_level["post"].notna())
            df_percentage_level = df_percentage_level[in_both].drop(["original"], axis=1)
            df_percentage_level["retention_rate"] = df_percentage_level["post"] / df_percentage_level["pre"]
            df_percentage_level = df_percentage_level[["Name Stem", "pre", "post", "retention_rate", "Construct", "Item", "Response Values", "Label"]]
            df_percentage_level.to_excel(writer, sheet_name = "Pre & Post")
        else:
            df_percentage_level.to_excel(writer, sheet_name = sheet_name)

    def print_percentage_head(df_percentage, df_codebook):
        # Flatten the ("count", flag) column pairs into "<flag>_count" labels.
        df_percentage.columns = ['_'.join(col[::-1]).strip() for col in df_percentage.columns.values]
        df_percentage = df_percentage.reset_index().merge(df_codebook[codebook_vars],
                                                          left_on = "Name Stem", right_on = "Name Stem", how = "left")
        # Codebook columns first, count columns after them.
        df_percentage = df_percentage[codebook_vars + [c for c in df_percentage.columns if c not in codebook_vars]]
        display(df_percentage.head())

    df_codebook = pd.read_excel(codebook_path, sheet_name=sheet_name)
    construct_var_dict = find_construct_variable(df, df_codebook, sheet_name[0].lower())
    df_percentage = construct_percentage_df(construct_var_dict)

    if verbal:
        print(sheet_name, '\n\n')
        print_non_exist_variable(df, construct_var_dict)
        # The helper relabels columns in place, so hand it a copy.
        print_percentage_head(df_percentage.copy(), df_codebook)

    if overwrite:
        write_to_excel(df_percentage.copy(), df_codebook)

    return df_percentage
        

## Interested Students

In [61]:
# Report for the `df_interested` subset: one pass per codebook sheet, all written to one workbook.
fname = "data/student_interested_var_count.xlsx"
print("Total number of students:  ", len(df_interested))
with pd.ExcelWriter(fname) as writer:
    for sheet_name in SHEET_NAMES:
        build_var_null_percentage_df(df_interested, sheet_name,
                                     writer = writer, verbal = True, student_interested = True)

Total number of students:   99
Pre & Post 


Name Stems that are not in df:  
['Qualtrics Embedded Data', 'ip']
['Qualtrics Embedded Data', 'responseid']
['Qualtrics Embedded Data', 'recipientlastname']
['Qualtrics Embedded Data', 'recipientfirstname']
['Qualtrics Embedded Data', 'recipientemail']
['Self-Concept of Ability', 'abil_gen_bio1']
['Self-Concept of Ability', 'abil_gen_bio3']
['Self-Concept of Ability', 'abil_gen_chem1']
['Self-Concept of Ability', 'abil_gen_chem3']
['Student Entered Data', 'firstname']
['Student Entered Data', 'lastname']
['Student Entered Data', 'studentid']
['Student Entered Data', 'email']
['Study Participation', 'phone']




Unnamed: 0,Name Stem,Construct,Item,Response Values,Label,original_count,post_count,pre_count
0,aca1,Academic Integration,"Talk with faculty about academic matters, outs...",(1) never ... (2) once a quarter ... (3) twice...,frequency this quarter talking with faculty ac...,,70.0,
1,aca2,Academic Integration,Meet with an academic advisor concerning acade...,(1) never ... (2) once a quarter ... (3) twice...,frequency this quarter meeting with academic a...,,70.0,
2,aca3,Academic Integration,Meet with a student mentor concerning course a...,(1) never ... (2) once a quarter ... (3) twice...,frequency this quarter meeting with student me...,,70.0,
3,aca4,Academic Integration,Attend study groups outside of the classroom,(1) never ... (2) once a quarter ... (3) twice...,frequency this quarter attending study groups ...,,69.0,
4,aca5,Academic Integration,Have informal or social contacts with faculty ...,(1) never ... (2) once a quarter ... (3) twice...,frequency this quarter talking with faculty so...,,69.0,


Weekly 


Name Stems that are not in df:  
['Qualtrics Embedded Data', 'ip']
['Qualtrics Embedded Data', 'responseid']
['Qualtrics Embedded Data', 'recipientlastname']
['Qualtrics Embedded Data', 'recipientfirstname']
['Qualtrics Embedded Data', 'recipientemail']
['Student Entered Data', 'firstname']
['Student Entered Data', 'lastname']
['Student Entered Data', 'studentid']
['Student Entered Data', 'email']
['Study Participation', 'phone']




Unnamed: 0,Name Stem,Construct,Item,Response Values,Label,original_count,w2_count,w3_count,w4_count,w5_count
0,att3,Attainment Value,How important to your identity is it to be kno...,"slider: 1=Not at all important, 7=Very important",important to my identity to know cooking science,,80.0,71.0,67.0,65.0
1,badgradec,Grade Expectations - Course,Think about your grade in this course...what’s...,slider: 0-100%,worst course grade still satisfactory,,82.0,72.0,69.0,67.0
2,badgradef,Grade Expectations - Final,Think about your grade on the final...what’s t...,slider: 0-100%,worst final grade still satisfactory,,,,69.0,67.0
3,badgradem,Grade Expectations - Midterm,Think about your grade on the midterm...what’s...,slider: 0-100%,worst midterm grade still satisfactory,,82.0,,,
4,badgradeq,Grade Expectations - Quiz,Think about your grade on this week's review q...,slider: 0-100%,worst quiz grade still satisfactory,,82.0,72.0,69.0,67.0


Daily 


Name Stems that are not in df:  
['Qualtrics Embedded Data', 'ip']
['Qualtrics Embedded Data', 'responseid']
['Qualtrics Embedded Data', 'recipientlastname']
['Qualtrics Embedded Data', 'recipientfirstname']
['Qualtrics Embedded Data', 'recipientemail']
['Student Entered Data', 'firstname']
['Student Entered Data', 'lastname']
['Student Entered Data', 'studentid']
['Student Entered Data', 'email']
['Study Participation', 'phonemodel']
['Study Participation', 'phonecarrier']




Unnamed: 0,Name Stem,Construct,Item,Response Values,Label,d1_count,d2_count,d3_count,d4_count,d5_count,d6_count,original_count
0,dadd,Reflection on Previous Day Activities,Was there additional course-related activity y...,"0=No, 1=Yes",additional course activity yesterday,,62.0,63.0,62.0,61.0,47.0,
1,daddo,Reflection on Previous Day Activities,"If so, what else did you do?",open-ended,additional course activity - text,,5.0,12.0,6.0,7.0,10.0,
2,dcact1,Daily Course Activities,Please list all the course-related activities ...,open-ended,course activity 1 (day 1),,,,,,,60.0
3,dcact10,Daily Course Activities,Please list all the course-related activities ...,open-ended,course activity 10 (day 2),,,,,,,2.0
4,dcact11,Daily Course Activities,Please list all the course-related activities ...,open-ended,course activity 11 (day 3),,,,,,,57.0


## All Students

In [62]:
# Report for the full `df`: one pass per codebook sheet, all written to one workbook.
fname = "data/student_var_count.xlsx"
print("Total number of students:  ", len(df))
with pd.ExcelWriter(fname) as writer:
    for sheet_name in SHEET_NAMES:
        build_var_null_percentage_df(df, sheet_name,
                                     writer = writer, verbal = True, student_interested = False)

Total number of students:   169
Pre & Post 


Name Stems that are not in df:  
['Qualtrics Embedded Data', 'ip']
['Qualtrics Embedded Data', 'responseid']
['Qualtrics Embedded Data', 'recipientlastname']
['Qualtrics Embedded Data', 'recipientfirstname']
['Qualtrics Embedded Data', 'recipientemail']
['Self-Concept of Ability', 'abil_gen_bio1']
['Self-Concept of Ability', 'abil_gen_bio3']
['Self-Concept of Ability', 'abil_gen_chem1']
['Self-Concept of Ability', 'abil_gen_chem3']
['Student Entered Data', 'firstname']
['Student Entered Data', 'lastname']
['Student Entered Data', 'studentid']
['Student Entered Data', 'email']
['Study Participation', 'phone']




Unnamed: 0,Name Stem,Construct,Item,Response Values,Label,original_count,post_count,pre_count
0,aca1,Academic Integration,"Talk with faculty about academic matters, outs...",(1) never ... (2) once a quarter ... (3) twice...,frequency this quarter talking with faculty ac...,,100.0,
1,aca2,Academic Integration,Meet with an academic advisor concerning acade...,(1) never ... (2) once a quarter ... (3) twice...,frequency this quarter meeting with academic a...,,100.0,
2,aca3,Academic Integration,Meet with a student mentor concerning course a...,(1) never ... (2) once a quarter ... (3) twice...,frequency this quarter meeting with student me...,,100.0,
3,aca4,Academic Integration,Attend study groups outside of the classroom,(1) never ... (2) once a quarter ... (3) twice...,frequency this quarter attending study groups ...,,99.0,
4,aca5,Academic Integration,Have informal or social contacts with faculty ...,(1) never ... (2) once a quarter ... (3) twice...,frequency this quarter talking with faculty so...,,99.0,


Weekly 


Name Stems that are not in df:  
['Qualtrics Embedded Data', 'ip']
['Qualtrics Embedded Data', 'responseid']
['Qualtrics Embedded Data', 'recipientlastname']
['Qualtrics Embedded Data', 'recipientfirstname']
['Qualtrics Embedded Data', 'recipientemail']
['Student Entered Data', 'firstname']
['Student Entered Data', 'lastname']
['Student Entered Data', 'studentid']
['Student Entered Data', 'email']
['Study Participation', 'phone']




Unnamed: 0,Name Stem,Construct,Item,Response Values,Label,original_count,w2_count,w3_count,w4_count,w5_count
0,att3,Attainment Value,How important to your identity is it to be kno...,"slider: 1=Not at all important, 7=Very important",important to my identity to know cooking science,,103.0,91.0,83.0,80.0
1,badgradec,Grade Expectations - Course,Think about your grade in this course...what’s...,slider: 0-100%,worst course grade still satisfactory,,106.0,92.0,85.0,82.0
2,badgradef,Grade Expectations - Final,Think about your grade on the final...what’s t...,slider: 0-100%,worst final grade still satisfactory,,,,85.0,82.0
3,badgradem,Grade Expectations - Midterm,Think about your grade on the midterm...what’s...,slider: 0-100%,worst midterm grade still satisfactory,,106.0,,,
4,badgradeq,Grade Expectations - Quiz,Think about your grade on this week's review q...,slider: 0-100%,worst quiz grade still satisfactory,,106.0,92.0,85.0,81.0


Daily 


Name Stems that are not in df:  
['Qualtrics Embedded Data', 'ip']
['Qualtrics Embedded Data', 'responseid']
['Qualtrics Embedded Data', 'recipientlastname']
['Qualtrics Embedded Data', 'recipientfirstname']
['Qualtrics Embedded Data', 'recipientemail']
['Student Entered Data', 'firstname']
['Student Entered Data', 'lastname']
['Student Entered Data', 'studentid']
['Student Entered Data', 'email']
['Study Participation', 'phonemodel']
['Study Participation', 'phonecarrier']




Unnamed: 0,Name Stem,Construct,Item,Response Values,Label,d1_count,d2_count,d3_count,d4_count,d5_count,d6_count,original_count
0,dadd,Reflection on Previous Day Activities,Was there additional course-related activity y...,"0=No, 1=Yes",additional course activity yesterday,,72.0,77.0,77.0,75.0,56.0,
1,daddo,Reflection on Previous Day Activities,"If so, what else did you do?",open-ended,additional course activity - text,,7.0,16.0,6.0,9.0,13.0,
2,dcact1,Daily Course Activities,Please list all the course-related activities ...,open-ended,course activity 1 (day 1),,,,,,,77.0
3,dcact10,Daily Course Activities,Please list all the course-related activities ...,open-ended,course activity 10 (day 2),,,,,,,2.0
4,dcact11,Daily Course Activities,Please list all the course-related activities ...,open-ended,course activity 11 (day 3),,,,,,,65.0
