In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Let printed frames show up to 999 rows and 999 columns before truncating.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

# Worksheets of the codebook workbook, in the order they are reported.
SHEET_NAMES = ["Pre & Post", "Weekly", "Daily"]

In [3]:
# Load the class data export; the file is read as Latin-1.
df = pd.read_csv("data/class_data.csv", encoding="latin-1")
# Subset of rows whose pre_studyinterest value is exactly "Yes".
df_interested = df.loc[df["pre_studyinterest"].eq("Yes")]

In [34]:
def build_counstruct_variable_dict(codebook, flag):
    """Build ``{construct: {name_stem: [candidate variable names]}}`` from a codebook sheet.

    The "marker" columns are the columns located after the first column and
    before the "Construct" column. A cell equal to ``"x"`` marks the row's
    "Name Stem" as present for that column.

    * If ``"pre"`` is one of the marker columns, only the column named ``flag``
      is consulted, and a marked stem yields ``[flag + "_" + stem]``.
    * Otherwise every marker column is consulted, and a marked stem yields
      ``stem + flag + str(column)`` for each marked column, in column order.

    Unmarked stems map to an empty list. Rows are grouped by "Construct"
    (rows with a missing construct are dropped by ``groupby``). Rows with a
    missing "Name Stem" are skipped, and if a stem is repeated within a
    construct only its first row is used.

    Parameters
    ----------
    codebook : pandas.DataFrame
        Must contain "Name Stem" and "Construct" columns.
    flag : str
        Prefix (pre/post layout) or infix (other layouts) used to build names.

    Returns
    -------
    dict
        Empty when the codebook has no rows with a construct.

    Raises
    ------
    ValueError
        If the codebook has no "Construct" column.
    KeyError
        If the pre/post layout is detected and ``flag`` is not a column.
    """
    columns = list(codebook.columns)
    appending = columns[1:columns.index("Construct")]
    is_pre_post = "pre" in appending

    construct_var_dict = {}
    # Iterating the groups directly (instead of groupby.apply with a
    # dict-returning lambda) yields a plain dict even for an empty codebook
    # and does not depend on how apply wraps non-pandas return values.
    for construct, group in codebook.groupby("Construct"):
        # First row per stem wins; rows without a stem cannot name a variable.
        rows = group.dropna(subset=["Name Stem"]).drop_duplicates(subset="Name Stem")
        stems = rows["Name Stem"].tolist()
        if is_pre_post:
            marks = rows[flag].tolist()
            stem_vars = {
                stem: ([flag + "_" + stem] if mark == "x" else [])
                for stem, mark in zip(stems, marks)
            }
        else:
            # Column-wise lists keep the lookup label-based even when the
            # marker columns are labelled with integers.
            marks_by_column = {a: rows[a].tolist() for a in appending}
            stem_vars = {
                stem: [stem + flag + str(a) for a in appending
                       if marks_by_column[a][pos] == "x"]
                for pos, stem in enumerate(stems)
            }
        construct_var_dict[construct] = stem_vars
    return construct_var_dict

def find_construct_variable(df, codebook, flag):
    """Map each construct's name stems to the variables that exist in ``df``.

    Candidate names come from ``build_counstruct_variable_dict(codebook, flag)``;
    the bare stem itself is appended as the last candidate. Only candidates
    that are column labels of ``df`` are kept, in candidate order.

    Returns a dict ``{construct: {name_stem: [existing column names]}}``; a
    stem with no matching column maps to an empty list.
    """
    candidates_by_construct = build_counstruct_variable_dict(codebook, flag)
    present = {}
    for construct, stems in candidates_by_construct.items():
        present[construct] = {}
        for var_root, candidates in stems.items():
            present[construct][var_root] = [
                name for name in candidates + [var_root] if name in df.columns
            ]
    return present

def print_non_exist_variable(df, to_ret):
    """Print every ``[construct, name_stem]`` pair whose variable list is empty.

    ``to_ret`` is a nested dict ``{construct: {name_stem: [variables]}}``.
    ``df`` is accepted but not used by this function. Output is a header
    line, one line per empty pair, then a blank-line separator.
    """
    print("Vars that cannot be found in df:  ")
    missing_pairs = [
        [construct, var_root]
        for construct, roots in to_ret.items()
        for var_root, found in roots.items()
        if len(found) == 0
    ]
    for pair in missing_pairs:
        print(pair)
    print('\n')

def print_var_null_percentage(df, verbal = False, codebook_path = "data/codebook.xlsx"):
    """Print, for every codebook sheet and construct, how much of each variable is missing.

    For each sheet in ``SHEET_NAMES`` the codebook is turned into a
    ``{construct: {name_stem: [columns found in df]}}`` mapping via
    ``find_construct_variable``, and one table per construct is printed with
    the columns ``var``, ``count`` (non-null values), ``null_count`` and
    ``null_percentage`` (a fraction of the row count, not multiplied by 100).

    The "Pre & Post" sheet is reported twice, once with flag ``"pre"`` and
    once with flag ``"post"``; every other sheet uses the lower-cased first
    letter of its name as the flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are checked for missing values.
    verbal : bool, default False
        When True, also print the ``[construct, name_stem]`` pairs for which
        no column was found in ``df``.
    codebook_path : str, default "data/codebook.xlsx"
        Location of the codebook workbook.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    def print_percentage(construct_var_dict):
        # One table per construct; a construct with no found variables
        # prints as an empty frame.
        for construct, roots in construct_var_dict.items():
            found_var = [var for found in roots.values() for var in found]
            df_construct_sub = df[sorted(found_var)]
            n_rows = df_construct_sub.shape[0]
            to_view = pd.DataFrame(df_construct_sub.count()).reset_index()
            to_view.columns = ["var", "count"]
            to_view["null_count"] = n_rows - to_view["count"]
            to_view["null_percentage"] = to_view["null_count"] / n_rows
            print("{}\n{}\n\n".format(construct, to_view))

    # Open and parse the workbook once; a list of sheet names returns a
    # dict of frames keyed by sheet name.
    codebooks = pd.read_excel(codebook_path, sheet_name=list(SHEET_NAMES))

    for sheet in SHEET_NAMES:
        print(sheet, '\n\n')
        df_codebook = codebooks[sheet]
        if sheet == "Pre & Post":
            construct_var_dict_pre = find_construct_variable(df, df_codebook, "pre")
            construct_var_dict_post = find_construct_variable(df, df_codebook, "post")
            if verbal:
                # A stem counts as missing only if neither the pre nor the
                # post lookup found a column for it.
                construct_var_dict = {
                    construct: {
                        root: found_pre + construct_var_dict_post[construct][root]
                        for root, found_pre in roots_pre.items()
                    }
                    for construct, roots_pre in construct_var_dict_pre.items()
                }
                print_non_exist_variable(df, construct_var_dict)
            print("Pre:  ")
            print_percentage(construct_var_dict_pre)
            print("\n\nPost:  ")
            print_percentage(construct_var_dict_post)
        else:
            construct_var_dict = find_construct_variable(df, df_codebook, sheet[0].lower())
            if verbal:
                print_non_exist_variable(df, construct_var_dict)
            print_percentage(construct_var_dict)
        
      

## Interested Students

In [35]:
# Missing-value report restricted to the rows in df_interested.
print_var_null_percentage(df_interested)

Pre & Post 


Pre:  
Academic Integration
Empty DataFrame
Columns: [var, count, null_count, null_percentage]
Index: []


Achievement Goals (Mastery approach)
           var  count  null_count  null_percentage
0  pre_agqmap1     99           0              0.0
1  pre_agqmap2     99           0              0.0


Achievement Goals (Mastery avoid)
            var  count  null_count  null_percentage
0  pre_agqmap10     94           5         0.050505
1   pre_agqmap5     96           3         0.030303


Achievement Goals (Performance approach)
            var  count  null_count  null_percentage
0  pre_agqmap17     83          16         0.161616
1   pre_agqmap6     95           4         0.040404


Achievement Goals (Performance avoid)
            var  count  null_count  null_percentage
0  pre_agqmap16     90           9         0.090909
1   pre_agqmap3     96           3         0.030303
2   pre_agqmap7     95           4         0.040404


Achievement Goals (Work avoidance)
            v

## All Students

In [5]:
# Missing-value report for the full frame.
# NOTE(review): the saved output below lists "Vars that cannot be found in df"
# even though verbal is left at its default of False, and this cell's
# execution count (5) is lower than that of the cell defining the function
# (34). The output looks stale; re-run after Restart & Run All to confirm.
print_var_null_percentage(df)

Pre & Post 


Vars that cannot be found in df:  
['Qualtrics Embedded Data', 'ip']
['Qualtrics Embedded Data', 'responseid']
['Qualtrics Embedded Data', 'recipientlastname']
['Qualtrics Embedded Data', 'recipientfirstname']
['Qualtrics Embedded Data', 'recipientemail']
['Self-Concept of Ability', 'abil_gen_bio1']
['Self-Concept of Ability', 'abil_gen_bio3']
['Self-Concept of Ability', 'abil_gen_chem1']
['Self-Concept of Ability', 'abil_gen_chem3']
['Student Entered Data', 'firstname']
['Student Entered Data', 'lastname']
['Student Entered Data', 'studentid']
['Student Entered Data', 'email']
['Study Participation', 'phone']


Academic Integration
         var  count  null_count  null_percentage
0  post_aca1    100          69         0.408284
1  post_aca2    100          69         0.408284
2  post_aca3    100          69         0.408284
3  post_aca4     99          70         0.414201
4  post_aca5     99          70         0.414201

Achievement Goals (Mastery approach)
            v