In [44]:
import pandas as pd
import numpy as np
from collections import defaultdict
import pickle

In [45]:
# Load the variable codebook from the Excel workbook.
# The workbook is opened in a `with` block so the underlying file handle is
# released as soon as the sheet has been parsed.
with pd.ExcelFile('Datasets/codebook_publicv4.xlsx') as codebook_xl:
    # header=3 / skiprows=[1] locate the header row; usecols keeps five of the
    # sheet's columns, the first of which becomes the index (index_col=0);
    # nrows=758 caps the number of data rows read.
    codebook_df = codebook_xl.parse('Sheet1', header=3, usecols=[0,1,3,4,5],
                                    nrows=758, index_col=0, skiprows=[1])

# Give the four remaining (non-index) columns short, code-friendly names.
codebook_df.columns = ['type','variable_description','response_values', 'response_labels']

In [46]:
# Variable names are the codebook index; descriptions are the matching column.
variables = codebook_df.index.values
descriptions = codebook_df.variable_description.values

# Every variable starts out mapped to None; later cells fill in label dicts.
var_map = dict.fromkeys(variables)

# Lookup table: variable name -> variable description.
var_descriptions = {name: text for name, text in zip(variables, descriptions)}

In [47]:
labels = codebook_df.response_labels

# Collapse multi-line label cells onto one line ('; '-separated).
# Cells that are not plain strings are passed through unchanged.
label_strings = []
for raw_label in labels:
    if type(raw_label) is str:
        label_strings.append(raw_label.replace('\n', '; '))
    else:
        label_strings.append(raw_label)

# Lookup table: variable name -> flattened label text.
var_labels = dict(zip(labels.index, label_strings))

In [48]:
# Boolean mask: True for rows whose response_values cell holds a string
is_string = codebook_df['response_values'].map(lambda cell: isinstance(cell, str))

# Rows selected by the mask, reused for each of the lookups below
string_rows = codebook_df.loc[is_string]

# The string-valued response_values themselves
response_values = string_rows['response_values']

# The response_labels belonging to those same rows
label_sets = string_rows['response_labels']

# Names (index entries) of the variables whose response_values are strings
str_response_vars = label_sets.index

In [49]:
# Break each newline-separated cell into a list: one list of values and one
# list of labels per selected codebook row
val_list = []
lab_list = []
for value_cell, label_cell in zip(response_values.values, label_sets.values):
    val_list.append(value_cell.split('\n'))
    lab_list.append(label_cell.split('\n'))

# One dictionary per row, pairing each response value with its corresponding label
label_map = [dict(zip(row_values, row_labels))
             for row_values, row_labels in zip(val_list, lab_list)]

# Attach each value -> label dictionary to the variable it belongs to
var_map.update(zip(str_response_vars, label_map))

In [50]:
# Mask: rows whose response_values cell is missing
null_values = codebook_df['response_values'].isna()

# Mask: rows whose response_labels cell is missing
null_labels = codebook_df['response_labels'].isna()

# Mask: rows whose type column is exactly 'Numeric'
is_numeric = codebook_df['type'].eq('Numeric')

# Mask: rows whose type column is exactly 'Character'
is_character = codebook_df['type'].eq('Character')

In [51]:
# Numeric variables that carry a label but have no enumerated response values
labelled_numeric = is_numeric & ~null_labels & null_values

numeric_labels = codebook_df.loc[labelled_numeric, 'response_labels']
non_null_labels = numeric_labels.index

# Store the label text under the single key 'Numeric' for each such variable
for var, label in numeric_labels.items():
    var_map[var] = {'Numeric': label}

In [52]:
# Character variables that have no enumerated response values.
character_without_values = is_character & null_values
null_character = codebook_df.loc[character_without_values].index
label_str = codebook_df.loc[character_without_values, 'response_labels'].values

# Map each label to itself for every such variable. Handling all matches
# (rather than only the first) means no matching variable is silently left
# as None, and an empty selection no longer raises IndexError.
for var, raw_labels in zip(null_character, label_str):
    if not isinstance(raw_labels, str):
        # No label text to split (e.g. a missing cell): leave the existing entry.
        continue
    label_list = raw_labels.split('\n')
    var_map[var] = dict(zip(label_list, label_list))

In [53]:
# Persist the variable -> {response value: label} mapping to disk.
with open('variable_map.pickle', 'wb') as out_file:
    pickle.dump(var_map, out_file, protocol=pickle.DEFAULT_PROTOCOL)

In [54]:
# Persist the variable -> description mapping to disk.
with open('variable_descriptions.pickle', 'wb') as out_file:
    pickle.dump(var_descriptions, out_file, protocol=pickle.DEFAULT_PROTOCOL)

In [55]:
# Persist the variable -> flattened label text mapping to disk.
with open('variable_labels.pickle', 'wb') as out_file:
    pickle.dump(var_labels, out_file, protocol=pickle.DEFAULT_PROTOCOL)