In [1]:
# coding=utf-8
%config InlineBackend.figure_format ='retina'
%matplotlib inline

import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

pd.set_option('display.max_rows', 10)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Set theme
sns.set_style('darkgrid')

# constants
year = 2011

state_fips = {'AK': 2, 'AL': 1, 'AR': 5, 'AS': 60, 'AZ': 4, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11,
              'DE': 10, 'FL': 12, 'GA': 13, 'GU': 66, 'HI': 15, 'IA': 19, 'ID': 16, 'IL': 17,
              'IN': 18, 'KS': 20, 'KY': 21, 'LA': 22, 'MA': 25, 'MD': 24, 'ME': 23, 'MI': 26,
              'MN': 27, 'MO': 29, 'MS': 28, 'MT': 30, 'NC': 37, 'ND': 38, 'NE': 31, 'NH': 33,
              'NJ': 34, 'NM': 35, 'NV': 32, 'NY': 36, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42,
              'PR': 72, 'RI': 44, 'SC': 45, 'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VA': 51,
              'VI': 78, 'VT': 50, 'WA': 53, 'WI': 55, 'WV': 54, 'WY': 56, 'all': -1}

big_10 = [145637, 147767, 151351, 153658, 163286, 170976, 171100,
          174066, 181464, 186380, 204796, 214777, 240444, 243780]

regents = [181464, 153658, 153603, 126614,
           126818, 145637, 155317, 174066,
           178396, 204796, 243780]

# local functions
def item_recode(col, codings, default_value = None):
    if default_value == None:
        answer = col.map(codings, na_action = 'ignore')
    else:
        answer = col.map(codings, na_action = 'ignore').fillna(default_value)  
    return(answer)

In [2]:
try:
    spec = 'data/nsf_{}.pickle'.format(year)
    print('Reading data for fiscal year ending {}:\n\t{}... '.format(year, spec), end='', flush=True)
    with open(spec, 'rb') as f:
        herd = pickle.load(f)
except Exception as e:
    print('ERROR.\nFile not downloaded properly.\n\n{}\n'.format(str(e)))
else:
    print('DONE.')
    herd.info()

Reading data for fiscal year ending 2011:
	data/nsf_2011.pickle... DONE.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263977 entries, 0 to 263976
Data columns (total 23 columns):
inst_id                      263977 non-null int32
year                         263977 non-null int16
ncses_inst_id                260216 non-null object
ipeds_unitid                 251188 non-null float32
hbcu_flag                    263977 non-null bool
med_sch_flag                 263977 non-null object
hhe_flag                     263977 non-null bool
toi_code                     263977 non-null int8
hdg_code                     263977 non-null int8
toc_code                     263977 non-null int8
inst_name_long               263977 non-null object
inst_city                    263977 non-null object
inst_state_code              263977 non-null object
inst_zip                     263740 non-null object
questionnaire_no             263977 non-null object
question                     263977 non-null ob

In [3]:
# set date key
date_key = '{}-06-30'.format(year)

# modify data frame to apply needed fixes
herd['date_key'] = date_key

herd['med_sch_flag'] = herd.med_sch_flag.isin(['T','TRUE', 'True', 'true', 't', 'Y', 'Yes', '1'])

herd['toi_code'] = herd.toi_code == 1

toc = {1: 'Public',
       2: 'Private'}

herd['toc_code'] = item_recode(herd['toc_code'], toc, 'Unknown')
herd['inst_state_code'] = item_recode(herd['inst_state_code'], state_fips, -1)

herd.ipeds_unitid = herd.ipeds_unitid.fillna(-1).astype(int)
herd.ncses_inst_id = herd.ncses_inst_id.fillna('XXXXXXXX')

herd['data'] = herd.data.fillna(0) * 1000

herd = herd.rename(columns = {'inst_name_long': 'institution_name',
                              'ipeds_unitid': 'unitid',
                              'inst_state_code': 'state_fips',
                              'toc_code': 'control',
                              'toi_code': 'academic_institution',
                              'med_sch_flag': 'medical_school_flag',
                              'hbcu_flag': 'hbcu'})

herd

Unnamed: 0,inst_id,year,ncses_inst_id,unitid,hbcu,medical_school_flag,hhe_flag,academic_institution,hdg_code,control,...,questionnaire_no,question,row,column,data,status,othinfo,othinfo_s,standardized_agency_names,date_key
0,166,2011,U0626001,111966,False,True,True,True,2,Private,...,01.a,Source,Federal government,,20679000.0,,,,,2011-06-30
1,166,2011,U0626001,111966,False,True,True,True,2,Private,...,01.e,Source,Institution funds,,1471000.0,,,,,2011-06-30
2,166,2011,U0626001,111966,False,True,True,True,2,Private,...,01.g,Source,Total,,22150000.0,,,,,2011-06-30
3,166,2011,U0626001,111966,False,True,True,True,2,Private,...,03,Externally financed,"Grants, reimbursements, and other agreements",,20679000.0,,,,,2011-06-30
4,166,2011,U0626001,111966,False,True,True,True,2,Private,...,03,Externally financed,Total,,20679000.0,,,,,2011-06-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263972,353086,2011,XXXXXXXX,-1,False,True,False,True,1,Private,...,14H06,Capitalized equipment expenditures by field an...,"Social sciences, all",Total,40000.0,,,,,2011-06-30
263973,353086,2011,XXXXXXXX,-1,False,True,False,True,1,Private,...,14K,Capitalized equipment expenditures by field an...,All,Federal,18273000.0,,,,,2011-06-30
263974,353086,2011,XXXXXXXX,-1,False,True,False,True,1,Private,...,14K,Capitalized equipment expenditures by field an...,All,Nonfederal,1636000.0,,,,,2011-06-30
263975,353086,2011,XXXXXXXX,-1,False,True,False,True,1,Private,...,14K,Capitalized equipment expenditures by field an...,All,Total,19909000.0,,,,,2011-06-30


In [4]:
herd[['questionnaire_no', 'question']].groupby(by=['questionnaire_no', 'question']).count()

questionnaire_no,question
01.a,Source
01.b,Source
01.c,Source
01.d,Source
01.e,Source
...,...
14J09,Capitalized equipment expenditures by field and source
14K,Capitalized equipment expenditures by field and source
15,Personnel
NA_01,ARRA funds
