In [1]:
# coding=utf-8
%config InlineBackend.figure_format ='retina'
%matplotlib inline

import numpy as np
import pandas as pd
from pandas import DataFrame

import pickle

from sqlalchemy import sql

from base import engine, Session, Base
from date_dimension import DateRow
from states import State
from nsf_herd_institution_data import NsfHerdInstitution
from nsf_herd_detail_data import NsfHerdDetail

pd.set_option('display.max_rows', 10)

# constants
year = 2018

state_fips = {'AK': 2, 'AL': 1, 'AR': 5, 'AS': 60, 'AZ': 4, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11,
              'DE': 10, 'FL': 12, 'GA': 13, 'GU': 66, 'HI': 15, 'IA': 19, 'ID': 16, 'IL': 17,
              'IN': 18, 'KS': 20, 'KY': 21, 'LA': 22, 'MA': 25, 'MD': 24, 'ME': 23, 'MI': 26,
              'MN': 27, 'MO': 29, 'MS': 28, 'MT': 30, 'NC': 37, 'ND': 38, 'NE': 31, 'NH': 33,
              'NJ': 34, 'NM': 35, 'NV': 32, 'NY': 36, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42,
              'PR': 72, 'RI': 44, 'SC': 45, 'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VA': 51,
              'VI': 78, 'VT': 50, 'WA': 53, 'WI': 55, 'WV': 54, 'WY': 56, 'all': -1}

big_10 = [145637, 147767, 151351, 153658, 163286, 170976, 171100,
          174066, 181464, 186380, 204796, 214777, 240444, 243780]

regents = [181464, 153658, 153603, 126614,
           126818, 145637, 155317, 174066,
           178396, 204796, 243780]

# local functions
def item_recode(col, codings):
    answer = col.map(codings, na_action = 'ignore')    
    return(answer)

In [2]:
try:
    spec = 'data/nsf_{}.pickle'.format(year)
    print('Reading data for fiscal year ending {}:\n\t{}... '.format(year, spec), end='', flush=True)
    with open(spec, 'rb') as f:
        herd = pickle.load(f)
except Exception as e:
    print('ERROR.\nFile not downloaded properly.\n\n{}\n'.format(str(e)))
else:
    print('DONE.')
    herd.info()

Reading data for fiscal year ending 2018:
	data/nsf_2018.pickle... DONE.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238522 entries, 0 to 238521
Data columns (total 23 columns):
inst_id                      238522 non-null int32
year                         238522 non-null int16
ncses_inst_id                235464 non-null object
ipeds_unitid                 224481 non-null float32
hbcu_flag                    238522 non-null bool
med_sch_flag                 238522 non-null object
hhe_flag                     238522 non-null bool
toi_code                     238522 non-null int8
hdg_code                     238522 non-null int8
toc_code                     238522 non-null int8
inst_name_long               238522 non-null object
inst_city                    238522 non-null object
inst_state_code              238522 non-null object
inst_zip                     238522 non-null object
questionnaire_no             238522 non-null object
question                     238522 non-null ob

In [3]:
# set date key
date_key = '{}-06-30'.format(year)

# modify data frame to apply needed fixes
herd['date_key'] = date_key

herd['med_sch_flag'] = herd.med_sch_flag.isin(['T','TRUE', 'True', 'true', 't', 'Y', 'Yes', '1'])

herd['toi_code'] = herd.toi_code == 1

toc = {1: 'Public',
       2: 'Private'}

herd['toc_code'] = item_recode(herd['toc_code'], toc)
herd['inst_state_code'] = item_recode(herd['inst_state_code'], state_fips)

herd = herd[herd.questionnaire_no.isin(['01.a', '01.b', '01.c', '01.d', '01.e', '01.f', '01.g', '04', 'NA_01'])]

herd.ipeds_unitid = herd.ipeds_unitid.fillna(-1).astype(int)
herd.ncses_inst_id = herd.ncses_inst_id.fillna('XXXXXXXX')

herd['data'] = herd.data.fillna(0) * 1000

herd

Unnamed: 0,inst_id,year,ncses_inst_id,ipeds_unitid,hbcu_flag,med_sch_flag,hhe_flag,toi_code,hdg_code,toc_code,...,questionnaire_no,question,row,column,data,status,othinfo,othinfo_s,standardized_agency_names,date_key
4,166,2018,U0626001,111966,False,True,True,True,2,Private,...,01.a,Source,Federal government,,10198000.0,,,,,2018-06-30
5,166,2018,U0626001,111966,False,True,True,True,2,Private,...,01.d,Source,Nonprofit organizations,,400000.0,,,,,2018-06-30
6,166,2018,U0626001,111966,False,True,True,True,2,Private,...,01.e,Source,Institution funds,,892000.0,,,,,2018-06-30
7,166,2018,U0626001,111966,False,True,True,True,2,Private,...,01.g,Source,Total,,11490000.0,,,,,2018-06-30
11,166,2018,U0626001,111966,False,True,True,True,2,Private,...,04,Medical school expenditures,Total,,1724000.0,,,,,2018-06-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237936,353086,2018,XXXXXXXX,-1,False,True,False,True,1,Private,...,01.d,Source,Nonprofit organizations,,29005000.0,,,,,2018-06-30
237937,353086,2018,XXXXXXXX,-1,False,True,False,True,1,Private,...,01.e,Source,Institution funds,,188139008.0,,,,,2018-06-30
237938,353086,2018,XXXXXXXX,-1,False,True,False,True,1,Private,...,01.f,Source,All other sources,,2665000.0,,,,,2018-06-30
237939,353086,2018,XXXXXXXX,-1,False,True,False,True,1,Private,...,01.g,Source,Total,,738620032.0,,,,,2018-06-30


In [4]:
keepers = ['inst_id',
           'date_key',
           'ncses_inst_id',
           'ipeds_unitid',
           'inst_name_long',
           'inst_state_code',
           'med_sch_flag',
           'questionnaire_no',
           'data']

institutions = herd[keepers]

# long to wide
institutions = institutions.pivot_table(index = ['inst_id',
                                                 'date_key',
                                                 'ncses_inst_id',
                                                 'ipeds_unitid',
                                                 'inst_name_long',
                                                 'inst_state_code',
                                                 'med_sch_flag'],
                                        columns = 'questionnaire_no',
                                        values = 'data',
                                        fill_value = 0).reset_index()

institutions

questionnaire_no,inst_id,date_key,ncses_inst_id,ipeds_unitid,inst_name_long,inst_state_code,med_sch_flag,01.a,01.b,01.c,01.d,01.e,01.f,01.g,04
0,166,2018-06-30,U0626001,111966,Charles R. Drew University of Medicine and Sci...,6,True,10198000,0,0,400000,892000,0,11490000,1724000
1,1002,2018-06-30,U0047001,100654,Alabama A&M University,1,False,24330000,773000,202000,0,5536000,2683000,33524000,0
2,1005,2018-06-30,U0049001,100724,Alabama State University,1,False,2434000,0,0,0,0,0,2434000,0
3,1009,2018-06-30,U0323002,100858,"Auburn University, Auburn",1,False,58488000,32070000,5489000,1390000,105786000,9702000,212924992,0
4,1020,2018-06-30,U1533001,101480,Jacksonville State University,1,False,2794000,0,17000,0,197000,0,3008000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,330008,2018-06-30,U3418001,137351,"University of South Florida, Tampa",12,True,263388992,31373000,23631000,37630000,218927008,6611000,581561024,209140000
642,330010,2018-06-30,R2936005,-1,"Southern University and A&M College, Agricultu...",22,False,3057000,2173000,0,0,66000,0,5296000,0
643,330050,2018-06-30,S0686018,190035,City University of New York system office,36,False,1977000,739000,35000,70000,340000,92000,3253000,0
644,330051,2018-06-30,S3338001,166665,University of Massachusetts central office,25,False,643000,3309000,266000,755000,318000,7000,5298000,0


In [8]:
# rename columns
institutions = institutions.rename(columns = {'inst_name_long': 'institution_name',
                                              'ipeds_unitid': 'unitid',
                                              'inst_state_code': 'state_fips',
                                              'toc_code': 'control',
                                              'toi_code': 'academic_institution',
                                              'med_sch_flag': 'medical_school_flag',
                                              '01.a': 'federal_government',
                                              '01.b': 'state_and_local_government',
                                              '01.c': 'business',
                                              '01.d': 'nonprofit_organizations',
                                              '01.e': 'institutional_funds',
                                              '01.f': 'other_sources',
                                              '01.g': 'total_rd_expenses',
                                              '04': 'medical_school_expenses',
                                              'NA_01': 'arra_funds'})

institutions

questionnaire_no,inst_id,date_key,ncses_inst_id,unitid,institution_name,state_fips,medical_school_flag,federal_government,state_and_local_government,business,nonprofit_organizations,institutional_funds,other_sources,total_rd_expenses,medical_school_expenses
0,166,2016-06-30,U0626001,111966.0,Charles R. Drew University of Medicine and Sci...,6,True,12089000.0,,,420000.0,851000.0,,13360000.0,2005000.0
1,1002,2016-06-30,U0047001,100654.0,Alabama A&M University,1,False,22478000.0,657000.0,171000.0,,4703000.0,2280000.0,30289000.0,
2,1005,2016-06-30,U0049001,100724.0,Alabama State University,1,False,2372000.0,,,,,,2372000.0,
3,1009,2016-06-30,U0323002,100858.0,"Auburn University, Auburn",1,False,54751000.0,37061000.0,5692000.0,1589000.0,46529000.0,6759000.0,152380992.0,
4,1020,2016-06-30,U1533001,101480.0,Jacksonville State University,1,False,3081000.0,,40000.0,,110000.0,,3231000.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603,330007,2016-06-30,U3450002,234076.0,"University of Virginia, Charlottesville",51,True,210980000.0,4366000.0,21470000.0,28978000.0,95103000.0,36561000.0,397457984.0,211348000.0
604,330008,2016-06-30,U3418001,137351.0,"University of South Florida, Tampa",12,True,228364992.0,27561000.0,20732000.0,24952000.0,199912000.0,4443000.0,505964992.0,187968992.0
605,330038,2016-06-30,U0793001,456542.0,"Commonwealth Medical College, The",42,True,396000.0,,,1000.0,1766000.0,45000.0,2208000.0,2208000.0
606,330050,2016-06-30,S0686018,190035.0,City University of New York system office,36,False,832000.0,,,131000.0,587000.0,105000.0,1655000.0,


In [9]:
institutions = institutions.fillna(sql.null())

institutions

questionnaire_no,inst_id,date_key,ncses_inst_id,unitid,institution_name,state_fips,medical_school_flag,federal_government,state_and_local_government,business,nonprofit_organizations,institutional_funds,other_sources,total_rd_expenses,medical_school_expenses
0,166,2016-06-30,U0626001,111966.0,Charles R. Drew University of Medicine and Sci...,6,True,12089000.0,0.0,0.0,420000.0,851000.0,0.0,13360000.0,2005000.0
1,1002,2016-06-30,U0047001,100654.0,Alabama A&M University,1,False,22478000.0,657000.0,171000.0,0.0,4703000.0,2280000.0,30289000.0,0.0
2,1005,2016-06-30,U0049001,100724.0,Alabama State University,1,False,2372000.0,0.0,0.0,0.0,0.0,0.0,2372000.0,0.0
3,1009,2016-06-30,U0323002,100858.0,"Auburn University, Auburn",1,False,54751000.0,37061000.0,5692000.0,1589000.0,46529000.0,6759000.0,152380992.0,0.0
4,1020,2016-06-30,U1533001,101480.0,Jacksonville State University,1,False,3081000.0,0.0,40000.0,0.0,110000.0,0.0,3231000.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603,330007,2016-06-30,U3450002,234076.0,"University of Virginia, Charlottesville",51,True,210980000.0,4366000.0,21470000.0,28978000.0,95103000.0,36561000.0,397457984.0,211348000.0
604,330008,2016-06-30,U3418001,137351.0,"University of South Florida, Tampa",12,True,228364992.0,27561000.0,20732000.0,24952000.0,199912000.0,4443000.0,505964992.0,187968992.0
605,330038,2016-06-30,U0793001,456542.0,"Commonwealth Medical College, The",42,True,396000.0,0.0,0.0,1000.0,1766000.0,45000.0,2208000.0,2208000.0
606,330050,2016-06-30,S0686018,190035.0,City University of New York system office,36,False,832000.0,0.0,0.0,131000.0,587000.0,105000.0,1655000.0,0.0


In [10]:
# insert data into dbo.survey_records
session = Session()

try:
    print('Attempting to insert {:,} rows for {} into {}.'.format(institutions.shape[0], year, NsfHerdInstitution.__tablename__))
    record_deletes = session.query(NsfHerdInstitution).filter(NsfHerdInstitution.date_key==date_key).delete(synchronize_session=False)
    session.bulk_insert_mappings(mapper = NsfHerdInstitution,
                                 mappings = institutions.to_dict(orient='records'),
                                 render_nulls = True)
except Exception as e:
    session.rollback()
    print(str(e))
    print('No data were altered due to error.')
else:
    session.commit()
    print('\n{:,} old records were deleted.'.format(record_deletes))
    print('{:,} new records were inserted.'.format(institutions.shape[0]))
finally:
    session.close()
    session = None

Attempting to insert 608 rows for 2016 into nsf_herd_institution_data.

0 old records were deleted.
608 new records were inserted.
