Retrieve and unzip the data files, then append each award's data to a list. This run takes around 1 hour to finish because of the large file size, but it only has to be done once if the data list is saved to a pickle for future use.

In [None]:
import urllib.request
import zipfile
import xmlschema
import os
import pickle
import numpy as np
import pandas as pd

# Schema used to decode each award XML file into a nested dict.
nsf_schema = xmlschema.XMLSchema('Award.xsd')
years = list(range(2021, 1999, -1))  # 2021 down to 2000, newest first
nsf_data = []

for year in years:
    url = 'https://www.nsf.gov/awardsearch/download?DownloadFileName={}&All=true'.format(year)
    zip_name = '{}.zip'.format(year)
    folder = '{}/'.format(year)

    # Download the archive for this year and unpack it into its own folder.
    urllib.request.urlretrieve(url, zip_name)

    with zipfile.ZipFile(zip_name, 'r') as zip_ref:
        zip_ref.extractall(folder)

    skipped = 0
    for file in os.listdir(folder):
        filename = os.path.join(folder, file)
        try:
            award = nsf_schema.to_dict(filename)
        except Exception:
            # Best effort: a file that cannot be decoded is left out of the
            # data set. Catching Exception (rather than a bare except) keeps
            # Ctrl-C / kernel interrupts working during this long run.
            skipped += 1
            continue
        nsf_data.append(award)

    # Report dropped files so missing awards do not go unnoticed.
    if skipped:
        print('{}: skipped {} file(s) that could not be decoded'.format(year, skipped))

In [None]:
# Persist the parsed award list to disk so it can be reloaded in later sessions.
with open("nsf_data.pickle", "wb") as pickle_out:
    # A negative protocol number and HIGHEST_PROTOCOL select the same protocol.
    pickle.dump(nsf_data, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
# Build a table with whatever features you're interested in from the award information.
# Each output column maps to the chain of keys/indexes that leads to its value
# below award['Award']; add or remove entries here to change the columns.
_AWARD_FIELDS = {
    'AwardID': ('AwardID',),
    'AwardTitle': ('AwardTitle',),
    'AwardEffectiveDate': ('AwardEffectiveDate',),
    'AwardExpirationDate': ('AwardExpirationDate',),
    'AwardInstrument': ('AwardInstrument', 'Value'),
    'MinAmdLetterDate': ('MinAmdLetterDate',),
    'MaxAmdLetterDate': ('MaxAmdLetterDate',),
    'TRAN_TYPE': ('TRAN_TYPE',),
    'AwardAmount': ('AwardAmount',),
    'AwardTotalIntnAmount': ('AwardTotalIntnAmount',),
    'Directorate': ('Organization', 'Directorate', 'LongName'),
    'Division': ('Organization', 'Division', 'LongName'),
    'Institution': ('Institution', 0, 'Name'),  # first listed institution only
    'ProgramElement': ('ProgramElement',),
    'ProgramReference': ('ProgramReference',),
}


def _extract_award_field(award, path):
    """Return the value found by following *path* below ``award['Award']``.

    In some cases the information is missing (absent key, empty list, or a
    non-container value part-way down the path); NaN is returned instead so
    every column ends up with one entry per award.
    """
    node = award
    try:
        for step in ('Award',) + path:
            node = node[step]
    except (KeyError, IndexError, TypeError):
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    return node


award_dict = {column: [] for column in _AWARD_FIELDS}

for award in nsf_data:
    for column, path in _AWARD_FIELDS.items():
        award_dict[column].append(_extract_award_field(award, path))

nsf_df = pd.DataFrame(award_dict)

For optimal analysis, the data needs some corrections, such as fixing inconsistent Directorate names.

In [None]:
# Label variants to unify: each (old, new) pair rewrites every cell equal to
# `old` into `new`. Pairs are applied one after another, in the order listed.
label_fixes = [
    ('Directorate for Social, Behavioral & Economic Sciences', 'Direct For Social, Behav & Economic Scie'),
    ('Directorate for Education & Human Resources', 'Direct For Education and Human Resources'),
    ('Office Of Information & Resource Mgmt', 'Office of Information & Resource Management'),
    ('Division Of Integrative Organismal Sys', 'Division Of Integrative Organismal Systems'),
    ('Directorate for Geosciences', 'Directorate For Geosciences'),
]

for old_label, new_label in label_fixes:
    nsf_df = nsf_df.replace(old_label, new_label)

The date columns can be converted to datetime for time-based analysis.

In [None]:
# Convert each date column to pandas datetime, one column at a time.
date_columns = (
    'AwardEffectiveDate',
    'AwardExpirationDate',
    'MinAmdLetterDate',
    'MaxAmdLetterDate',
)

for date_column in date_columns:
    nsf_df[date_column] = pd.to_datetime(nsf_df[date_column])

In [None]:
# Persist the corrected dataframe to disk so it can be reloaded in later sessions.
with open("nsf_df.pickle", "wb") as pickle_out:
    # A negative protocol number and HIGHEST_PROTOCOL select the same protocol.
    pickle.dump(nsf_df, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)