In [16]:
%config InlineBackend.figure_format ='retina'
%matplotlib inline

import user_functions

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

# Set options
sns.set_style('darkgrid')
pd.set_option("display.max_rows", 10)

# set values of constants
start_year = 2007
end_year = 2017
years = [i for i in np.arange(start_year, end_year + 1)]

# local functions
def read_file(year, types = "object"):
    file_spec = "hd{}.zip".format(year)
    url = "https://nces.ed.gov/ipeds/datacenter/data/{}".format(file_spec)
    answer = user_functions.net_load_data(url, types)
    answer = user_functions.fix_cols(answer)
    answer["institution_key"] = answer["unitid"].astype(int)
    answer["date_key"] = (year * 10000) + 1015
    return(answer)

In [23]:
df = DataFrame()

types = {"fips": np.float32,
         "longitud": np.float32,
         "latitude": np.float32}

for year in years:
    temp = read_file(year, "object")
    df = pd.concat([df, temp],
                   sort = True)
    temp = None

print("{:,} rows and {} columns read".format(df.shape[0], df.shape[1]))

82,147 rows and 84 columns read


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82147 entries, 0 to 7152
Data columns (total 84 columns):
act                82147 non-null object
addr               82076 non-null object
adminurl           77743 non-null object
applurl            77555 non-null object
athurl             30008 non-null object
c15basic           22321 non-null object
c15enprf           22321 non-null object
c15ipgrd           22321 non-null object
c15ipug            22321 non-null object
c15szset           22321 non-null object
c15ugprf           22321 non-null object
carnegie           82147 non-null object
cbsa               82147 non-null object
cbsatype           82147 non-null object
ccbasic            82147 non-null object
ccenrprf           59826 non-null object
ccipgrad           59826 non-null object
ccipug             59826 non-null object
ccsizset           59826 non-null object
ccugprof           59826 non-null object
chfnm              82049 non-null object
chftitle           81985 non-nu

In [26]:
df.longitud = pd.to_numeric(df.longitud.str.replace("[^0-9\.\-]|^\.$", ""), errors = "coerce")
df.latitude = pd.to_numeric(df.latitude.str.replace("[^0-9\.\-]|^\.$", ""), errors = "coerce")

df

AttributeError: Can only use .str accessor with string values, which use np.object_ dtype in pandas

In [28]:
names = {"longitud":"longitude",
         "fips": "state_fips",
         "countycd": "county_fips",
         "instnm": "institution_name",
         "stabbr": "state"}

# key columns
indices = ["institution_key", "date_key"]

# value columns
cols = ["institution_name", "control", "state", "state_fips", "county_fips", "cbsa", "cbsatype",
        "longitude", "latitude", "locale"]

df.rename(columns = names)[indices + cols]

Unnamed: 0,institution_key,date_key,institution_name,control,state,state_fips,county_fips,cbsa,cbsatype,longitude,latitude,locale
0,100636,20071015,Community College of the Air Force,1,AL,1,,33860,1,,,12
1,100654,20071015,Alabama A & M University,1,AL,1,,26620,1,,,12
2,100663,20071015,University of Alabama at Birmingham,1,AL,1,,13820,1,,,12
3,100690,20071015,Amridge University,2,AL,1,,33860,1,,,12
4,100706,20071015,University of Alabama in Huntsville,1,AL,1,,26620,1,,,12
...,...,...,...,...,...,...,...,...,...,...,...,...
7148,491385,20171015,Wright Institute (The) -,-3,CA,6,6001,41860,1,-122.291518,37.860543,12
7149,491394,20171015,Edge Tech Academy,3,TX,48,48439,19100,1,-97.196111,32.834122,21
7150,491419,20171015,Health Career Institute- Lauderdale Lakes,-3,FL,12,12011,33100,1,-80.216743,26.163240,21
7151,491428,20171015,National University College - Mayaguez Campus,-3,PR,72,72097,32420,1,-67.149938,18.184231,13
