In [1]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

import numpy as np

import pandas as pd
from pandas import DataFrame, Series

def get_filename(file_list):
    """Choose which archive member to read.

    Returns the first entry of ``file_list`` whose name contains ``"_rv"``.
    When no entry contains that marker, the first entry of the list is
    returned instead.

    Raises:
        IndexError: if ``file_list`` is empty (there is no first entry).
    """
    revised = [name for name in file_list if "_rv" in name]
    # Looked up unconditionally so an empty list always fails the same way.
    fallback = file_list[0]
    return revised[0] if revised else fallback

def net_load_info(url):
    """Return the member names of the zip archive found at ``url``.

    The archive is downloaded fully into memory, opened as a zip file, and
    its ``namelist()`` is returned as a list of strings.

    Args:
        url: address passed to ``urlopen``; must point at a zip archive.

    Returns:
        list of member names contained in the archive.
    """
    # Context managers make sure the response and the archive are closed
    # even if reading or parsing fails part-way through.
    with urlopen(url) as resp:
        payload = BytesIO(resp.read())
    with ZipFile(payload) as archive:
        return archive.namelist()

def net_load_data(url):
    """Download the zip archive at ``url`` and load one CSV member from it.

    The member to read is chosen by ``get_filename`` from the archive's
    name list. The CSV is decoded as ISO-8859-1 and parsed with
    ``low_memory=False``.

    Args:
        url: address passed to ``urlopen``; must point at a zip archive.

    Returns:
        pandas DataFrame with the parsed contents of the chosen member.
    """
    # Response, archive and member handle are all closed on exit, including
    # when parsing raises.
    with urlopen(url) as resp:
        payload = BytesIO(resp.read())
    with ZipFile(payload) as archive:
        file_name = get_filename(archive.namelist())
        with archive.open(file_name) as data_file:
            # read_csv consumes the whole handle here, so closing it
            # afterwards is safe.
            answer = pd.read_csv(data_file,
                                 low_memory = False,
                                 encoding = "iso-8859-1")

    return answer

def fix_cols(dat):
    """Lower-case every column label and convert every column to numeric.

    The frame is modified in place and also returned. Values that cannot be
    parsed as numbers become NaN (``errors="coerce"``); columns whose values
    allow it are downcast to the smallest integer dtype
    (``downcast="integer"``).

    Args:
        dat: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object, with lower-case labels and numeric columns.
    """
    # Rename first: looking a column up under its lower-cased label before the
    # rename raises KeyError for any label that contains upper-case letters.
    dat.columns = [col.lower() for col in dat.columns]
    # Iterate over a snapshot of the labels rather than the frame being
    # assigned into.
    for col in list(dat.columns):
        dat[col] = pd.to_numeric(dat[col], errors = "coerce", downcast = "integer")
    return dat

def read_file(year, keepers):
    """Download one year's file and return the requested numeric columns.

    Years before 2014 are fetched as ``ic<year>.zip``; 2014 and later as
    ``adm<year>.zip``. Column labels are lower-cased, every column is passed
    through ``fix_cols``, and only the columns listed in ``keepers`` are kept.

    Args:
        year: integer year used to build the archive name.
        keepers: list of lower-case column labels to keep.

    Returns:
        A new DataFrame holding only the ``keepers`` columns.

    Raises:
        KeyError: if a label in ``keepers`` is not present in the file.
    """
    # ``year`` is a scalar, so a conditional expression yields the name as a
    # plain str; np.where would wrap it in a 0-d array first.
    if year < 2014:
        file_spec = "ic" + str(year) + ".zip"
    else:
        file_spec = "adm" + str(year) + ".zip"

    answer = net_load_data("https://nces.ed.gov/ipeds/datacenter/data/" + file_spec)
    answer.columns = [colname.lower() for colname in answer.columns]
    answer = fix_cols(answer)
    # Return an independent copy so callers can assign into the result
    # without pandas warning about writing to a slice of the full frame.
    return answer[keepers].copy()

# Columns kept from every yearly file: the unitid key followed by the
# applcn*, admssn* and enrl* value columns.
cols = [
    "unitid",
    "applcn", "applcnm", "applcnw",
    "admssn", "admssnm", "admssnw",
    "enrlt", "enrlm", "enrlw",
]

# Years to pull. range() excludes its end value, so this is 2007 and 2008.
years = range(2007, 2009)

In [2]:
# Print every member name found in each year's archive.
for year in years:
    # year is a plain int, so a conditional expression builds the archive
    # name directly as a str (np.where would return a 0-d array instead).
    file_spec = "ic" + str(year) + ".zip" if year < 2014 else "adm" + str(year) + ".zip"
    contents = net_load_info("https://nces.ed.gov/ipeds/datacenter/data/" + file_spec)
    for item in contents:
        print(item)

ic2007.csv
ic2008.csv
ic2008_rv.csv


In [3]:
# Print which archive member get_filename selects for each year.
for year in years:
    # year is a plain int, so a conditional expression builds the archive
    # name directly as a str (np.where would return a 0-d array instead).
    file_spec = "ic" + str(year) + ".zip" if year < 2014 else "adm" + str(year) + ".zip"
    contents = net_load_info("https://nces.ed.gov/ipeds/datacenter/data/" + file_spec)
    file_name = get_filename(contents)
    print(file_name)

ic2007.csv
ic2008_rv.csv


In [4]:
# Load every year once and concatenate in a single call. Growing the frame
# with pd.concat inside the loop re-copies all earlier rows on each pass and
# starts from an empty frame. read_file already lower-cases, converts and
# subsets the columns, so no second fix_cols pass is needed here.
frames = [read_file(year, cols) for year in years]

# sort = True plus an explicit column sort keeps the alphabetical column
# order regardless of how concat treats already-aligned columns.
# NOTE(review): rows from different years are stacked without a year column
# and each file's own row index is kept, so index labels repeat across years.
df = pd.concat(frames,
               sort = True).sort_index(axis = 1)

# Missing values (including entries coerced to NaN by to_numeric) become 0.
df = df.fillna(0)

df.head(20)

Unnamed: 0,admssn,admssnm,admssnw,applcn,applcnm,applcnw,enrlm,enrlt,enrlw,unitid
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100636
1,2059.0,799.0,1260.0,6470.0,2507.0,3963.0,408.0,885.0,477.0,100654
2,3705.0,1489.0,2216.0,4221.0,1674.0,2547.0,612.0,1531.0,919.0,100663
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100690
4,1628.0,845.0,783.0,1850.0,921.0,929.0,424.0,800.0,376.0,100706
5,5172.0,3214.0,1958.0,12436.0,7353.0,5083.0,589.0,1366.0,777.0,100724
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100733
7,9140.0,3908.0,5232.0,14313.0,6185.0,8128.0,2042.0,4538.0,2496.0,100751
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100760
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100812


In [5]:
df.dtypes

admssn     float64
admssnm    float64
admssnw    float64
applcn     float64
applcnm    float64
applcnw    float64
enrlm      float64
enrlt      float64
enrlw      float64
unitid       int32
dtype: object