In [54]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

CDF = pd.read_csv("censusDataAGE15-24.csv")
CDF.rename(columns = {"countyFIPS":"fips"}, inplace = True)
CDF['fips'] = CDF.apply(lambda row: str(row.fips).zfill(5), axis=1)
CDF['fips'] = CDF['fips'].astype(int)

ICT = pd.read_csv("IC_PY_2015-2021_data.csv")
ICTU = ICT[ICT["year"]==2015]
ICTIMP = ICTU[["unitid","chg1py3"]]
ICTIMP = ICTIMP[ICTIMP["chg1py3"] != "."]

averagetuition = (ICTIMP["chg1py3"].astype(int)).mean()


CDF['WR'] = CDF['White']/CDF['TOT_POP']
CDF['BAR'] = CDF['Black or African American']/CDF['TOT_POP']
CDF['HR'] = CDF['Hispanic']/CDF['TOT_POP']
CDF['AIANR'] = CDF['American Indian or Alaska Native']/CDF['TOT_POP']
CDF['AR'] = CDF['Asian']/CDF['TOT_POP']
CDF['NHPIR'] = CDF['Native Hawaiian or Pacific Islander']/CDF['TOT_POP']
CDF['TOMR'] = CDF['Two or More Races']/CDF['TOT_POP']

ratios1 = ['WR', 'BAR', 'HR', 'AIANR', 'AR', 'NHPIR', 'TOMR']

FER = pd.read_csv("EFA_2015-2020_data.csv")  ## Fall enrollment 

is_totalEnrollment =  FER['line']=="Total enrollment"
FERU = FER[is_totalEnrollment].copy()

is_2015 = FERU['year']==2015
FE = FERU[is_2015] ## Unique total fall enrollment statistics for 2015 

FIPS = pd.read_csv("FIPS.csv", encoding='latin-1')
FIPS.drop(['county','state'], inplace=True, axis=1)
FIPS.rename(columns = {"county_name":"county", "state_name":"state"}, inplace = True)
FIPS = FIPS[["fips", "county", "state"]]
FIPS["fips"] = FIPS.apply(lambda row: str(row.fips).zfill(5), axis=1)
FIPS.head()

IC = pd.read_csv("HD_2015-2021_data.csv")
IC = IC.rename(columns={'fips': 'state', 'countynm': 'county'})
IC = IC[["unitid", "year", "county", "state", "countycd", "longitud", "latitude"]]
IC.drop_duplicates(subset="unitid", keep='first', inplace=True)  # drop older years
IC.drop('year', inplace=True, axis=1)
FIPS.head()

UNITIDFIPS = pd.merge(FIPS, IC, on=["county", "state"])

FE = FE.merge(UNITIDFIPS, on='unitid')

cols = ['efaiant', 'efasiat', 'efbkaat', 'efhispt', 'efnhpit','efwhitt','ef2mort','efunknt','efnralt']

FE['utot_localpop'] = FE['eftotlt'] - FE['efnralt'] - FE['efunknt']
FE['UWR'] = FE['efwhitt']/FE['utot_localpop']
FE['UBAR'] = FE['efbkaat']/FE['utot_localpop']
FE['UHR'] = FE['efhispt']/FE['utot_localpop']
FE['UAIANR'] = FE['efaiant']/FE['utot_localpop']
FE['UAR'] = FE['efasiat']/FE['utot_localpop']
FE['UNHPIR'] = FE['efnhpit']/FE['utot_localpop']
FE['UTOMR'] = FE['ef2mort']/FE['utot_localpop']

ratios2 = ['UWR', 'UBAR', 'UHR', 'UAIANR', 'UAR', 'UNHPIR', 'UTOMR']
FEL = FE[ratios2].sum(axis=1)

FE = FE[['unitid', 'fips'] + ratios2]
FE['fips'] = FE['fips'].astype(int)

def diversity(values): 
    sum = 0 
    for x in values: 
        proportion = x
        if (proportion > 0): 
            sum = sum + (-1)*(proportion)*np.log(proportion)
    return sum 

def KLdivergence(values): 
    sum = 0 
    values1 = values[:7] ##County Ratios
    values2 = values[7:14] ##University ratios
    for x in range(7):
        sum = sum - values2[x]*np.log(values2[x]/values1[x])
    return sum 

def chisqp(values): 
    f_exp = values[:7]  ##Expected
    f_obs = values[7:14]  ##Oberserved
    (s,p) = stats.chisquare(f_obs, f_exp)
    return p
    

CDF['County Shannon Diversity Ethnicity'] = CDF[ratios1].apply(diversity, axis =1)
CDF['County True Diversity Ethnicity'] = CDF['County Shannon Diversity Ethnicity'].apply(np.exp, axis=1)

FE['University Shannon Diversity Ethnicity'] = FE[ratios2].apply(diversity, axis =1)
FE['University True Diversity Ethnicity'] = FE['University Shannon Diversity Ethnicity'].apply(np.exp, axis =1)


FET = FE.merge(CDF, on='fips')
FET['True Diversity Difference'] = FET['University True Diversity Ethnicity'] - FET['County True Diversity Ethnicity']
FET['KLDivergence'] = FET[ratios1 + ratios2].apply(KLdivergence, axis=1)
FET['CHSQGOF'] = FET[ratios1 + ratios2].apply(chisqp, axis=1)
FETF = FET[['unitid'] + ratios1 + ratios2 + ['County Shannon Diversity Ethnicity', 'County True Diversity Ethnicity', 'University Shannon Diversity Ethnicity', 'University True Diversity Ethnicity', 'KLDivergence', 'True Diversity Difference', 'CHSQGOF']]

SIG = FETF[FETF['CHSQGOF']<0.05]
SIGT = SIG.merge(ICTIMP, on='unitid')
averagesigt = (SIGT["chg1py3"].astype(int)).mean()
SIG.to_csv("SignificantINST.csv", encoding="utf-8")

STATE = pd.read_csv("F_F1A_1415-1920_data_New.csv")
PRVNP = pd.read_csv("F_F2_1415-1920_data_New.csv")
PRVFP = pd.read_csv("F_F3_1415-1920_data_New.csv")

financecols = ['unitid', 'NOI', '%Tuition', "%Pell", "Presence_Endowment"]

PRVNP = PRVNP[PRVNP['year']==1415]
PRVFP = PRVFP[PRVFP['year']==1415]
STATE = STATE[STATE['year']==1415]
PRVNP = PRVNP[financecols]
STATE = STATE[financecols]
PRVFP = PRVFP[['unitid', 'NOI', '%Tuition', "%Pell"]]
FINANCE = pd.concat([PRVNP, PRVFP, STATE])
FINANCE = FINANCE[FINANCE['%Pell'] != '#DIV/0!']

averagePell = (FINANCE["%Pell"].astype(float)).mean()



FETFENHANCED = FETF.merge(FINANCE, on = 'unitid')
SIGN = FETFENHANCED[FETFENHANCED['CHSQGOF']<0.05]
averagePellSig = (FETFENHANCED["%Pell"].astype(float)).mean()
FETFENHANCED.to_csv("DiversityDATA.csv", encoding="utf-8") 








  IC = pd.read_csv("HD_2015-2021_data.csv")
  sum = sum - values2[x]*np.log(values2[x]/values1[x])
  sum = sum - values2[x]*np.log(values2[x]/values1[x])
  sum = sum - values2[x]*np.log(values2[x]/values1[x])
  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp
  STATE = pd.read_csv("F_F1A_1415-1920_data_New.csv")
  PRVNP = pd.read_csv("F_F2_1415-1920_data_New.csv")


ValueError: could not convert string to float: '#DIV/0!'