In [1]:
import pandas as pd, re
import numpy as np
import getschools as gs

In [33]:
def get_df(group):
    """Load one cleaned college entrance/persistence CSV and summarise it.

    Parameters
    ----------
    group : str
        Suffix of the file to read; the path is
        ``../clean/csv/collegeEntrancePersistence-<group>.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV rows, passed through ``gs.add_drg``, with an extra
        ``four_y_percent`` column. Rows whose ``four_y_percent`` is null
        (no usable persistence value in any year) are dropped. The result
        is an independent copy, so callers may add columns to it safely.

    Notes
    -----
    Despite its name, ``four_y_percent`` is the median of the five yearly
    ``*_persistence_pct`` columns (2009-10 through 2013-14), ignoring
    missing years.
    """
    years = ["2009_10", "2010_11", "2011_12", "2012_13", "2013_14"]
    entrance_cols = [year + "_entrance_pct" for year in years]
    persistence_cols = [year + "_persistence_pct" for year in years]

    ret = pd.read_csv("../clean/csv/collegeEntrancePersistence-" + group + ".csv")

    # Non-numeric cells become NaN (not None) so they can be skipped
    # by the NaN-aware median below.
    for col in entrance_cols + persistence_cols:
        ret[col] = pd.to_numeric(ret[col], errors="coerce")

    # Codes are read as numbers; restore the 7-character zero-padded form.
    ret["school_code"] = ret["school_code"].astype(str).str.zfill(7)
    ret["district_code"] = ret["district_code"].astype(str).str.zfill(7)

    # NOTE(review): add_drg is defined in the getschools module, not visible
    # here; presumably it attaches the "drg" column used downstream -- confirm.
    ret = gs.add_drg(ret)

    # Row-wise median over the yearly persistence rates. skipna ignores missing
    # years; a row with no valid year yields NaN and is filtered out below.
    ret["four_y_percent"] = ret[persistence_cols].median(axis=1, skipna=True)

    # .copy() detaches the result from the filtered view so that later column
    # assignments by callers do not raise SettingWithCopyWarning.
    return ret[ret["four_y_percent"].notnull()].copy()

# Display the summarised frame for the "all" group as this cell's output.
get_df("all")

Unnamed: 0,district,district_code,school,school_code,2009_10_entrance_pct,2009_10_persistence_pct,2010_11_entrance_pct,2010_11_persistence_pct,2011_12_entrance_pct,2011_12_persistence_pct,2012_13_entrance_pct,2012_13_persistence_pct,2013_14_entrance_pct,2013_14_persistence_pct,group,drg,four_y_percent
185,Weston School District,1570011,Weston High School,1576111,88.8,97.8,87.5,97.0,86.2,93.2,90.1,97.3,90.9,96.9,all,A,97.00
30,Darien School District,0350011,Darien High School,0356111,83.2,94.3,81.6,96.1,86.6,97.3,90.2,97.8,89.3,96.9,all,A,96.90
188,Wilton School District,1610011,Wilton High School,1616111,85.8,96.6,87.5,96.6,89.4,97.1,85.8,96.7,87.7,96.9,all,A,96.70
186,Westport School District,1580011,Staples High School,1586111,84.4,96.7,79.0,96.2,85.5,96.4,88.5,97.3,90.8,96.5,all,A,96.50
80,Madison School District,0760011,Daniel Hand High School,0766111,83.9,93.7,85.7,96.2,88.7,96.5,86.8,96.8,88.2,96.4,all,B,96.40
139,Ridgefield School District,1180011,Ridgefield High School,1186111,87.2,96.3,86.0,95.0,88.9,94.6,87.9,96.3,88.5,97.0,all,A,96.30
99,New Canaan School District,0900011,New Canaan High School,0906111,81.0,94.0,78.8,95.9,86.7,95.7,87.0,96.9,87.8,94.9,all,A,95.70
36,East Hartford School District,0430011,Connecticut IB Academy,0436311,82.1,95.7,81.8,85.2,100.0,100.0,94.6,100.0,95.6,95.3,all,H,95.70
51,Glastonbury School District,0540011,Glastonbury High School,0546111,84.6,94.9,83.9,96.2,88.1,95.2,88.0,96.2,85.6,95.0,all,B,95.20
204,Regional School District 08,2080012,RHAM High School,2086112,83.6,96.1,84.7,93.3,86.4,93.8,79.3,95.3,83.2,95.2,all,C,95.20


In [74]:
def schools_df():
    """Return the per-race school rows stacked with the all-students rows.

    The frame from ``get_df("all")`` gets a ``race_ethnicity`` column set to
    ``"All"`` so that it can be grouped alongside the rows from
    ``get_df("race")``.

    Returns
    -------
    pandas.DataFrame
        ``get_df("race")`` rows followed by the labelled ``get_df("all")``
        rows; the original row indexes of both frames are kept.
    """
    # assign() builds a new frame rather than writing into the filtered frame
    # that get_df returns, which avoids SettingWithCopyWarning.
    all_df = get_df("all").assign(race_ethnicity="All")
    race_df = get_df("race")

    return pd.concat([race_df, all_df])

# Write the stacked per-school frame to CSV without the index column.
# NOTE(review): the last cell of this notebook writes full_df() to this same
# path, so whichever cell runs last determines the file contents -- confirm
# which export is intended.
schools_df().to_csv("../clean/for_db/college_pers.csv",index=False)


In [72]:
def drg_report(df, groups=None):
    """Median ``four_y_percent`` per DRG, optionally split by further columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``drg``, ``four_y_percent`` and every column named in
        ``groups``. It is not modified.
    groups : str or iterable of str, optional
        Extra column name(s) to group by in addition to ``drg``.
        ``None`` (the default) or an empty iterable groups by ``drg`` only.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns ``drg``, the extra group columns and
        ``four_y_percent`` (the NaN-skipping median of that group).
    """
    # Normalise to a fresh list: avoids a shared mutable default and lets
    # callers pass a tuple or a single column name.
    if groups is None:
        extra_cols = []
    elif isinstance(groups, str):
        extra_cols = [groups]
    else:
        extra_cols = list(groups)

    # groupby/agg does not mutate df, so no defensive copy is required.
    # "median" is pandas' own NaN-skipping median.
    return (
        df.groupby(["drg"] + extra_cols)
        .agg({"four_y_percent": "median"})
        .reset_index()
    )

# Write the median four_y_percent per (drg, race_ethnicity) pair, computed over
# the stacked frame from schools_df(), to CSV without the index column.
drg_report(schools_df(),groups=["race_ethnicity"]).to_csv("../clean/for_db/drg_college_pers.csv",index=False)

In [31]:
# Display the per-(drg, race_ethnicity) medians for the "race" frame only
# (the "All" rows added by schools_df() are not included here).
drg_report(get_df("race"),groups=["race_ethnicity"])

Unnamed: 0,drg,race_ethnicity,four_y_percent
0,A,White,96.6
1,B,Asian,96.9
2,B,Black or African American,91.3
3,B,Hispanic or Latino,88.6
4,B,White,94.4
5,C,White,92.4
6,D,Black or African American,81.3
7,D,Hispanic or Latino,82.9
8,D,White,91.15
9,E,White,89.9


In [75]:
def state_report(df, groups=None):
    """Median ``four_y_percent`` across all rows of ``df``, split by ``groups``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``four_y_percent`` and every column named in ``groups``.
        It is not modified.
    groups : str or iterable of str, optional
        Column name(s) to group by.

    Returns
    -------
    pandas.DataFrame
        With non-empty ``groups``: one row per group holding the group columns
        and the NaN-skipping median ``four_y_percent``.
        With ``None`` or empty ``groups``: an unaggregated copy of ``df``
        (no overall median is computed in that case).
    """
    # Normalise to a fresh list: avoids a shared mutable default and accepts a
    # tuple or a single column name as well as a list.
    if groups is None:
        group_cols = []
    elif isinstance(groups, str):
        group_cols = [groups]
    else:
        group_cols = list(groups)

    if not group_cols:
        # Nothing to group by: hand back an independent copy of the input.
        return df.copy()

    return (
        df.groupby(group_cols)
        .agg({"four_y_percent": "median"})
        .reset_index()
    )
# Write the median four_y_percent per race_ethnicity value, computed over the
# stacked frame from schools_df(), to CSV without the index column.
state_report(schools_df(),groups=["race_ethnicity"]).to_csv("../clean/for_db/state_college_pers.csv",index=False)
#state_report(get_df("race"),groups=["race_ethnicity"])

In [63]:
def full_df():
    """Build the per-school table with DRG and state comparison rates.

    Each row of ``schools_df()`` (per-race rows plus the ``"All"`` rows) is
    paired with the median ``four_y_percent`` of its (``drg``,
    ``race_ethnicity``) group from ``drg_report`` and of its
    ``race_ethnicity`` group from ``state_report``.

    Returns
    -------
    pandas.DataFrame
        Columns ``district_code``, ``district``, ``school_code``, ``school``,
        ``race``, ``school_rate``, ``drg_rate``, ``state_rate``. Rows keep the
        order and index labels of ``schools_df()``. A rate is null when the
        row's group has no entry in the corresponding report.
    """
    df = schools_df()

    race_key = ["race_ethnicity"]
    drg_df = drg_report(df, groups=race_key).rename(
        columns={"four_y_percent": "drg_4y"})
    state_df = state_report(df, groups=race_key).rename(
        columns={"four_y_percent": "state_4y"})

    # The reports hold one row per group key, so a left merge attaches exactly
    # one comparison value per school row while keeping the row order and
    # count. Rows with no matching group get NaN.
    merged = df.merge(drg_df, on=["drg", "race_ethnicity"], how="left")
    merged = merged.merge(state_df, on="race_ethnicity", how="left")

    # merge() renumbers the rows; restore the original index labels. This also
    # fails loudly if the row count ever changed.
    merged.index = df.index

    out = merged[["district_code", "district", "school_code", "school",
                  "race_ethnicity", "four_y_percent", "drg_4y", "state_4y"]].copy()

    out.columns = ["district_code", "district", "school_code", "school",
                   "race", "school_rate", "drg_rate", "state_rate"]
    return out

# Display the combined school / DRG / state table as this cell's output.
full_df()#.sort_values(by="school")
    

Unnamed: 0,district_code,district,school_code,school,race,school_rate,drg_rate,state_rate
2,0020011,Ansonia School District,0026111,Ansonia High School,Black or African American,78.30,81.600,78.800
3,0020011,Ansonia School District,0026111,Ansonia High School,Hispanic or Latino,63.60,82.900,79.500
6,0020011,Ansonia School District,0026111,Ansonia High School,White,90.70,90.600,90.525
12,0040011,Avon School District,0046111,Avon High School,White,95.40,94.400,90.525
18,0070011,Berlin School District,0076111,Berlin High School,White,94.00,91.150,90.525
24,0090011,Bethel School District,0096111,Bethel High School,White,91.10,91.150,90.525
27,0110011,Bloomfield School District,0116111,Bloomfield High School,Black or African American,82.50,81.725,78.800
46,0120011,Bolton School District,0126111,Bolton High School,White,93.20,92.400,90.525
52,0140011,Branford School District,0146111,Branford High School,White,91.20,91.150,90.525
55,0150011,Bridgeport School District,0156111,Bassick High School,Black or African American,75.40,75.900,78.800


In [64]:
# Write the combined school / DRG / state table to CSV without the index column.
# NOTE(review): this is the same path the schools_df() export cell writes to,
# and the two frames have different columns. The execution counts show the
# cells were run out of order, so confirm which export should own this file.
full_df().to_csv("../clean/for_db/college_pers.csv",index=False)