In [1]:
import pandas as pd

# Load the raw scoring sheet; its header rows are untangled in later cells.
df = pd.read_excel(
    '../data/states-score-data.xlsx',
    sheet_name="2. Data",
)

In [2]:
# Keep only the column labels that are real group headers, i.e. every label
# that does not contain the placeholder text 'Unnamed'.
categories = (label for label in df.columns if 'Unnamed' not in label)
indicatorGroups = list(categories)
indicatorGroups

['Launching Future Ready Youth',
 'Navigating into Adulthood 1: Postsecondary outcomes',
 'Navigating into Adulthood 2: Young Adults Deserve a Brighter Future (19-27yrs, 2021)',
 'Social mobility 2021 (34-36yrs)',
 'OVERALL SCORE']

In [3]:
# Rows labelled 0 and 1 are header material, not observations: drop them and
# promote the row at position 1 to be the column labels.
data = df.drop([0, 1])
data.columns = df.iloc[1]

# Collapse every header mentioning "standardized" (any casing) onto one shared
# label so those columns can be handled together later.
newColNames = [
    "CR Score (Standardized)" if "standardized" in label.lower() else label
    for label in data.columns
]

data.columns = newColNames

In [4]:
# Maps the spreadsheet's verbose column headers to the short keys used in the
# exported JSON. It is applied with DataFrame.rename and is also consulted by
# roundToInteger, which treats any header containing "$" as a currency column.
# Both "RANK" and "Rank" map to the same short key "rank".
#
# NOTE(review): headers containing "standardized" are rewritten to
# "CR Score (Standardized)" before this map is applied, so the
# "CR Score 1 (average of 4 standardized scores)" entry looks unreachable and
# that column would be dropped together with "cr_score" - confirm this is intended.
colMap = {
    "50 States": "states",
    "RANK": "rank",
    "Rank": "rank",
    "Rank (based on CR score 1 and 2)": "rank_cr12",
    "Rank (based on CR score 3)": "rank_cr3",
    "Elementary/Middle School Counselor Ratio 2020-21": "ele_mid_counselor_ratio",
    "High School Counselor Ratio 2020-21": "high_counselor_ratio",
    "AP Test Performance of 3 or Higher (2020) (%)": "ap",
    "Fafsa Completion Rate (%) 2021-22": "fafsa",
    "HS Completion Rate (%) 2018-19 (Cohort based)": "hs_completion",
    "Post HS Placement - college (%) 2018-19 (Cohort based)": "post_hs_college",
    "Retention rate PT (%) 2019-20": "retention_pt",
    "Retention rate FT (%) 2019-20": "retention_ft",
    "PS completion (2 years) (%) 2019-20": "ps_completion_2y",
    "PS completion (4 years) (%) 2019-20": "ps_completion_4y",
    "Disconnected youth (%)": "disconnected",
    "Median hourly wage of Non-NEET youth($)": "non_neet_wage",
    "Non-NEET earning more than state's living wage (%)": "non_neet_living_wage",
    "Median hourly wage of all adult ($)": "adult_wage",
    "Adults working full time (1820h) (%)": "adult_full_time",
    "Full time adults earning more than MIT wage (%)": "adult_mit_wage",
    "CR Score (Standardized)": "cr_score",
    "CR Score (0~100 score)": "cr_score100",
    "CR Score 1 (average of 4 standardized scores)": "cr_score1",
    "CR Score 2 (0~100 score from CR score 1)": "cr_score2",
    "CR Score 3 (0~100 score by averaging four 0~100 scores)": "cr_score3",
}

# Maps each name found in the "states" column to a two-letter code. The lookup
# is an exact dictionary match, so keys must be spelled exactly as in the
# sheet (note "District of Columbia (DC)"); an unlisted name raises KeyError
# when the properties are assembled. "National" is mapped to "US".
stateAbbrMap = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia (DC)": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
    "National": "US"
}

In [5]:
# Scale every column whose header contains "%" by 100.
# Caution: this mutates `data`, so running the cell twice scales twice.
dataCols = list(data.columns)
for pctCol in (label for label in dataCols if "%" in label):
    data[pctCol] *= 100

In [6]:
# Switch to the short column keys, then discard the standardized score
# columns (all of which now carry the key "cr_score").
data = data.rename(columns=colMap).drop(columns="cr_score")

### Split data into different categories

In [7]:
# Slice `data` into one frame per indicator group by column position.
# Column 0 ("states") is repeated in every frame so rows stay identifiable.
# Each frame is an explicit copy: they are modified in place afterwards, and
# writing into a bare slice of `data` risks SettingWithCopyWarning and
# accidental write-through to `data` itself.
dfReadyYouth = data.iloc[:, 0:9].copy()
dfAdulthood1 = data.iloc[:, ([0] + list(range(9, 15)))].copy()
dfAdulthood2 = data.iloc[:, ([0] + list(range(15, 20)))].copy()
dfSocial = data.iloc[:, ([0] + list(range(20, 25)))].copy()
dfOverall = data.iloc[:, ([0] + list(range(25, 29)))].copy()


In [8]:
def roundToInteger(df, labelMap=None):
    """Round the indicator columns of ``df`` in place and return ``df``.

    Precision is chosen per column:

    * name contains "ratio" (case-insensitive): rounded to a whole number and
      stored as an integer;
    * short name whose original header in ``labelMap`` contains "$": rounded
      to 2 decimals;
    * any other column except 'states', 'rank', 'rank_cr12' and 'rank_cr3':
      rounded to 1 decimal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are the short string keys. It is modified
        in place.
    labelMap : dict, optional
        Mapping of original header -> short key, used to find the "$"
        columns. Defaults to the notebook-level ``colMap``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If a column that should be rounded holds values that cannot be
        converted to float.
    """
    if labelMap is None:
        labelMap = colMap

    # Loop-invariant lookups, computed once instead of once per column.
    dollarCols = {short for label, short in labelMap.items() if "$" in label}
    untouched = ('states', 'rank', 'rank_cr12', 'rank_cr3')

    for col in df.columns:
        pos = df.columns.get_loc(col)
        if "ratio" in col.lower():
            rounded = df[col].astype(float).round(0)
            if rounded.isna().any():
                # NaN cannot be cast to int, so convert only the present
                # values and keep the gaps as missing in an object column.
                rounded = pd.Series(
                    [v if pd.isna(v) else int(v) for v in rounded],
                    index=rounded.index,
                    dtype=object,
                )
            else:
                rounded = rounded.astype(int)
            df.iloc[:, pos] = rounded
        elif col in dollarCols:
            df.iloc[:, pos] = df[col].astype(float).round(2)
        elif col not in untouched:
            df.iloc[:, pos] = df[col].astype(float).round(1)

    return df

# Apply the rounding rules to every category frame, in the same order as before.
dfReadyYouth, dfAdulthood1, dfAdulthood2, dfSocial, dfOverall = [
    roundToInteger(frame)
    for frame in (dfReadyYouth, dfAdulthood1, dfAdulthood2, dfSocial, dfOverall)
]

In [9]:
# Preview the first rows of dfReadyYouth after rounding.
dfReadyYouth.head()

Unnamed: 0,states,rank,ele_mid_counselor_ratio,high_counselor_ratio,ap,fafsa,hs_completion,post_hs_college,cr_score100
2,Massachusetts,4,547,191,73.0,69.3,88.0,68.0,86.0
3,Connecticut,3,533,182,75.0,70.0,89.0,71.0,95.1
4,New Jersey,5,1048,296,75.0,72.9,91.0,68.0,80.2
5,New Hampshire,1,215,160,74.0,64.3,88.0,60.0,100.0
6,Delaware,6,660,183,65.0,70.3,89.0,68.0,79.7


In [10]:
def getProp(df):
    """Convert a category frame into a nested ``{state: {column: value}}`` dict.

    The state names are read from ``df`` itself rather than from a notebook
    global, and the frame is walked once instead of being re-filtered for
    every state and column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "states" column. Every column after the first one is
        exported as a property.

    Returns
    -------
    dict
        One entry per state, in row order. Missing values (as detected by
        ``pd.isna``) are stored as ``None``. If a state appears on several
        rows, the first row is used.
    """
    valueCols = list(df.columns[1:])
    # Take the values via column-wise tolist(), as the per-cell lookup did,
    # so the exported element types stay the same.
    colValues = [df[col].tolist() for col in valueCols]

    prop = {}
    for rowPos, state in enumerate(df["states"].tolist()):
        if state in prop:
            # Keep the first row for a repeated state name.
            continue
        entry = {}
        for col, values in zip(valueCols, colValues):
            val = values[rowPos]
            entry[col] = None if pd.isna(val) else val
        prop[state] = entry

    return prop

# Build one {state: {indicator: value}} mapping per category frame.
propReadyYouth, propAdultHood1, propAdultHood2, propSocial, propOverall = [
    getProp(frame)
    for frame in (dfReadyYouth, dfAdulthood1, dfAdulthood2, dfSocial, dfOverall)
]

In [11]:
# Display the nested {state: {indicator: value}} mapping built from dfReadyYouth.
propReadyYouth

{'Massachusetts': {'rank': 4,
  'ele_mid_counselor_ratio': 547,
  'high_counselor_ratio': 191,
  'ap': 73.0,
  'fafsa': 69.3,
  'hs_completion': 88.0,
  'post_hs_college': 68.0,
  'cr_score100': 86.0},
 'Connecticut': {'rank': 3,
  'ele_mid_counselor_ratio': 533,
  'high_counselor_ratio': 182,
  'ap': 75.0,
  'fafsa': 70.0,
  'hs_completion': 89.0,
  'post_hs_college': 71.0,
  'cr_score100': 95.1},
 'New Jersey': {'rank': 5,
  'ele_mid_counselor_ratio': 1048,
  'high_counselor_ratio': 296,
  'ap': 75.0,
  'fafsa': 72.9,
  'hs_completion': 91.0,
  'post_hs_college': 68.0,
  'cr_score100': 80.2},
 'New Hampshire': {'rank': 1,
  'ele_mid_counselor_ratio': 215,
  'high_counselor_ratio': 160,
  'ap': 74.0,
  'fafsa': 64.3,
  'hs_completion': 88.0,
  'post_hs_college': 60.0,
  'cr_score100': 100.0},
 'Delaware': {'rank': 6,
  'ele_mid_counselor_ratio': 660,
  'high_counselor_ratio': 183,
  'ap': 65.0,
  'fafsa': 70.3,
  'hs_completion': 89.0,
  'post_hs_college': 68.0,
  'cr_score100': 79.7}

In [12]:
# Merge the per-category mappings into a single record per state, keyed by the
# state's full name. A name missing from stateAbbrMap raises KeyError.
props = {
    stateName: {
        "name": stateName,
        "abbr": stateAbbrMap[stateName],
        "youth": propReadyYouth[stateName],
        "adulthood1": propAdultHood1[stateName],
        "adulthood2": propAdultHood2[stateName],
        "social": propSocial[stateName],
        "overall": propOverall[stateName],
    }
    for stateName in data["states"].tolist()
}


### Get States Polygon Coordinates

In [13]:
# import json

# statesCoors = {}
# file = open("../data/states-coors.json", "r")
# jsonData = json.load(file)

# for feat in jsonData["features"]:
#     statesCoors[feat["properties"]["NAME"]] = feat["geometry"]


### Convert Data to GeoJSON

In [14]:
import json

# Wrap each state's record in a GeoJSON Feature. No coordinates are attached
# here, so "geometry" is explicitly null: RFC 7946 requires every Feature to
# carry a "geometry" member and uses null for unlocated features.
geoJson = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "geometry": None,
            "properties": props[state],
        }
        for state in data["states"].tolist()
    ],
}


In [15]:
# Serialize the feature collection to disk alongside the source data.
with open("../data/states-careers-score.json", "w") as jsonFile:
    jsonFile.write(json.dumps(geoJson))