In [None]:
import pandas as pd

# Load the worksheet named "2. Data" from the state score workbook.
df = pd.read_excel("../data/states-score-data.xlsx", sheet_name="2. Data")

In [None]:
# Keep only the column labels that do not contain "Unnamed"
# (presumably the indicator group titles from the sheet's top header row -- confirm).
categories = (label for label in df.columns if "Unnamed" not in label)
indicatorGroups = [*categories]
indicatorGroups

In [None]:
# Drop the first two rows and use the contents of row 1 as the column labels
# (presumably row 1 is the real header row of the sheet -- confirm).
data = df.drop(index=[0, 1])
data.columns = df.iloc[1]

# Any label that merely contains the standardized-score text is collapsed to
# the bare text, so all of those columns end up sharing one label.
standardizedLabel = "CR Score (Standardized)"
newColNames = [
    standardizedLabel if standardizedLabel in label else label
    for label in data.columns
]

data.columns = newColNames

In [None]:
# Spreadsheet column header -> short snake_case column name.
# Passed to data.rename(columns=colMap); headers not listed here are left as-is.
colMap = {
    "50 States": "states",
    # Both spellings of the rank header map to the same short name.
    "RANK": "rank",
    "Rank": "rank",
    "Rank (based on CR score 1 and 2)": "rank_cr12",
    "Rank (based on CR score 3)": "rank_cr3",
    "Elementary/Middle School Counselor Ratio 2020-21": "ele_mid_counselor_ratio",
    "High School Counselor Ratio 2020-21": "high_counselor_ratio",
    "AP Test Performance of 3 or Higher (2020) (%)": "ap",
    "Fafsa Completion Rate (%) 2021-22": "fafsa",
    "HS Completion Rate (%) 2018-19 (Cohort based)": "hs_completion",
    "Post HS Placement - college (%) 2018-19 (Cohort based)": "post_hs_college",
    "Retention rate PT (%) 2019-20": "retention_pt",
    "Retention rate FT (%) 2019-20": "retention_ft",
    "PS completion (2 years) (%) 2019-20": "ps_completion_2y",
    "PS completion (4 years) (%) 2019-20": "ps_completion_4y",
    "Disconnected youth (%)": "disconnected",
    "Median hourly wage of Non-NEET youth($)": "non_neet_wage",
    "Non-NEET earning more than state's living wage (%)": "non_neet_living_wage",
    "Median hourly wage of all adult ($)": "adult_wage",
    "Adults working full time (1820h) (%)": "adult_full_time",
    "Full time adults earning more than MIT wage (%)": "adult_mit_wage",
    # Columns renamed to "cr_score" are dropped right after the rename.
    "CR Score (Standardized)": "cr_score",
    "CR Score (0~100 score)": "cr_score100",
    "CR Score 1 (average of 4 standardized scores)": "cr_score1",
    "CR Score 2 (0~100 score from CR score 1)": "cr_score2",
    "CR Score 3 (0~100 score by averaging four 0~100 scores)": "cr_score3",
}

# Full state/territory name -> two-letter postal abbreviation.
# Looked up with the values of the "states" column when building `props`,
# so the keys must match that column's spelling exactly.
stateAbbrMap = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    # Non-state entries: the federal district and the territories.
    "District of Columbia (DC)": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}

In [None]:
# Apply the short column names, then discard every column labelled "cr_score".
data = data.rename(columns=colMap).drop(columns="cr_score")

### Split data into different categories

In [None]:
# Positional column split into one frame per category. Every frame keeps
# column 0 (presumably the "states" column that getProp looks up -- confirm)
# followed by that category's own run of columns.
dfReadyYouth = data.iloc[:, :10]
dfAdulthood1 = data.iloc[:, [0, *range(10, 17)]]
dfAdulthood2 = data.iloc[:, [0, *range(17, 23)]]
dfSocial = data.iloc[:, [0, *range(23, 29)]]
dfOverall = data.iloc[:, [0, *range(29, 34)]]


In [None]:
# Preview the first five rows of the first category frame.
dfReadyYouth.head(n=5)

In [None]:
def getProp(df):
    """Build a nested ``{state: {column: value}}`` lookup from a score table.

    The states are read from ``df`` itself, so the function does not depend
    on any notebook-level variable.

    Args:
        df: DataFrame with a "states" column. Every column after the first
            one is copied into the per-state dictionary.

    Returns:
        dict keyed by state name, in row order. Each value maps a column
        label to that state's cell value, with missing values (as detected
        by ``pd.isna``) replaced by ``None``. If a state occurs in several
        rows, the first row is used.
    """
    states = df["states"].tolist()
    # Pull each column out once instead of re-filtering the frame for every
    # (state, column) pair.
    valuesByCol = {col: df[col].tolist() for col in df.columns[1:]}

    prop = {}
    for rowPos, state in enumerate(states):
        if state in prop:
            # Duplicate state: keep the values of its first row.
            continue
        prop[state] = {
            col: None if pd.isna(values[rowPos]) else values[rowPos]
            for col, values in valuesByCol.items()
        }

    return prop

# One per-state lookup for each category frame, in the same order as the frames.
propReadyYouth, propAdultHood1, propAdultHood2, propSocial, propOverall = (
    getProp(frame)
    for frame in (dfReadyYouth, dfAdulthood1, dfAdulthood2, dfSocial, dfOverall)
)

In [None]:
# Merge the per-category lookups into one properties record per state.
props = {
    name: {
        "name": name,
        "abbr": stateAbbrMap[name],
        "youth": propReadyYouth[name],
        "adulthood1": propAdultHood1[name],
        "adulthood2": propAdultHood2[name],
        "social": propSocial[name],
        "overall": propOverall[name],
    }
    for name in data["states"].tolist()
}


### Get States Polygon Coordinates

In [None]:
import json

# Read the state outlines. The context manager closes the file handle once
# parsing is done; the encoding is pinned to UTF-8 so the result does not
# depend on the platform's default text encoding.
with open("../data/states-coors.json", "r", encoding="utf-8") as coorsFile:
    jsonData = json.load(coorsFile)

# Index each feature's geometry by the feature's NAME property.
statesCoors = {
    feat["properties"]["NAME"]: feat["geometry"] for feat in jsonData["features"]
}


### Convert Data to GeoJSON

In [None]:
# Assemble one GeoJSON Feature per state row: the merged properties record
# paired with the geometry stored under the same state name.
geoJson = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": props[name],
            "geometry": statesCoors[name],
        }
        for name in data["states"].tolist()
    ],
}


In [None]:
# Serialize the feature collection and write it out as a single JSON document.
with open("../data/states-careers-score.json", "w") as outfile:
    outfile.write(json.dumps(geoJson))