In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [14]:
def education(file,year):
    # Read in file
    df = pd.read_csv(file)

    # Get rid of spaces in Labels
    df['Label'] = df['Label'].apply(lambda x: x.lstrip())

    # Convert percentage estimates to float
    df['Percentage'] = df['Estimate'].apply(lambda x: float(x.rstrip('%'))/100 if '%' in x else None)

    # Keep only individual population stats
    df = df.iloc[:6]

    # Move Total Population 25+ from row to column
    df['Total Population 25+'] = int(df.iloc[0]['Estimate'].replace(',',''))
    df.dropna(inplace=True)

    # Drop unnecessary columns
    df.drop(columns=['Estimate','Margin of Error'], inplace=True)

    # Calculate population estimates
    df['Population Estimate'] = df['Percentage']*df['Total Population 25+']

    # Drop unnecessary columns
    df.drop(columns=['Percentage','Total Population 25+'], inplace=True)

    # Change index and transpose table
    df.set_index('Label', inplace=True)
    df = df.T

    # Change index to year
    df.index = [year]

    # Calculate extra columns
    df['Total Population 25+'] = df.sum(axis=1)
    df['High School Graduate or Higher'] = df['High school graduate (includes equivalency)']+df['Some college or associate\'s degree']+df['Bachelor\'s degree']+df['Graduate or professional degree']
    df["Bachelor's Degree or Higher"] = df['Bachelor\'s degree']+df['Graduate or professional degree']

    return df

In [15]:
education('data/2014/EDUCATIONAL_ATTAINMENT.CSV',2014)

Label,Less than high school diploma,High school graduate (includes equivalency),Some college or associate's degree,Bachelor's degree,Graduate or professional degree,Total Population 25+,High School Graduate or Higher,Bachelor's Degree or Higher
2014,1814.208,3458.334,16101.096,29764.35,5612.706,56750.694,54936.486,35377.056


In [16]:
for year in range(2010,2023):
    file = 'data/'+str(year)+'/EDUCATIONAL_ATTAINMENT.CSV'

    try:
        education(file,year)
    except:
        print(f"No file for year '{year}'")

No file for year '2020'


In [17]:
EDUCATIONAL_ATTAINMENT_2022 = education('data/2022/EDUCATIONAL_ATTAINMENT.CSV',2022)
EDUCATIONAL_ATTAINMENT_2021 = education('data/2021/EDUCATIONAL_ATTAINMENT.CSV',2021)
EDUCATIONAL_ATTAINMENT_2019 = education('data/2019/EDUCATIONAL_ATTAINMENT.CSV',2019)
EDUCATIONAL_ATTAINMENT_2018 = education('data/2018/EDUCATIONAL_ATTAINMENT.CSV',2018)
EDUCATIONAL_ATTAINMENT_2017 = education('data/2017/EDUCATIONAL_ATTAINMENT.CSV',2017)
EDUCATIONAL_ATTAINMENT_2016 = education('data/2016/EDUCATIONAL_ATTAINMENT.CSV',2016)
EDUCATIONAL_ATTAINMENT_2015 = education('data/2015/EDUCATIONAL_ATTAINMENT.CSV',2015)
EDUCATIONAL_ATTAINMENT_2014 = education('data/2014/EDUCATIONAL_ATTAINMENT.CSV',2014)
EDUCATIONAL_ATTAINMENT_2013 = education('data/2013/EDUCATIONAL_ATTAINMENT.CSV',2013)
EDUCATIONAL_ATTAINMENT_2012 = education('data/2012/EDUCATIONAL_ATTAINMENT.CSV',2012)
EDUCATIONAL_ATTAINMENT_2011 = education('data/2011/EDUCATIONAL_ATTAINMENT.CSV',2011)
EDUCATIONAL_ATTAINMENT_2010 = education('data/2010/EDUCATIONAL_ATTAINMENT.CSV',2010)

In [18]:
EDUCATIONAL_ATTAINMENT = pd.concat([EDUCATIONAL_ATTAINMENT_2022,EDUCATIONAL_ATTAINMENT_2021,EDUCATIONAL_ATTAINMENT_2019,EDUCATIONAL_ATTAINMENT_2018,EDUCATIONAL_ATTAINMENT_2017,EDUCATIONAL_ATTAINMENT_2016,EDUCATIONAL_ATTAINMENT_2015,EDUCATIONAL_ATTAINMENT_2014,EDUCATIONAL_ATTAINMENT_2013,EDUCATIONAL_ATTAINMENT_2012,EDUCATIONAL_ATTAINMENT_2011,EDUCATIONAL_ATTAINMENT_2010])
EDUCATIONAL_ATTAINMENT

Label,Less than high school diploma,High school graduate (includes equivalency),Some college or associate's degree,Bachelor's degree,Graduate or professional degree,Total Population 25+,High School Graduate or Higher,Bachelor's Degree or Higher
2022,1915.392,4161.024,15323.136,35599.872,9180.672,66180.096,64264.704,44780.544
2021,2599.687,6530.921,13822.726,32908.233,7545.433,63407.0,60807.313,40453.666
2019,1817.676,5258.277,16748.586,33432.255,7725.123,64981.917,63164.241,41157.378
2018,2119.458,5202.306,14643.528,34425.136,7835.572,64226.0,62106.542,42260.708
2017,1446.072,4518.975,14400.467,33078.897,6808.589,60253.0,58806.928,39887.486
2016,2401.564,4475.642,12116.982,28709.606,6822.625,54526.419,52124.855,35532.231
2015,2380.378,4412.408,15269.254,30015.986,5979.974,58058.0,55677.622,35995.96
2014,1814.208,3458.334,16101.096,29764.35,5612.706,56750.694,54936.486,35377.056
2013,2785.831,6045.846,13632.79,28332.494,8476.039,59273.0,56487.169,36808.533
2012,2520.018,4327.857,12490.524,28651.509,6793.092,54783.0,52262.982,35444.601


In [19]:
EDUCATIONAL_ATTAINMENT.reset_index(inplace=True)
EDUCATIONAL_ATTAINMENT.rename(columns={'index':'Year'}, inplace=True)
#EDUCATIONAL_ATTAINMENT.index.names = ['index']
EDUCATIONAL_ATTAINMENT

Label,Year,Less than high school diploma,High school graduate (includes equivalency),Some college or associate's degree,Bachelor's degree,Graduate or professional degree,Total Population 25+,High School Graduate or Higher,Bachelor's Degree or Higher
0,2022,1915.392,4161.024,15323.136,35599.872,9180.672,66180.096,64264.704,44780.544
1,2021,2599.687,6530.921,13822.726,32908.233,7545.433,63407.0,60807.313,40453.666
2,2019,1817.676,5258.277,16748.586,33432.255,7725.123,64981.917,63164.241,41157.378
3,2018,2119.458,5202.306,14643.528,34425.136,7835.572,64226.0,62106.542,42260.708
4,2017,1446.072,4518.975,14400.467,33078.897,6808.589,60253.0,58806.928,39887.486
5,2016,2401.564,4475.642,12116.982,28709.606,6822.625,54526.419,52124.855,35532.231
6,2015,2380.378,4412.408,15269.254,30015.986,5979.974,58058.0,55677.622,35995.96
7,2014,1814.208,3458.334,16101.096,29764.35,5612.706,56750.694,54936.486,35377.056
8,2013,2785.831,6045.846,13632.79,28332.494,8476.039,59273.0,56487.169,36808.533
9,2012,2520.018,4327.857,12490.524,28651.509,6793.092,54783.0,52262.982,35444.601


In [20]:
EDUCATIONAL_ATTAINMENT.to_csv('data/final/EDUCATIONAL_ATTAINMENT.CSV', index=False)

In [21]:
df = pd.read_csv('data/final/EDUCATIONAL_ATTAINMENT.CSV')
df

Unnamed: 0,Year,Less than high school diploma,High school graduate (includes equivalency),Some college or associate's degree,Bachelor's degree,Graduate or professional degree,Total Population 25+,High School Graduate or Higher,Bachelor's Degree or Higher
0,2022,1915.392,4161.024,15323.136,35599.872,9180.672,66180.096,64264.704,44780.544
1,2021,2599.687,6530.921,13822.726,32908.233,7545.433,63407.0,60807.313,40453.666
2,2019,1817.676,5258.277,16748.586,33432.255,7725.123,64981.917,63164.241,41157.378
3,2018,2119.458,5202.306,14643.528,34425.136,7835.572,64226.0,62106.542,42260.708
4,2017,1446.072,4518.975,14400.467,33078.897,6808.589,60253.0,58806.928,39887.486
5,2016,2401.564,4475.642,12116.982,28709.606,6822.625,54526.419,52124.855,35532.231
6,2015,2380.378,4412.408,15269.254,30015.986,5979.974,58058.0,55677.622,35995.96
7,2014,1814.208,3458.334,16101.096,29764.35,5612.706,56750.694,54936.486,35377.056
8,2013,2785.831,6045.846,13632.79,28332.494,8476.039,59273.0,56487.169,36808.533
9,2012,2520.018,4327.857,12490.524,28651.509,6793.092,54783.0,52262.982,35444.601
