# Language Spoken at Home and Ability to Speak English Script
Run all of the code cells below, in order, to create the LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH data table.

### Import Statements

In [51]:
import os
import warnings

import pandas as pd
# Silence every warning for the rest of the session: no category or module
# filter is given, so this also hides pandas warnings raised by the cells below.
warnings.filterwarnings("ignore")

### Function to create row for each year

In [52]:
def read_file_details(file,year):
    """Build a one-row table of language-at-home population estimates for one year.

    The CSV must have an ``Estimate`` column. The first row's ``Estimate`` is
    parsed as the total population count (thousands separators allowed). The
    first three rows whose ``Estimate`` is a percentage string (contains ``%``)
    are then used positionally:

    1. ``eng_only``            = 1st percentage * total population
    2. ``lang_other_than_eng`` = 2nd percentage * total population
    3. ``eng_lt_very_well``    = 3rd percentage * ``lang_other_than_eng``

    NOTE(review): the mapping of rows to categories is positional; labels are
    not checked. Presumably the source rows are "English only", "Language other
    than English" and "Speak English less than very well" -- confirm against
    the input files.

    Parameters
    ----------
    file : str or path-like
        Path of the CSV file to read.
    year : int
        Value used as the index label of the returned row.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``year`` with float columns ``eng_only``,
        ``lang_other_than_eng``, ``eng_lt_very_well`` and ``total_pop_5_plus``.

    Raises
    ------
    FileNotFoundError
        If ``file`` does not exist (raised by ``pandas.read_csv``).
    KeyError
        If the CSV has no ``Estimate`` column.
    ValueError
        If the file has no rows, the first estimate is not an integer count,
        or fewer than three percentage rows are present.
    """
    raw = pd.read_csv(file)
    if raw.empty:
        raise ValueError(f"'{file}' contains no data rows")

    # Work on string values only, so blank cells cannot break the '%' test.
    estimates = raw['Estimate'].astype(str).str.strip()

    # The first row holds the total population as a count such as "67,542".
    total_pop = int(estimates.iloc[0].replace(',',''))

    # Pick percentage rows by the Estimate value alone. A blank cell in an
    # unrelated column (e.g. the margin of error) must not drop a row and
    # shift which categories are read.
    pct_strings = estimates[estimates.str.contains('%', regex=False)]
    if len(pct_strings) < 3:
        raise ValueError(
            f"'{file}' has {len(pct_strings)} percentage rows; at least 3 are required"
        )
    shares = [float(text.rstrip('%'))/100 for text in pct_strings.iloc[:3]]

    # The first two shares are fractions of the total population; the third is
    # a fraction of the second estimate.
    eng_only = shares[0]*total_pop
    lang_other_than_eng = shares[1]*total_pop
    eng_lt_very_well = shares[2]*lang_other_than_eng

    # All columns are floats, including the total, so every year has one dtype.
    return pd.DataFrame(
        {
            'eng_only': [eng_only],
            'lang_other_than_eng': [lang_other_than_eng],
            'eng_lt_very_well': [eng_lt_very_well],
            'total_pop_5_plus': [float(total_pop)],
        },
        index=[year],
    )

### Test that function works

In [53]:
# Spot check on a single year; the returned one-row frame is displayed as the cell output.
read_file_details('data/2010/LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH.CSV', 2010)

Unnamed: 0,eng_only,lang_other_than_eng,eng_lt_very_well,total_pop_5_plus
2010,21816.066,45725.934,9282.364602,67542.0


### Test that function works for each year
Expected output: only "No file for year '2020'".

In [54]:
# Try every year from 2010 through 2022. A missing input file is reported as
# such; any other failure is reported separately with its cause, so a parsing
# problem is not mistaken for a missing file. The loop continues either way.
for year in range(2010,2023):
    file = 'data/'+str(year)+'/LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH.CSV'

    try:
        read_file_details(file,year)
    except FileNotFoundError:
        print(f"No file for year '{year}'")
    except Exception as err:
        print(f"Could not process year '{year}': {type(err).__name__}: {err}")

No file for year '2020'


### Run function for each year

In [55]:
# Build one single-row table per year, newest first. The year is written once
# and used for both the path and the row label, so the two cannot disagree.
# 2020 is not in the list.
(
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2022,
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2021,
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2019,
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2018,
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2017,
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2016,
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2015,
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2014,
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2013,
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2012,
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2011,
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2010,
) = [
    read_file_details('data/'+str(year)+'/LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH.CSV', year)
    for year in (2022, 2021, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010)
]

### Combine each year into single dataframe containing all years

In [56]:
# Stack the per-year rows into one table, newest year first, then display it.
LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH = pd.concat(
    [
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2022,
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2021,
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2019,
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2018,
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2017,
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2016,
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2015,
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2014,
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2013,
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2012,
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2011,
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH_2010,
    ]
)
LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH

Unnamed: 0,eng_only,lang_other_than_eng,eng_lt_very_well,total_pop_5_plus
2022,34423.2,51634.8,8623.0116,86058.0
2021,37797.75,46197.25,7391.56,83995.0
2019,34196.558,51724.442,8586.257372,85921.0
2018,30771.816,53304.184,10287.707512,84076.0
2017,29179.417,49049.583,9466.569519,78229.0
2016,28135.146,43273.854,8048.936844,71409.0
2015,28323.592,48852.408,8842.285848,77176.0
2014,25456.985,50534.015,8944.520655,75991.0
2013,25771.228,52560.772,10880.079804,78332.0
2012,23108.36,49331.64,11099.619,72440.0


### Reset index on combined dataframe

In [57]:
# Move the year labels out of the index into a regular 'year' column.
# The guard makes the cell safe to re-run: without it, a second run would
# push the new 0..n index into the table as another 'year' column.
if 'year' not in LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH.columns:
    LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH = (
        LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH
        .reset_index()
        .rename(columns={'index':'year'})
    )
LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH

Unnamed: 0,year,eng_only,lang_other_than_eng,eng_lt_very_well,total_pop_5_plus
0,2022,34423.2,51634.8,8623.0116,86058.0
1,2021,37797.75,46197.25,7391.56,83995.0
2,2019,34196.558,51724.442,8586.257372,85921.0
3,2018,30771.816,53304.184,10287.707512,84076.0
4,2017,29179.417,49049.583,9466.569519,78229.0
5,2016,28135.146,43273.854,8048.936844,71409.0
6,2015,28323.592,48852.408,8842.285848,77176.0
7,2014,25456.985,50534.015,8944.520655,75991.0
8,2013,25771.228,52560.772,10880.079804,78332.0
9,2012,23108.36,49331.64,11099.619,72440.0


### Save final dataframe as CSV to 'final' folder in 'data' folder

In [58]:
# Create the output folder if needed so the save also works on a fresh checkout;
# to_csv does not create missing directories.
os.makedirs("data/final", exist_ok=True)
LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH.to_csv("data/final/LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH.CSV", index=False)

### Check that dataframe was saved properly as CSV

In [59]:
# Read the saved file back and display it so the written contents can be
# compared by eye with the combined table shown above.
df = pd.read_csv('data/final/LANGUAGE_SPOKEN_AT_HOME_AND_ABILITY_TO_SPEAK_ENGLISH.CSV')
df

Unnamed: 0,year,eng_only,lang_other_than_eng,eng_lt_very_well,total_pop_5_plus
0,2022,34423.2,51634.8,8623.0116,86058.0
1,2021,37797.75,46197.25,7391.56,83995.0
2,2019,34196.558,51724.442,8586.257372,85921.0
3,2018,30771.816,53304.184,10287.707512,84076.0
4,2017,29179.417,49049.583,9466.569519,78229.0
5,2016,28135.146,43273.854,8048.936844,71409.0
6,2015,28323.592,48852.408,8842.285848,77176.0
7,2014,25456.985,50534.015,8944.520655,75991.0
8,2013,25771.228,52560.772,10880.079804,78332.0
9,2012,23108.36,49331.64,11099.619,72440.0
