In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import pandas as pd

In [40]:
# A basic linear regression needs a set of predictor variables and one target variable.
# Goal: model each county's 2022 unemployment rate from its rural-urban continuum code
# (the column the model cell actually uses), its 2021 median household income, and the
# overall unemployment rate of the state it belongs to.

df = pd.read_csv("data/county_unemployment.csv")
# .copy() makes the 2022 subset an independent frame, so the column assignments
# below do not write into a filtered slice of the full table (SettingWithCopyWarning).
df = df[df.Year == 2022].copy()
df['Workforce'] = df.Employed + df.Unemployed
df['Unemployment Rate'] = df.Unemployed / df.Workforce

# State-level rate = pooled unemployed / pooled workforce across the state's counties
# (this is not the mean of the county rates). numeric_only=True keeps text columns
# such as Area_Name out of the aggregation.
df_state = df.groupby("State").sum(numeric_only=True)
state_u = df_state.Unemployed / df_state.Workforce

# Vectorised lookup of each county's state rate; replaces a row-by-row apply.
# A State value missing from state_u yields NaN here instead of raising KeyError.
df['State Unemployment'] = df["State"].map(state_u)
df


Unnamed: 0,FIPS_Code,State,Area_Name,Rural_Urban_Continuum_Code_2013,Urban_Influence_Code_2013,Metro_2013,Median_Household_Income_2021,Year,Employed,Unemployed,Workforce,Unemployment Rate,State Unemployment
22,1001,AL,Autauga,2.0,2.0,1.0,66444.0,2022,26181.0,608.0,26789.0,0.022696,0.025968
45,1003,AL,Baldwin,3.0,2.0,1.0,65658.0,2022,100432.0,2417.0,102849.0,0.023500,0.025968
68,1005,AL,Barbour,6.0,6.0,0.0,38649.0,2022,7906.0,335.0,8241.0,0.040650,0.025968
91,1007,AL,Bibb,1.0,1.0,1.0,48454.0,2022,8507.0,219.0,8726.0,0.025097,0.025968
114,1009,AL,Blount,1.0,1.0,1.0,56894.0,2022,25222.0,574.0,25796.0,0.022252,0.025968
...,...,...,...,...,...,...,...,...,...,...,...,...,...
72040,56037,WY,Sweetwater,5.0,8.0,0.0,74677.0,2022,18696.0,811.0,19507.0,0.041575,0.035694
72063,56039,WY,Teton,7.0,8.0,0.0,102709.0,2022,16193.0,454.0,16647.0,0.027272,0.035694
72086,56041,WY,Uinta,7.0,8.0,0.0,70162.0,2022,8524.0,339.0,8863.0,0.038249,0.035694
72109,56043,WY,Washakie,7.0,11.0,0.0,62176.0,2022,3704.0,154.0,3858.0,0.039917,0.035694


In [39]:
# Predictor columns and the target column for the county-level model.
predictor_columns = [
    "Rural_Urban_Continuum_Code_2013",
    "Median_Household_Income_2021",
    "State Unemployment",
]
features = df[predictor_columns]
target = df["Unemployment Rate"]

# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(features, target)

# Both metrics below are computed on the same rows the model was fitted on.
r_squared = lr.score(features, target)  # coefficient of determination
fitted_rates = lr.predict(features)
print("R2 = ", r_squared)
print("MAE = ", mean_absolute_error(target, fitted_rates))  # mean absolute miss of the fit

# What counts as "good"? The model should at least beat a very naive prediction:
# guessing the average county unemployment rate for every county.
average_unemployment = target.mean()
delta_unemployment = target.sub(average_unemployment)
print("Naive error = ", delta_unemployment.abs().mean())
# Many extensions are possible from here (e.g., a 5 year prediction model).


R2 =  0.45847601146159767
MAE =  0.0061660684101754034
Naive error =  0.00912157951504589
