In [107]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

In [108]:
# Load the state-level sheet and clean it in a single chain.
# NOTE(review): row label 51 is dropped and every '**' cell becomes 0 --
# presumably a non-state summary row and a suppressed-value marker; confirm
# against the workbook, since zeros feed straight into the regressions below.
states = (
    pd.read_excel(
        'aspe-uninsured-estimates-by-state.xlsx',
        sheet_name='All Uninsured (%)',
    )
    .drop([51])
    .replace('**', 0)
)
states

Unnamed: 0,State Name,Total Non-Elderly Population (Excluding Undocumented),Uninsured Population (Excluding Undocumented),Percent Uninsured,% HIU Income < 100% FPL,% HIU Income 100-138% FPL,% HIU Income 139-249% FPL,% HIU Income 250-400% FPL,% HIU Income 400% FPL,% Age 0-18,...,% College Grad,% No English Speaking Adults in HH,% English Spoken in HH,% Spanish Spoken in HH,% Chinese Spoken in HH,% Korean Spoken in HH,% Vietnamese Spoken in HH,% Tagalog Spoken in HH,% Russian Spoken in HH,% Other Language Spoken in HH
0,Alabama,3986500,456100,0.11,0.47,0.11,0.21,0.13,0.09,0.08,...,0.08,0.02,0.97,0.03,0.0,0.0,0.0,0.0,0.0,0.0
1,Alaska,629300,80300,0.13,0.26,0.1,0.28,0.22,0.14,0.17,...,0.13,0.01,0.97,0.01,0.0,0.0,0.0,0.0,0.0,0.02
2,Arizona,5720700,698800,0.12,0.32,0.09,0.28,0.19,0.13,0.21,...,0.15,0.07,0.9,0.08,0.0,0.0,0.0,0.0,0.0,0.02
3,Arkansas,2443900,257200,0.11,0.34,0.11,0.31,0.15,0.08,0.15,...,0.07,0.04,0.95,0.04,0.0,0.0,0.0,0.0,0.0,0.01
4,California,32060700,2397600,0.07,0.3,0.08,0.26,0.19,0.16,0.14,...,0.17,0.14,0.75,0.19,0.01,0.01,0.01,0.0,0.0,0.02
5,Colorado,4758400,392300,0.08,0.23,0.07,0.26,0.23,0.21,0.16,...,0.21,0.07,0.91,0.06,0.0,0.0,0.0,0.0,0.0,0.02
6,Connecticut,2795000,142200,0.05,0.22,0.07,0.25,0.21,0.25,0.15,...,0.23,0.1,0.85,0.08,0.0,0.0,0.0,0.0,0.0,0.06
7,Delaware,757200,57800,0.08,0.32,0.07,0.28,0.18,0.15,0.15,...,0.19,0.05,0.91,0.04,0.0,0.0,0.0,0.0,0.0,0.03
8,District of Columbia,601300,22100,0.04,0.41,0.07,0.2,0.12,0.2,0.12,...,0.29,0.03,0.97,0.02,0.0,0.0,0.0,0.0,0.0,0.0
9,Florida,16240100,2519500,0.16,0.35,0.09,0.26,0.17,0.13,0.12,...,0.16,0.11,0.84,0.14,0.0,0.0,0.0,0.0,0.0,0.02


# Run Linear Regression Between Groups and Within Groups

In [109]:
# Column labels grouped by the demographic dimension they describe.
# Each list is a set of share columns that belong together.
income = [
    '% HIU Income < 100% FPL',
    '% HIU Income 100-138% FPL',
    '% HIU Income 139-249% FPL',
    '% HIU Income 250-400% FPL',
    '% HIU Income 400% FPL',
]
age = [
    '% Age 0-18',
    '% Age 19-34',
    '% Age 35-49',
    '% Age 50-64',
]
gender = [
    '% Male',
    '% Female',
]
origin = [
    '% Spanish/Hispanic/Latino Origin',
    '% White Non-Latino',
    '% Black Non-Latino',
    '% Asian / Native-Hawaiian / Pac Islander',
    '% American Indian / Alaska Native',
    '% Multi-racial or Other',
]
employment = [
    '% Employed in Agriculture Industry',
    '% Employed in Mining/Construction Industry',
    '% Employed in Manufacturing Industry',
    '% Employed in Trade Industry',
    '% Employed in Info/Finance Industry',
    '% Employed in Education/Health Industry',
    '% Employed in Entertainment Industry',
    '% Employed in Service Industry',
    '% Employed in Military/Public Industry',
]
education = [
    '% Less than High School',
    '% High School Diploma',
    '% College Grad',
]
language = [
    '% No English Speaking Adults in HH',
    '% English Spoken in HH',
    '% Spanish Spoken in HH',
    '% Chinese Spoken in HH',
    '% Korean Spoken in HH',
    '% Vietnamese Spoken in HH',
    '% Tagalog Spoken in HH',
    '% Russian Spoken in HH',
    '% Other Language Spoken in HH',
]

# Everything that gets compared: multi-column groups appear as lists,
# stand-alone variables as plain column names.
new_cols = [
    income,
    age,
    gender,
    '% Married',
    '% Child in Family',
    origin,
    '% SNAP Recipient',
    '% With a disability',
    '% Full-time Worker in Family',
    employment,
    education,
    language,
]

In [110]:
# Fit a linear regression between two sets of columns and report how well
# the first set linearly explains the second.
def perform_linear(col1,col2):
    """Regress ``col2`` on ``col1`` using the global ``states`` frame and print the fit.

    Parameters
    ----------
    col1 : str or list of str
        Predictor column name(s). A single name is accepted and treated as a
        one-column predictor set.
    col2 : str or list of str
        Response column name(s).

    Returns
    -------
    None
        The R^2 score and the root-mean-square error of the in-sample
        predictions are printed, not returned.
    """
    # Selecting with a bare string yields a 1-D Series, which scikit-learn
    # does not accept as a feature matrix; wrap it so x is always 2-D.
    if isinstance(col1, str):
        col1 = [col1]
    x = states[col1]
    y = states[col2]

    lr = LinearRegression().fit(x, y)
    y_pred = lr.predict(x)

    # Both metrics are computed on the same rows the model was fitted on.
    r2 = lr.score(x, y)
    rms = np.sqrt(mean_squared_error(y, y_pred))
    print('R^2:', r2)
    print('Root Mean Square:', rms)

In [111]:
# Use the income columns as predictors for every entry of new_cols in turn
# and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(income, target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 1.0
Root Mean Square: 4.3756153221168447e-17
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 0.13469044609496347
Root Mean Square: 0.029317173238997695
['% Male', '% Female']
R^2: 0.17392220290156135
Root Mean Square: 0.031287490089363934
% Married
R^2: 0.22091459800922486
Root Mean Square: 0.030655423471557742
% Child in Family
R^2: 0.34192102979646155
Root Mean Square: 0.049268425206810205
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 0.17443772583192474
Root Mean Square: 0.0981294996802577
% SNAP Recipient
R^2: 0.481818956273817
Root Mean Square: 0.03604300701812119
% With a disability
R^2: 0.3481270335977521
Root Mean Square: 0.018111230365931173
% Full-time Worker in Family
R^

In [112]:
# Use the age columns as predictors for every entry of new_cols in turn
# and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(age, target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 0.14416423704716527
Root Mean Square: 0.039420694029484
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 1.0
Root Mean Square: 3.3252933287712145e-17
['% Male', '% Female']
R^2: 0.46230935628817843
Root Mean Square: 0.025242158532606356
% Married
R^2: 0.4697386935568507
Root Mean Square: 0.02529063624145211
% Child in Family
R^2: 0.7788212143926594
Root Mean Square: 0.028562841455759255
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 0.19009019741100244
Root Mean Square: 0.1007482846334878
% SNAP Recipient
R^2: 0.126988665033118
Root Mean Square: 0.046783229348381264
% With a disability
R^2: 0.04888867627429072
Root Mean Square: 0.021876716826775187
% Full-time Worker in Family
R^2: 0

In [113]:
# Use the gender columns as predictors for every entry of new_cols in turn
# and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(gender, target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 0.06116250277305082
Root Mean Square: 0.041363481287104244
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 0.14357858648590302
Root Mean Square: 0.02891808908628674
['% Male', '% Female']
R^2: 1.0
Root Mean Square: 3.9635269915872887e-17
% Married
R^2: 0.4052873347617779
Root Mean Square: 0.02678356483101098
% Child in Family
R^2: 0.4189691753763283
Root Mean Square: 0.04629448542237025
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 0.049601492003008806
Root Mean Square: 0.10741675386869128
% SNAP Recipient
R^2: 0.046474926379495574
Root Mean Square: 0.04889295820647535
% With a disability
R^2: 0.09867532073354868
Root Mean Square: 0.021296444385522428
% Full-time Worker in Family
R

In [114]:
# Use the married share as the single predictor for every entry of new_cols
# in turn and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(['% Married'], target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 0.05390187190608649
Root Mean Square: 0.04112159300906002
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 0.16704503915821217
Root Mean Square: 0.029139557202815698
['% Male', '% Female']
R^2: 0.3801451673393695
Root Mean Square: 0.02710224369568119
% Married
R^2: 1.0
Root Mean Square: 1.229037546096733e-17
% Child in Family
R^2: 0.15694576830100504
Root Mean Square: 0.0557644510034097
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 0.12451227116145797
Root Mean Square: 0.10092545167626601
% SNAP Recipient
R^2: 0.014158474560785783
Root Mean Square: 0.04971458396980963
% With a disability
R^2: 0.002386862825068947
Root Mean Square: 0.02240513416812255
% Full-time Worker in Family
R^2

In [115]:
# Use the child-in-family share as the single predictor for every entry of
# new_cols in turn and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(['% Child in Family'], target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 0.07652065569640545
Root Mean Square: 0.041099609182615134
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 0.28985033888191114
Root Mean Square: 0.025103268247437152
['% Male', '% Female']
R^2: 0.4187009305067846
Root Mean Square: 0.026245815052751257
% Married
R^2: 0.15694576830100482
Root Mean Square: 0.031889120581485166
% Child in Family
R^2: 1.0
Root Mean Square: 4.8074067159589095e-17
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 0.0518756359583698
Root Mean Square: 0.10854822077305432
% SNAP Recipient
R^2: 0.013549889759129297
Root Mean Square: 0.049729926634684285
% With a disability
R^2: 0.012793288282075377
Root Mean Square: 0.022287970221017143
% Full-time Worker in Fami

In [116]:
# Use the origin columns as predictors for every entry of new_cols in turn
# and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(origin, target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 0.2722983839390108
Root Mean Square: 0.03495605143566238
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 0.33444953612624845
Root Mean Square: 0.02600207487391985
['% Male', '% Female']
R^2: 0.17332013537507784
Root Mean Square: 0.03129888959121728
% Married
R^2: 0.6469177467156346
Root Mean Square: 0.02063729304559775
% Child in Family
R^2: 0.3363374498884444
Root Mean Square: 0.04947699677907675
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 1.0
Root Mean Square: 9.681776059659017e-17
% SNAP Recipient
R^2: 0.15205941186723138
Root Mean Square: 0.04610658620988621
% With a disability
R^2: 0.25244426751681504
Root Mean Square: 0.019394931819054183
% Full-time Worker in Family
R^2: 0

In [117]:
# Use the SNAP-recipient share as the single predictor for every entry of
# new_cols in turn and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(['% SNAP Recipient'], target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 0.25513289216807317
Root Mean Square: 0.03383779453467455
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 0.04588723729852123
Root Mean Square: 0.03094861422777898
['% Male', '% Female']
R^2: 0.042012122134640395
Root Mean Square: 0.03369304461912621
% Married
R^2: 0.014158474560785117
Root Mean Square: 0.03448405587442394
% Child in Family
R^2: 0.013549889759129075
Root Mean Square: 0.060320820107935845
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 0.027167777838703815
Root Mean Square: 0.10916245320244872
% SNAP Recipient
R^2: 1.0
Root Mean Square: 1.3179959344295418e-17
% With a disability
R^2: 0.22091751859927222
Root Mean Square: 0.019799680786971988
% Full-time Worker in Fami

In [118]:
# Use the disability share as the single predictor for every entry of
# new_cols in turn and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(['% With a disability'], target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 0.18721147908634353
Root Mean Square: 0.036872706110898275
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 0.01043593615443622
Root Mean Square: 0.03167612039341446
['% Male', '% Female']
R^2: 0.09202973580220697
Root Mean Square: 0.03280167815922143
% Married
R^2: 0.002386862825068725
Root Mean Square: 0.034689326351199236
% Child in Family
R^2: 0.01279328828207571
Root Mean Square: 0.060343948532421796
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 0.06008014531601036
Root Mean Square: 0.10701256970632594
% SNAP Recipient
R^2: 0.22091751859927256
Root Mean Square: 0.044194881636665774
% With a disability
R^2: 1.0
Root Mean Square: 1.605411917726113e-17
% Full-time Worker in Family

In [119]:
# Use the full-time-worker share as the single predictor for every entry of
# new_cols in turn and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(['% Full-time Worker in Family'], target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 0.371660261379708
Root Mean Square: 0.028681428202707214
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 0.09462181474167183
Root Mean Square: 0.02981020544755352
['% Male', '% Female']
R^2: 0.05949938286038092
Root Mean Square: 0.033384109231041445
% Married
R^2: 0.2524752820613665
Root Mean Square: 0.030028080071080192
% Child in Family
R^2: 0.2669873715690301
Root Mean Square: 0.05199785193529589
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 0.13550122169656778
Root Mean Square: 0.09917036745510552
% SNAP Recipient
R^2: 0.29523136473401923
Root Mean Square: 0.04203427292810468
% With a disability
R^2: 0.1105708128715649
Root Mean Square: 0.021155444638860498
% Full-time Worker i

In [120]:
# Use the employment columns as predictors for every entry of new_cols in
# turn and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(employment, target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 0.39492044858085407
Root Mean Square: 0.032952786115352524
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 0.44105957615689084
Root Mean Square: 0.023408101185650534
['% Male', '% Female']
R^2: 0.4178030087431288
Root Mean Square: 0.02626607793982943
% Married
R^2: 0.6012804546184766
Root Mean Square: 0.021930500792819368
% Child in Family
R^2: 0.5831252519405847
Root Mean Square: 0.039213216668060664
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 0.39589636904040143
Root Mean Square: 0.07999723491614447
% SNAP Recipient
R^2: 0.20975432302159935
Root Mean Square: 0.04451038184817805
% With a disability
R^2: 0.3540643855185788
Root Mean Square: 0.01802856186199141
% Full-time Worker 

In [121]:
# Use the education columns as predictors for every entry of new_cols in
# turn and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(education, target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 0.32543903716053446
Root Mean Square: 0.03467237460430579
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 0.12538943929158733
Root Mean Square: 0.029818700647882782
['% Male', '% Female']
R^2: 0.19875765895826947
Root Mean Square: 0.030813582637487562
% Married
R^2: 0.1315470880323566
Root Mean Square: 0.032365917607312285
% Child in Family
R^2: 0.10541714563939464
Root Mean Square: 0.057443376510222625
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 0.2096599303132711
Root Mean Square: 0.09857410218341571
% SNAP Recipient
R^2: 0.15822736238835333
Root Mean Square: 0.04593858966081443
% With a disability
R^2: 0.26467379339217156
Root Mean Square: 0.019235633388823482
% Full-time Work

In [122]:
# Use the spoken-language columns as predictors for every entry of new_cols
# in turn and print the resulting R^2 and RMS error.
for target in new_cols:
    print(target)
    perform_linear(language, target)

['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL', '% HIU Income 400% FPL']
R^2: 0.2410715053227371
Root Mean Square: 0.036900218378628176
['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
R^2: 0.17390460516188452
Root Mean Square: 0.029092560253964478
['% Male', '% Female']
R^2: 0.26928874514639767
Root Mean Square: 0.029426129953228058
% Married
R^2: 0.14662607660239746
Root Mean Square: 0.03208370173987188
% Child in Family
R^2: 0.1548561955598753
Root Mean Square: 0.05583351640776113
['% Spanish/Hispanic/Latino Origin', '% White Non-Latino', '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander', '% American Indian / Alaska Native', '% Multi-racial or Other']
R^2: 0.5019633424285824
Root Mean Square: 0.08113159821355982
% SNAP Recipient
R^2: 0.11353442502743305
Root Mean Square: 0.04714234622644926
% With a disability
R^2: 0.3915161480364254
Root Mean Square: 0.017498104176971317
% Full-time Worker 

In [123]:
# Re-express each variable group as a list of one-element lists, so that
# every individual column can be selected on its own as a 2-D frame.
income_new = [[label] for label in (
    '% HIU Income < 100% FPL',
    '% HIU Income 100-138% FPL',
    '% HIU Income 139-249% FPL',
    '% HIU Income 250-400% FPL',
    '% HIU Income 400% FPL',
)]
age_new = [[label] for label in (
    '% Age 0-18',
    '% Age 19-34',
    '% Age 35-49',
    '% Age 50-64',
)]
gender_new = [[label] for label in (
    '% Male',
    '% Female',
)]
origin_new = [[label] for label in (
    '% Spanish/Hispanic/Latino Origin',
    '% White Non-Latino',
    '% Black Non-Latino',
    '% Asian / Native-Hawaiian / Pac Islander',
    '% American Indian / Alaska Native',
    '% Multi-racial or Other',
)]
employment_new = [[label] for label in (
    '% Employed in Agriculture Industry',
    '% Employed in Mining/Construction Industry',
    '% Employed in Manufacturing Industry',
    '% Employed in Trade Industry',
    '% Employed in Info/Finance Industry',
    '% Employed in Education/Health Industry',
    '% Employed in Entertainment Industry',
    '% Employed in Service Industry',
    '% Employed in Military/Public Industry',
)]
education_new = [[label] for label in (
    '% Less than High School',
    '% High School Diploma',
    '% College Grad',
)]
language_new = [[label] for label in (
    '% No English Speaking Adults in HH',
    '% English Spoken in HH',
    '% Spanish Spoken in HH',
    '% Chinese Spoken in HH',
    '% Korean Spoken in HH',
    '% Vietnamese Spoken in HH',
    '% Tagalog Spoken in HH',
    '% Russian Spoken in HH',
    '% Other Language Spoken in HH',
)]

In [124]:
# Pairwise fits inside the income group: every column is used as the single
# predictor for every column (itself included); R^2 and RMS are printed.
for predictor in income_new:
    for response in income_new:
        print(predictor, response, sep='\n')
        perform_linear(predictor, response)

['% HIU Income < 100% FPL']
['% HIU Income < 100% FPL']
R^2: 1.0
Root Mean Square: 4.310401311189063e-17
['% HIU Income < 100% FPL']
['% HIU Income 100-138% FPL']
R^2: 0.1569844970174067
Root Mean Square: 0.01650601721205957
['% HIU Income < 100% FPL']
['% HIU Income 139-249% FPL']
R^2: 0.21123250185779807
Root Mean Square: 0.024916352974247123
['% HIU Income < 100% FPL']
['% HIU Income 250-400% FPL']
R^2: 0.7425387656518447
Root Mean Square: 0.01675360357620652
['% HIU Income < 100% FPL']
['% HIU Income 400% FPL']
R^2: 0.477235093134902
Root Mean Square: 0.03393775305794642
['% HIU Income 100-138% FPL']
['% HIU Income < 100% FPL']
R^2: 0.15698449701740647
Root Mean Square: 0.06182018359100534
['% HIU Income 100-138% FPL']
['% HIU Income 100-138% FPL']
R^2: 1.0
Root Mean Square: 0.0
['% HIU Income 100-138% FPL']
['% HIU Income 139-249% FPL']
R^2: 0.00031007966177865054
Root Mean Square: 0.028050630525670864
['% HIU Income 100-138% FPL']
['% HIU Income 250-400% FPL']
R^2: 0.134447253676

In [125]:
# Pairwise fits inside the age group: every column is used as the single
# predictor for every column (itself included); R^2 and RMS are printed.
for predictor in age_new:
    for response in age_new:
        print(predictor, response, sep='\n')
        perform_linear(predictor, response)

['% Age 0-18']
['% Age 0-18']
R^2: 1.0
Root Mean Square: 2.784048121504467e-17
['% Age 0-18']
['% Age 19-34']
R^2: 0.1585044315516384
Root Mean Square: 0.03156683762422529
['% Age 0-18']
['% Age 35-49']
R^2: 0.3655470897643246
Root Mean Square: 0.018392318144082875
['% Age 0-18']
['% Age 50-64']
R^2: 0.19570920365387878
Root Mean Square: 0.024504091442670302
['% Age 19-34']
['% Age 0-18']
R^2: 0.1585044315516384
Root Mean Square: 0.03654480034840237
['% Age 19-34']
['% Age 19-34']
R^2: 1.0
Root Mean Square: 3.2978537999661915e-17
['% Age 19-34']
['% Age 35-49']
R^2: 0.058787464908530485
Root Mean Square: 0.02240168184318884
['% Age 19-34']
['% Age 50-64']
R^2: 0.22347845487698925
Root Mean Square: 0.02407735687782516
['% Age 35-49']
['% Age 0-18']
R^2: 0.36554708976432493
Root Mean Square: 0.03173214089201162
['% Age 35-49']
['% Age 19-34']
R^2: 0.05878746490853071
Root Mean Square: 0.03338481784400975
['% Age 35-49']
['% Age 35-49']
R^2: 1.0
Root Mean Square: 0.0
['% Age 35-49']
['% A

In [126]:
# Pairwise fits inside the origin group: every column is used as the single
# predictor for every column (itself included); R^2 and RMS are printed.
for predictor in origin_new:
    for response in origin_new:
        print(predictor, response, sep='\n')
        perform_linear(predictor, response)

['% Spanish/Hispanic/Latino Origin']
['% Spanish/Hispanic/Latino Origin']
R^2: 1.0
Root Mean Square: 8.527762723407564e-17
['% Spanish/Hispanic/Latino Origin']
['% White Non-Latino']
R^2: 0.40652726028985364
Root Mean Square: 0.13254201869404725
['% Spanish/Hispanic/Latino Origin']
['% Black Non-Latino']
R^2: 0.03818246898676536
Root Mean Square: 0.13086172077109884
['% Spanish/Hispanic/Latino Origin']
['% Asian / Native-Hawaiian / Pac Islander']
R^2: 0.020696281251902038
Root Mean Square: 0.0635019883316531
['% Spanish/Hispanic/Latino Origin']
['% American Indian / Alaska Native']
R^2: 0.0006159549993384195
Root Mean Square: 0.07189347363577639
['% Spanish/Hispanic/Latino Origin']
['% Multi-racial or Other']
R^2: 0.012277494894090024
Root Mean Square: 0.02605885542575207
['% White Non-Latino']
['% Spanish/Hispanic/Latino Origin']
R^2: 0.40652726028985375
Root Mean Square: 0.1015579300160963
['% White Non-Latino']
['% White Non-Latino']
R^2: 1.0
Root Mean Square: 4.7441486082078665e-17

In [127]:
# Pairwise fits inside the employment group: every column is used as the
# single predictor for every column (itself included); R^2 and RMS are printed.
for predictor in employment_new:
    for response in employment_new:
        print(predictor, response, sep='\n')
        perform_linear(predictor, response)

['% Employed in Agriculture Industry']
['% Employed in Agriculture Industry']
R^2: 1.0
Root Mean Square: 3.383343034060412e-18
['% Employed in Agriculture Industry']
['% Employed in Mining/Construction Industry']
R^2: 0.14369309914313155
Root Mean Square: 0.015878234248331492
['% Employed in Agriculture Industry']
['% Employed in Manufacturing Industry']
R^2: 0.05209324421203276
Root Mean Square: 0.02562512107100393
['% Employed in Agriculture Industry']
['% Employed in Trade Industry']
R^2: 0.08013976839263337
Root Mean Square: 0.019538083148022533
['% Employed in Agriculture Industry']
['% Employed in Info/Finance Industry']
R^2: 0.08908539414462846
Root Mean Square: 0.021487814567928826
['% Employed in Agriculture Industry']
['% Employed in Education/Health Industry']
R^2: 0.04215389970473615
Root Mean Square: 0.01690474099536352
['% Employed in Agriculture Industry']
['% Employed in Entertainment Industry']
R^2: 0.03603316497328357
Root Mean Square: 0.018899675606970527
['% Employe

R^2: 1.0
Root Mean Square: 9.114786389946357e-18
['% Employed in Entertainment Industry']
['% Employed in Service Industry']
R^2: 1.3222885820396435e-05
Root Mean Square: 0.007689624571835223
['% Employed in Entertainment Industry']
['% Employed in Military/Public Industry']
R^2: 0.04037049009203897
Root Mean Square: 0.014265721788844114
['% Employed in Service Industry']
['% Employed in Agriculture Industry']
R^2: 0.0015894949391872526
Root Mean Square: 0.013909120193233811
['% Employed in Service Industry']
['% Employed in Mining/Construction Industry']
R^2: 0.0004904051940934906
Root Mean Square: 0.01715461463756807
['% Employed in Service Industry']
['% Employed in Manufacturing Industry']
R^2: 0.09226437345002514
Root Mean Square: 0.025076262484281558
['% Employed in Service Industry']
['% Employed in Trade Industry']
R^2: 0.062302169185463274
Root Mean Square: 0.019726611314054857
['% Employed in Service Industry']
['% Employed in Info/Finance Industry']
R^2: 0.059999490940765066

In [128]:
# Pairwise fits inside the education group: every column is used as the
# single predictor for every column (itself included); R^2 and RMS are printed.
for predictor in education_new:
    for response in education_new:
        print(predictor, response, sep='\n')
        perform_linear(predictor, response)

['% Less than High School']
['% Less than High School']
R^2: 1.0
Root Mean Square: 6.445127265397921e-18
['% Less than High School']
['% High School Diploma']
R^2: 0.11146564019475358
Root Mean Square: 0.03528514189045454
['% Less than High School']
['% College Grad']
R^2: 0.3612563154186226
Root Mean Square: 0.03504533742503137
['% High School Diploma']
['% Less than High School']
R^2: 0.11146564019475369
Root Mean Square: 0.037210538840139265
['% High School Diploma']
['% High School Diploma']
R^2: 1.0
Root Mean Square: 0.0
['% High School Diploma']
['% College Grad']
R^2: 0.2972043256455723
Root Mean Square: 0.036760505611764145
['% College Grad']
['% Less than High School']
R^2: 0.3612563154186228
Root Mean Square: 0.03154947634402731
['% College Grad']
['% High School Diploma']
R^2: 0.2972043256455722
Root Mean Square: 0.031381181788180564
['% College Grad']
['% College Grad']
R^2: 1.0
Root Mean Square: 3.958760270690572e-17


In [129]:
# Pairwise fits inside the language group: every column is used as the
# single predictor for every column (itself included); R^2 and RMS are printed.
for predictor in language_new:
    for response in language_new:
        print(predictor, response, sep='\n')
        perform_linear(predictor, response)

['% No English Speaking Adults in HH']
['% No English Speaking Adults in HH']
R^2: 1.0
Root Mean Square: 2.608992264607233e-17
['% No English Speaking Adults in HH']
['% English Spoken in HH']
R^2: 0.9382110471589412
Root Mean Square: 0.01481727178959659
['% No English Speaking Adults in HH']
['% Spanish Spoken in HH']
R^2: 0.847161959385834
Root Mean Square: 0.01805990758291089
['% No English Speaking Adults in HH']
['% Chinese Spoken in HH']
R^2: 0.1855667365120549
Root Mean Square: 0.005146864646606655
['% No English Speaking Adults in HH']
['% Korean Spoken in HH']
R^2: 0.22425866956883744
Root Mean Square: 0.002072380635640463
['% No English Speaking Adults in HH']
['% Vietnamese Spoken in HH']
R^2: 0.028820710782052616
Root Mean Square: 0.0023187865169133712
['% No English Speaking Adults in HH']
['% Tagalog Spoken in HH']
R^2: 0.002202815624005927
Root Mean Square: 0.0013849559276427716
['% No English Speaking Adults in HH']
['% Russian Spoken in HH']
R^2: 0.14315744567374855
Ro

R^2: 0.06434104246279104
Root Mean Square: 0.044684672854796596
['% Other Language Spoken in HH']
['% Chinese Spoken in HH']
R^2: 0.159558672105102
Root Mean Square: 0.005228398674132208
['% Other Language Spoken in HH']
['% Korean Spoken in HH']
R^2: 0.028014268389811803
Root Mean Square: 0.0023197490475883546
['% Other Language Spoken in HH']
['% Vietnamese Spoken in HH']
R^2: 0.002652238556643205
Root Mean Square: 0.0023498187716127984
['% Other Language Spoken in HH']
['% Tagalog Spoken in HH']
R^2: 0.19479948798158409
Root Mean Square: 0.0012441330798523248
['% Other Language Spoken in HH']
['% Russian Spoken in HH']
R^2: 0.04774029805075086
Root Mean Square: 0.0013529835595662652
['% Other Language Spoken in HH']
['% Other Language Spoken in HH']
R^2: 1.0
Root Mean Square: 8.539931814369215e-18


## Run Linear Regression Between Every Potential Variable and the Uninsured Percentage as a Reference for Dropping Variables

In [130]:
# Define a function that runs a linear regression between the given columns
# of data and the uninsured percentage.
def perform_linear(cols, col2=None):
    """Regress a response on ``cols`` using the global ``states`` frame and print the fit.

    This definition replaces the earlier two-argument ``perform_linear`` in
    the kernel. The optional second parameter keeps those earlier
    ``perform_linear(group, other)`` cells runnable after this cell has been
    executed, instead of failing with a ``TypeError``.

    Parameters
    ----------
    cols : str or list of str
        Predictor column name(s). A single name is accepted and treated as a
        one-column predictor set.
    col2 : str or list of str, optional
        Response column name(s). When omitted, the response is the
        ``'Percent Uninsured'`` column.

    Returns
    -------
    None
        The R^2 score and the root-mean-square error of the in-sample
        predictions are printed, not returned.
    """
    # Selecting with a bare string yields a 1-D Series, which scikit-learn
    # does not accept as a feature matrix; wrap it so x is always 2-D.
    if isinstance(cols, str):
        cols = [cols]
    x = states[cols]
    y = states[['Percent Uninsured']] if col2 is None else states[col2]

    lr = LinearRegression().fit(x, y)
    y_pred = lr.predict(x)

    # Both metrics are computed on the same rows the model was fitted on.
    r2 = lr.score(x, y)
    rms = np.sqrt(mean_squared_error(y, y_pred))
    print('R^2:', r2)
    print('Root Mean Square:', rms)

In [131]:
# show every column label available in `states`
states.columns

Index(['State Name', 'Total Non-Elderly Population (Excluding Undocumented)',
       'Uninsured Population (Excluding Undocumented)', 'Percent Uninsured',
       '% HIU Income < 100% FPL', '% HIU Income 100-138% FPL',
       '% HIU Income 139-249% FPL', '% HIU Income 250-400% FPL',
       '% HIU Income 400% FPL', '% Age 0-18', '% Age 19-34', '% Age 35-49',
       '% Age 50-64', '% Male', '% Female', '% Married', '% Child in Family',
       '% Spanish/Hispanic/Latino Origin', '% White Non-Latino',
       '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander',
       '% American Indian / Alaska Native', '% Multi-racial or Other',
       '% SNAP Recipient', '% With a disability',
       '% Full-time Worker in Family', '% Employed in Agriculture Industry',
       '% Employed in Mining/Construction Industry',
       '% Employed in Manufacturing Industry', '% Employed in Trade Industry',
       '% Employed in Info/Finance Industry',
       '% Employed in Education/Health Industry',

In [132]:
# regress uninsured percentage on all income-group columns together;
# prints the R^2 and root-mean-square error of the in-sample fit
perform_linear(['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL', '% HIU Income 139-249% FPL', 
                 '% HIU Income 250-400% FPL', '% HIU Income 400% FPL'])

R^2: 0.4221985104899899
Root Mean Square: 0.027262717077655534


In [133]:
# regress uninsured percentage on all age-group columns together;
# prints the R^2 and root-mean-square error of the in-sample fit
perform_linear(['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64'])

R^2: 0.20333064378569965
Root Mean Square: 0.032012446399370634


In [134]:
# regress uninsured percentage on both gender columns together;
# prints the R^2 and root-mean-square error of the in-sample fit
# NOTE(review): '% Male' and '% Female' are presumably complementary shares,
# which would make the two predictors collinear — confirm against the data
perform_linear(['% Male', '% Female'])

R^2: 0.36443606826636854
Root Mean Square: 0.02859298060064303


In [135]:
# regress uninsured percentage on the married percentage alone;
# prints the R^2 and root-mean-square error of the in-sample fit
perform_linear(['% Married'])

R^2: 0.048810559182319024
Root Mean Square: 0.03497947753023497


In [136]:
# regress uninsured percentage on the percentage with a child in the family;
# prints the R^2 and root-mean-square error of the in-sample fit
perform_linear(['% Child in Family'])

R^2: 0.1466018640784159
Root Mean Square: 0.03313261034754693


In [137]:
# regress uninsured percentage on all ethnicity-group columns together;
# prints the R^2 and root-mean-square error of the in-sample fit
perform_linear(['% Spanish/Hispanic/Latino Origin', '% White Non-Latino',
       '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander',
       '% American Indian / Alaska Native', '% Multi-racial or Other'])

R^2: 0.16940226706339923
Root Mean Square: 0.0326870087134815


In [138]:
# regress uninsured percentage on the percentage with a full-time worker in the family;
# prints the R^2 and root-mean-square error of the in-sample fit
perform_linear(['% Full-time Worker in Family'])

R^2: 0.0421275294009793
Root Mean Square: 0.035102144846991


In [139]:
# regress uninsured percentage on all employment-industry columns together;
# prints the R^2 and root-mean-square error of the in-sample fit
perform_linear(['% Employed in Agriculture Industry',
       '% Employed in Mining/Construction Industry',
       '% Employed in Manufacturing Industry', '% Employed in Trade Industry',
       '% Employed in Info/Finance Industry',
       '% Employed in Education/Health Industry',
       '% Employed in Entertainment Industry',
       '% Employed in Service Industry',
       '% Employed in Military/Public Industry'])

R^2: 0.5391225253083505
Root Mean Square: 0.02434851894530684


In [140]:
# regress uninsured percentage on all education-level columns together;
# prints the R^2 and root-mean-square error of the in-sample fit
perform_linear(['% Less than High School',
       '% High School Diploma', '% College Grad'])

R^2: 0.3829031600071472
Root Mean Square: 0.02817451633504088


In [141]:
# regress uninsured percentage on all household-language columns together;
# prints the R^2 and root-mean-square error of the in-sample fit
perform_linear(['% No English Speaking Adults in HH', '% English Spoken in HH',
       '% Spanish Spoken in HH', '% Chinese Spoken in HH',
       '% Korean Spoken in HH', '% Vietnamese Spoken in HH',
       '% Tagalog Spoken in HH', '% Russian Spoken in HH',
       '% Other Language Spoken in HH'])

R^2: 0.48412591314096665
Root Mean Square: 0.025760344382349424


In [142]:
# fit uninsured percentage on each entry of income_new separately:
# print the entry, then the R^2 and root-mean-square error of its own regression
for col1 in income_new:
    print(col1)
    perform_linear(col1)

['% HIU Income < 100% FPL']
R^2: 0.20604379560063324
Root Mean Square: 0.03195788882100485
['% HIU Income 100-138% FPL']
R^2: 0.2296670146886267
Root Mean Square: 0.031478864297498015
['% HIU Income 139-249% FPL']
R^2: 0.016659372874680556
Root Mean Square: 0.03556573591409746
['% HIU Income 250-400% FPL']
R^2: 0.14376937319444383
Root Mean Square: 0.03318754957804502
['% HIU Income 400% FPL']
R^2: 0.40067513390574194
Root Mean Square: 0.02776584894420926


In [143]:
# fit uninsured percentage on each entry of age_new separately:
# print the entry, then the R^2 and root-mean-square error of its own regression
for col1 in age_new:
    print(col1)
    perform_linear(col1)

['% Age 0-18']
R^2: 0.025007581634850395
Root Mean Square: 0.03541444396380867
['% Age 19-34']
R^2: 0.07617578106720646
Root Mean Square: 0.034472634906504415
['% Age 35-49']
R^2: 0.037135734605451276
Root Mean Square: 0.03519349051990882
['% Age 50-64']
R^2: 0.008880584036934036
Root Mean Square: 0.03570613150223387


In [144]:
# fit uninsured percentage on each entry of origin_new separately:
# print the entry, then the R^2 and root-mean-square error of its own regression
for col1 in origin_new:
    print(col1)
    perform_linear(col1)

['% Spanish/Hispanic/Latino Origin']
R^2: 0.007680848836102205
Root Mean Square: 0.035727735834839694
['% White Non-Latino']
R^2: 0.00406657514322728
Root Mean Square: 0.0357927413567434
['% Black Non-Latino']
R^2: 0.0005099745304933956
Root Mean Square: 0.03585659453895806
['% Asian / Native-Hawaiian / Pac Islander']
R^2: 0.08771440360134886
Root Mean Square: 0.0342566757639419
['% American Indian / Alaska Native']
R^2: 0.06237377386318921
Root Mean Square: 0.034729192164349686
['% Multi-racial or Other']
R^2: 0.00940401407410818
Root Mean Square: 0.035696701695044845


In [145]:
# fit uninsured percentage on each entry of employment_new separately:
# print the entry, then the R^2 and root-mean-square error of its own regression
for col1 in employment_new:
    print(col1)
    perform_linear(col1)

['% Employed in Agriculture Industry']
R^2: 0.0008473308311959249
Root Mean Square: 0.03585054271819083
['% Employed in Mining/Construction Industry']
R^2: 0.08731865636197067
Root Mean Square: 0.03426410518899779
['% Employed in Manufacturing Industry']
R^2: 0.012038949630173712
Root Mean Square: 0.03564919436565927
['% Employed in Trade Industry']
R^2: 0.03092646476493588
Root Mean Square: 0.03530678515233211
['% Employed in Info/Finance Industry']
R^2: 0.24614344952145473
Root Mean Square: 0.031140398404683894
['% Employed in Education/Health Industry']
R^2: 0.16393293019668498
Root Mean Square: 0.03279445117169201
['% Employed in Entertainment Industry']
R^2: 0.030868231481453345
Root Mean Square: 0.03530784595890004
['% Employed in Service Industry']
R^2: 0.0007773266742802099
Root Mean Square: 0.03585179860387155
['% Employed in Military/Public Industry']
R^2: 0.019375764743367996
Root Mean Square: 0.03551657833585378


In [146]:
# fit uninsured percentage on each entry of education_new separately:
# print the entry, then the R^2 and root-mean-square error of its own regression
for col1 in education_new:
    print(col1)
    perform_linear(col1)

['% Less than High School']
R^2: 0.07186415405549973
Root Mean Square: 0.034552985757336216
['% High School Diploma']
R^2: 0.19006744683928445
Root Mean Square: 0.032277822977180654
['% College Grad']
R^2: 0.3603637066937655
Root Mean Square: 0.02868443877001768


In [147]:
# fit uninsured percentage on each entry of language_new separately:
# print the entry, then the R^2 and root-mean-square error of its own regression
for col1 in language_new:
    print(col1)
    perform_linear(col1)

['% No English Speaking Adults in HH']
R^2: 0.015077499325028332
Root Mean Square: 0.03559433123872854
['% English Spoken in HH']
R^2: 0.052885802790135106
Root Mean Square: 0.034904464657921616
['% Spanish Spoken in HH']
R^2: 6.280687325999068e-06
Root Mean Square: 0.03586562838151135
['% Chinese Spoken in HH']
R^2: 0.16944075846677276
Root Mean Square: 0.03268625131711238
['% Korean Spoken in HH']
R^2: 0.031085615270357803
Root Mean Square: 0.035303885824664136
['% Vietnamese Spoken in HH']
R^2: 0.011364994468412104
Root Mean Square: 0.035661351657948204
['% Tagalog Spoken in HH']
R^2: 0.027375215853498114
Root Mean Square: 0.035371418288289026
['% Russian Spoken in HH']
R^2: 0.027375215853498114
Root Mean Square: 0.035371418288289026
['% Other Language Spoken in HH']
R^2: 0.26010058182825735
Root Mean Square: 0.030850780159040886


# New Model after Removing Collinear Variables

In [148]:
# fit a linear regression of the uninsured percentage on the given columns,
# report the fit, and hand back the fitted coefficients
def get_coef_from_linear(cols):
    """Regress 'Percent Uninsured' on the selected columns of `states`.

    The model is fitted and evaluated on the same rows. The R^2, the square
    root of the mean squared error, and the intercept are printed.

    Parameters
    ----------
    cols : list of str
        Column labels of `states` used as predictors.

    Returns
    -------
    pandas.DataFrame
        One row per predictor; column 0 holds the predictor label and
        column 1 its fitted coefficient (the columns are left unnamed).
    """
    predictors = states[cols]
    target = states[['Percent Uninsured']]
    model = LinearRegression().fit(predictors, target)

    fit_r2 = model.score(predictors, target)
    fit_rmse = np.sqrt(mean_squared_error(target, model.predict(predictors)))
    print('R^2:', fit_r2)
    print('Root Mean Square:', fit_rmse)

    print('Intercept:', model.intercept_)

    # the target is selected as a one-column frame, so the coefficients sit in
    # the first row of coef_
    weights = model.coef_[0]
    rows = [[cols[pos], weight] for pos, weight in enumerate(weights)]
    return pd.DataFrame(data=rows)

In [149]:
# re-define the predictor groups after removing collinear variables

# income: '% HIU Income 250-400% FPL' is left out
income_removed = ['% HIU Income < 100% FPL', '% HIU Income 100-138% FPL',
                  '% HIU Income 139-249% FPL', '% HIU Income 400% FPL']
age_removed = ['% Age 0-18', '% Age 19-34', '% Age 35-49', '% Age 50-64']
# gender: '% Male' is left out
gender_removed = ['% Female']
origin_removed = ['% Spanish/Hispanic/Latino Origin', '% White Non-Latino',
                  '% Black Non-Latino', '% Asian / Native-Hawaiian / Pac Islander',
                  '% American Indian / Alaska Native', '% Multi-racial or Other']
employment_removed = ['% Employed in Agriculture Industry',
                      '% Employed in Mining/Construction Industry',
                      '% Employed in Manufacturing Industry',
                      '% Employed in Trade Industry',
                      '% Employed in Info/Finance Industry',
                      '% Employed in Education/Health Industry',
                      '% Employed in Entertainment Industry',
                      '% Employed in Service Industry',
                      '% Employed in Military/Public Industry']
education_removed = ['% Less than High School', '% High School Diploma', '% College Grad']
# language: '% No English Speaking Adults in HH' is left out
language_removed = ['% English Spoken in HH',
                    '% Spanish Spoken in HH', '% Chinese Spoken in HH',
                    '% Korean Spoken in HH', '% Vietnamese Spoken in HH',
                    '% Tagalog Spoken in HH', '% Russian Spoken in HH',
                    '% Other Language Spoken in HH']

# Collect the reduced groups (lists) together with the stand-alone columns
# (strings). This previously referenced the pre-removal names (income, age,
# gender, ...), so the list did not reflect the removals and depended on
# variables from earlier cells.
new_cols_removed = [income_removed, age_removed, gender_removed, '% Married',
                    origin_removed, '% SNAP Recipient', '% With a disability',
                    employment_removed, education_removed, language_removed]

In [150]:
# concatenate the retained groups and the stand-alone columns into one flat
# list of column labels, in the same order as the groups are listed
all_cols = (
    income_removed
    + age_removed
    + gender_removed
    + ['% Married']
    + origin_removed
    + ['% SNAP Recipient']
    + ['% With a disability']
    + employment_removed
    + education_removed
    + language_removed
)
all_cols

['% HIU Income < 100% FPL',
 '% HIU Income 100-138% FPL',
 '% HIU Income 139-249% FPL',
 '% HIU Income 400% FPL',
 '% Age 0-18',
 '% Age 19-34',
 '% Age 35-49',
 '% Age 50-64',
 '% Female',
 '% Married',
 '% Spanish/Hispanic/Latino Origin',
 '% White Non-Latino',
 '% Black Non-Latino',
 '% Asian / Native-Hawaiian / Pac Islander',
 '% American Indian / Alaska Native',
 '% Multi-racial or Other',
 '% SNAP Recipient',
 '% With a disability',
 '% Employed in Agriculture Industry',
 '% Employed in Mining/Construction Industry',
 '% Employed in Manufacturing Industry',
 '% Employed in Trade Industry',
 '% Employed in Info/Finance Industry',
 '% Employed in Education/Health Industry',
 '% Employed in Entertainment Industry',
 '% Employed in Service Industry',
 '% Employed in Military/Public Industry',
 '% Less than High School',
 '% High School Diploma',
 '% College Grad',
 '% English Spoken in HH',
 '% Spanish Spoken in HH',
 '% Chinese Spoken in HH',
 '% Korean Spoken in HH',
 '% Vietnamese

In [151]:
# fit the final multiple linear regression on every column in all_cols;
# `final` holds one row per predictor (column 0: label, column 1: coefficient)
final = get_coef_from_linear(all_cols)
final

R^2: 0.9648518258102219
Root Mean Square: 0.006724054009023251
Intercept: [1.07453501]


Unnamed: 0,0,1
0,% HIU Income < 100% FPL,0.210882
1,% HIU Income 100-138% FPL,0.250354
2,% HIU Income 139-249% FPL,0.115122
3,% HIU Income 400% FPL,0.192168
4,% Age 0-18,-0.921744
5,% Age 19-34,-0.766777
6,% Age 35-49,-0.46433
7,% Age 50-64,-0.963077
8,% Female,0.794246
9,% Married,-0.883973


In [152]:
# print every predictor label next to its fitted coefficient, one per line
for n in range(final.shape[0]):
    print(final.loc[n, 0], ':', final.loc[n, 1])

% HIU Income < 100% FPL : 0.21088199130521942
% HIU Income 100-138% FPL : 0.2503542393392726
% HIU Income 139-249% FPL : 0.11512221448955885
% HIU Income 400% FPL : 0.19216815332913123
% Age 0-18 : -0.9217438778016305
% Age 19-34 : -0.7667765625190943
% Age 35-49 : -0.46432986344149435
% Age 50-64 : -0.9630773744648873
% Female : 0.7942455539599091
% Married : -0.8839733624034881
% Spanish/Hispanic/Latino Origin : -0.08216709644578778
% White Non-Latino : -0.12226879228565299
% Black Non-Latino : -0.11828706174981518
% Asian / Native-Hawaiian / Pac Islander : -1.3067528404648332
% American Indian / Alaska Native : -0.20431598385302385
% Multi-racial or Other : 0.36951554568128536
% SNAP Recipient : -0.09012319197845818
% With a disability : -0.6616162201245839
% Employed in Agriculture Industry : 0.7151230268431186
% Employed in Mining/Construction Industry : 0.007897007005530296
% Employed in Manufacturing Industry : -0.5276062705491094
% Employed in Trade Industry : -0.09109316716182

In [153]:
# calculate the linear regression coefficients of the income group for visualization
get_coef_from_linear(income_removed)

R^2: 0.4217381755315335
Root Mean Square: 0.027273575030872167
Intercept: [-0.02001716]


Unnamed: 0,0,1
0,% HIU Income < 100% FPL,0.151392
1,% HIU Income 100-138% FPL,0.315114
2,% HIU Income 139-249% FPL,0.276208
3,% HIU Income 400% FPL,-0.228329


In [154]:
# calculate the linear regression coefficients of the age group for visualization
get_coef_from_linear(age_removed)

R^2: 0.20333064378569965
Root Mean Square: 0.032012446399370634
Intercept: [1.11330287]


Unnamed: 0,0,1
0,% Age 0-18,-0.87231
1,% Age 19-34,-1.259339
2,% Age 35-49,-0.553394
3,% Age 50-64,-1.277682


In [155]:
# calculate the linear regression coefficient of the gender group for visualization
get_coef_from_linear(gender_removed)

R^2: 0.36398023810200564
Root Mean Square: 0.028603232289183336
Intercept: [-0.17462981]


Unnamed: 0,0,1
0,% Female,0.628577


In [156]:
# calculate the linear regression coefficients of the ethnicity group for visualization
get_coef_from_linear(origin_removed)

R^2: 0.16940226706339923
Root Mean Square: 0.0326870087134815
Intercept: [0.28952575]


Unnamed: 0,0,1
0,% Spanish/Hispanic/Latino Origin,-0.16295
1,% White Non-Latino,-0.214496
2,% Black Non-Latino,-0.181228
3,% Asian / Native-Hawaiian / Pac Islander,-0.421414
4,% American Indian / Alaska Native,-0.106829
5,% Multi-racial or Other,-0.023982


In [157]:
# calculate the linear regression coefficients of the employment group for visualization
get_coef_from_linear(employment_removed)

R^2: 0.5391225253083505
Root Mean Square: 0.02434851894530684
Intercept: [0.33103727]


Unnamed: 0,0,1
0,% Employed in Agriculture Industry,-0.351561
1,% Employed in Mining/Construction Industry,0.18481
2,% Employed in Manufacturing Industry,-0.329209
3,% Employed in Trade Industry,-0.210847
4,% Employed in Info/Finance Industry,-0.906209
5,% Employed in Education/Health Industry,-0.879803
6,% Employed in Entertainment Industry,-0.087256
7,% Employed in Service Industry,0.299292
8,% Employed in Military/Public Industry,-0.24975


In [158]:
# calculate the linear regression coefficients of the education group for visualization
get_coef_from_linear(education_removed)

R^2: 0.3829031600071472
Root Mean Square: 0.02817451633504088
Intercept: [-0.48389246]


Unnamed: 0,0,1
0,% Less than High School,0.542682
1,% High School Diploma,0.686892
2,% College Grad,0.122309
