In [51]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler


In [3]:
df = pd.read_csv("./data/PLACES__County_Data__GIS_Friendly_Format___2023_release_20231107_filtered.csv")

filtered_col_names = df.columns

wo_fl = df[df["StateDesc"] != "Florida"]
clean_df = wo_fl[wo_fl["StateDesc"] != "District of Columbia"]

extra_data = pd.read_csv('./data/MainSheet-Table 1.csv')

In [42]:
# Function to calculate weighted average
def weighted_average(group):
    weights = group['TotalPopulation']
    exclude_cols = ['StateAbbr', 'StateDesc', 'CountyName', 'CountyFIPS', 'Geolocation']
    weighted_values = group.drop(exclude_cols, axis=1).multiply(weights, axis=0)
    weighted_sum = weighted_values.sum()
    total_weights = weights.sum()
    return weighted_sum / total_weights

# Grouping by 'StateDesc' and calculating weighted averages for other columns
state_df = clean_df.groupby('StateDesc').apply(weighted_average).reset_index()

merged_data = pd.merge(state_df, extra_data, on='StateDesc')

In [43]:
new_column_names = {
    'ACCESS2_AdjPrev': 'Health Insurance', 
    'ARTHRITIS_AdjPrev': 'Arthritis', 
    'BINGE_AdjPrev': 'Binge Drinking',
    'BPHIGH_AdjPrev': 'High Blood Pressure',
    'BPMED_AdjPrev': 'Blood Pressure Medication',
    'CANCER_AdjPrev': 'Cancer',
    'CASTHMA_AdjPrev': 'Asthma',
    'CERVICAL_AdjPrev': 'Cervical Cancer Screeening',
    'CHD_AdjPrev': 'Coronary Heart Disease',
    'CHECKUP_AdjPrev': 'Checkup Last Year',
    'CHOLSCREEN_AdjPrev': 'Cholesterol Screening',
    'COLON_SCREEN_AdjPrev': 'Colorectal Cancer Screening',
    'COPD_AdjPrev': 'Chronic Obstructive Pulmonary Disease',
    'CSMOKING_AdjPrev': 'Smoking',
    'DENTAL_AdjPrev': 'Dental Visits',
    'DEPRESSION_AdjPrev': 'Depression',
    'DIABETES_AdjPrev': 'Diabetes',
    'GHLTH_AdjPrev': 'Poor Self Rated Health Status',
    'HIGHCHOL_AdjPrev': 'High Cholesterol',
    'KIDNEY_AdjPrev': 'Chronic Kidney Disease',
    'LPA_AdjPrev':'No leisure-time Physical Activity',
    'MAMMOUSE_AdjPrev': 'Mammography',
    'MHLTH_AdjPrev': 'Bad Mental Health',
    'OBESITY_AdjPrev': 'Obesity',
    'PHLTH_AdjPrev': 'Bad Physical Health',
    'SLEEP_AdjPrev': 'Sleep < 7h',
    'STROKE_AdjPrev': 'Stroke',
    'TEETHLOST_AdjPrev': 'All Teeth Lost',
    'HEARING_AdjPrev': 'Hearing Disability',
    'VISION_AdjPrev': 'Vision Disability',
    'COGNITION_AdjPrev': 'Cognative Disability',
    'MOBILITY_AdjPrev': 'Mobility Disability',
    'SELFCARE_AdjPrev': 'Self-care Disability',
    'INDEPLIVE_AdjPrev': 'Independent Living Disability',
    'DISABILITY_AdjPrev': 'Disability',
    'workout': 'Excercising',
    'mcdonalds': 'McDonalds',
    'burgerking': 'Burger King',
    'tacobell': 'Taco Bell', 
    'dominos': 'Dominos',
    'kfc': 'KFC',
    'gdp_22': 'GDP'
    }

renamed_df = merged_data.rename(columns=new_column_names)

In [44]:
renamed_df.head(10)

Unnamed: 0,StateDesc,TotalPopulation,Health Insurance,Arthritis,Binge Drinking,High Blood Pressure,Blood Pressure Medication,Cancer,Asthma,Cervical Cancer Screeening,...,Disability,Excercising,all,McDonalds,Burger King,Taco Bell,Dominos,KFC,mc_tb_do_kc,GDP
0,Alabama,234423.0,11.605011,27.987731,15.171591,37.711593,65.58056,6.202861,10.459797,84.318671,...,32.117277,19.3,81.7,5.0,3.5,2.8,2.5,1.8,15.6,277817500000.0
1,Alaska,150635.3,11.277589,21.815086,18.469314,28.930311,51.505752,6.117068,9.611573,78.251772,...,28.264936,27.9,61.9,4.2,1.1,2.1,1.4,0.8,9.6,63618000000.0
2,Arizona,2987373.0,13.561306,21.739753,17.378782,27.967238,54.445203,6.147776,10.221121,80.350514,...,28.27399,26.3,67.9,3.9,2.5,2.5,1.6,0.8,11.3,458949800000.0
3,Arkansas,141964.8,10.92048,26.209824,14.524275,36.690945,61.507253,6.24997,9.872488,81.238643,...,32.925307,15.7,69.9,5.7,2.7,3.5,2.2,2.0,16.1,165220600000.0
4,California,3696563.0,11.306139,18.789892,15.71359,26.952868,53.577848,5.410313,9.238837,81.245688,...,26.039552,24.0,82.3,3.2,1.4,2.1,1.4,1.1,9.2,3598103000000.0
5,Colorado,458241.8,11.609793,22.267849,19.362209,24.6995,52.45679,6.101291,10.168817,83.398796,...,23.168382,32.5,75.7,3.6,2.0,2.6,2.3,1.2,11.7,484371500000.0
6,Connecticut,732540.9,8.08182,21.182967,16.123081,27.601684,58.596159,6.095224,10.838628,86.281854,...,23.169974,24.5,76.1,4.0,1.8,1.4,1.5,1.0,9.7,321844600000.0
7,Delaware,420607.2,9.827574,22.317041,15.449247,31.512724,60.944992,6.078404,10.204954,82.110843,...,25.92306,20.1,78.3,3.7,2.1,1.5,2.7,1.1,11.1,87524800000.0
8,Georgia,383339.2,15.917432,23.569418,15.183028,35.656997,62.452882,5.861034,10.019702,82.828554,...,29.148328,20.2,82.5,4.2,2.6,2.3,2.1,1.5,12.7,755697900000.0
9,Hawaii,745942.5,5.884432,17.781767,18.651305,25.83797,56.479427,5.212355,8.771138,79.655093,...,22.280322,24.4,97.5,5.2,2.0,2.1,1.6,1.1,12.0,98218800000.0


In [57]:
# Splitting the data into training and testing sets (80% train, 20% test)
data = renamed_df.drop(["StateDesc"], axis=1)

#X = data.drop(["Obesity"], axis=1)
y = data["Obesity"]

X = data[["TotalPopulation", "GDP", "Excercising", "McDonalds", "Burger King", "Binge Drinking", "Health Insurance", "Smoking"]]

#  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

columns_to_normalize = ['GDP', 'TotalPopulation']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the selected columns
X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])

# Creating a linear regression model
model = LinearRegression()

# Fitting the model with the training data
model.fit(X_train.drop(["TotalPopulation"], axis=1), y_train, sample_weight=X_train["TotalPopulation"])

X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])


y_pred = model.predict(X_test.drop(["TotalPopulation"], axis=1))

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 8.994670012178682
R-squared (R2): 0.1879802855245113


In [58]:
model.coef_

array([ 0.52475443, -0.04648743,  1.28710788,  0.66874474, -0.53694662,
        0.22423256,  0.83542443])

In [65]:
# Splitting the data into training and testing sets (80% train, 20% test)
new_column_names = {
        'BINGE_AdjPrev': 'Binge Drinking',
        'ACCESS2_AdjPrev': 'Health Insurance', 
        'GHLTH_AdjPrev': 'Poor Self Rated Health Status',
        'OBESITY_AdjPrev': 'Obesity',
        'MAMMOUSE_AdjPrev': 'Mammography',
        'CSMOKING_AdjPrev': 'Smoking',
        'DENTAL_AdjPrev': 'Dental Visits',
}

clean_df_renamed = clean_df.rename(columns=new_column_names)

data = clean_df_renamed

#X = data.drop(["Obesity"], axis=1)
y = data["Obesity"]

X = data[["TotalPopulation", "Binge Drinking", "Health Insurance", "Smoking", 'Poor Self Rated Health Status', 'Mammography', 'Dental Visits']]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

columns_to_normalize = ['TotalPopulation']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the selected columns
X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])

# Creating a linear regression model
model = LinearRegression()

# Fitting the model with the training data
model.fit(X_train.drop(["TotalPopulation"], axis=1), y_train, sample_weight=X_train["TotalPopulation"])

X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])


y_pred = model.predict(X_test.drop(["TotalPopulation"], axis=1))

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

model.coef_

Mean Squared Error (MSE): 12.749771411184357
R-squared (R2): 0.3906111504530343


array([-0.06798716,  0.3116351 ,  1.07786225,  0.17421921,  0.05294744,
        0.16113867])