In [2440]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# warnings
import warnings
warnings.filterwarnings('ignore')

In [2441]:
main_df = pd.read_csv("/Users/JustinHolmes/Desktop/Staple Health/County/Concatenated_Data/US_County_All_Data_71719.csv",
                      header = 0,)


In [2442]:
main_df = maindf[["FIPS", "State", "County", "% with SNAP", 'Gini Index Estimate', 'Life Expectancy',
                 'Violent Crime Rate', 'Property Crime Rate', 'Percent Educated', '% Female', '% Non-Hispanic White',
                '% 65 and over', 'Household Income (In tens of thousands)', 'Mentally Unhealthy Days']]

In [2443]:
data = [main_df]
main_df.head()

Unnamed: 0,FIPS,State,County,% with SNAP,Gini Index Estimate,Life Expectancy,Violent Crime Rate,Property Crime Rate,Percent Educated,% Female,% Non-Hispanic White,% 65 and over,Household Income (In tens of thousands),Mentally Unhealthy Days
0,1001,Alabama,Autauga,12.798077,45.01,76.3,42.0,485.0,87.700302,51.3,74.5,15.1,58.343,4.3
1,1003,Alabama,Baldwin,8.898322,46.18,78.6,47.0,358.0,90.213394,51.5,83.0,19.9,56.607,4.2
2,1005,Alabama,Barbour,25.356281,46.22,75.8,56.0,445.0,73.093197,47.2,46.0,18.8,32.49,4.6
3,1007,Alabama,Bibb,14.84109,45.18,73.9,58.0,244.0,82.136326,46.5,74.3,16.0,45.795,4.3
4,1009,Alabama,Blount,11.523353,43.02,74.6,5.0,132.0,79.784674,50.7,86.9,17.8,48.253,4.7


### Fix NaN

In [2444]:
main_df.isna().sum()

FIPS                                        0
State                                       0
County                                      0
% with SNAP                                 0
Gini Index Estimate                         0
Life Expectancy                            69
Violent Crime Rate                          8
Property Crime Rate                         8
Percent Educated                            0
% Female                                    0
% Non-Hispanic White                        0
% 65 and over                               0
Household Income (In tens of thousands)     0
Mentally Unhealthy Days                     0
dtype: int64

In [2445]:
freq_life_exp = main_df["Life Expectancy"].median()
freq_life_exp

77.5

In [2446]:
freq_violent_crime = main_df["Violent Crime Rate"].median()
freq_violent_crime

84.0

In [2447]:
freq_prop_crime = main_df["Property Crime Rate"].median()
freq_prop_crime

317.0

In [2448]:
for dataset in data:
    dataset["Life Expectancy"] = dataset["Life Expectancy"].fillna(freq_life_exp)
    dataset["Violent Crime Rate"] = dataset["Violent Crime Rate"].fillna(freq_violent_crime)
    dataset["Property Crime Rate"] = dataset["Property Crime Rate"].fillna(freq_prop_crime)

In [2449]:
main_df.isna().sum()

FIPS                                       0
State                                      0
County                                     0
% with SNAP                                0
Gini Index Estimate                        0
Life Expectancy                            0
Violent Crime Rate                         0
Property Crime Rate                        0
Percent Educated                           0
% Female                                   0
% Non-Hispanic White                       0
% 65 and over                              0
Household Income (In tens of thousands)    0
Mentally Unhealthy Days                    0
dtype: int64

## Create dataframe with only columns we want to look at

In [2450]:
var_df = main_df.drop(["FIPS","State","County"], axis=1)
var_df.rename(columns={'Household Income (In tens of thousands)':'Household Income'}, inplace=True)
var_df.head()

Unnamed: 0,% with SNAP,Gini Index Estimate,Life Expectancy,Violent Crime Rate,Property Crime Rate,Percent Educated,% Female,% Non-Hispanic White,% 65 and over,Household Income,Mentally Unhealthy Days
0,12.798077,45.01,76.3,42.0,485.0,87.700302,51.3,74.5,15.1,58.343,4.3
1,8.898322,46.18,78.6,47.0,358.0,90.213394,51.5,83.0,19.9,56.607,4.2
2,25.356281,46.22,75.8,56.0,445.0,73.093197,47.2,46.0,18.8,32.49,4.6
3,14.84109,45.18,73.9,58.0,244.0,82.136326,46.5,74.3,16.0,45.795,4.3
4,11.523353,43.02,74.6,5.0,132.0,79.784674,50.7,86.9,17.8,48.253,4.7


In [2451]:
var_df["Violent Crime Rate"] = var_df["Violent Crime Rate"].astype(int)
var_df["Property Crime Rate"] = var_df["Property Crime Rate"].astype(int)
var_df["% with SNAP"] = round(var_df["% with SNAP"], 1)
var_df["Percent Educated"] = round(var_df["Percent Educated"], 1)
var_df["Household Income"] = round(var_df["Household Income"], 3)


In [2452]:
var_df.head()

Unnamed: 0,% with SNAP,Gini Index Estimate,Life Expectancy,Violent Crime Rate,Property Crime Rate,Percent Educated,% Female,% Non-Hispanic White,% 65 and over,Household Income,Mentally Unhealthy Days
0,12.8,45.01,76.3,42,485,87.7,51.3,74.5,15.1,58.343,4.3
1,8.9,46.18,78.6,47,358,90.2,51.5,83.0,19.9,56.607,4.2
2,25.4,46.22,75.8,56,445,73.1,47.2,46.0,18.8,32.49,4.6
3,14.8,45.18,73.9,58,244,82.1,46.5,74.3,16.0,45.795,4.3
4,11.5,43.02,74.6,5,132,79.8,50.7,86.9,17.8,48.253,4.7


## Create bands to categorize data for analysis

In [2453]:
var_df["MHBand"] = pd.qcut(var_df["Mentally Unhealthy Days"], 4)

In [2454]:
var_df[["% with SNAP", "MHBand"]].groupby(["MHBand"],
        as_index=False).mean().sort_values(by='% with SNAP', ascending=True)


Unnamed: 0,MHBand,% with SNAP
0,"(2.399, 3.5]",8.178186
1,"(3.5, 3.9]",12.791086
2,"(3.9, 4.3]",16.24974
3,"(4.3, 6.0]",19.750575


In [2455]:
data = [var_df]
var_df.head()

Unnamed: 0,% with SNAP,Gini Index Estimate,Life Expectancy,Violent Crime Rate,Property Crime Rate,Percent Educated,% Female,% Non-Hispanic White,% 65 and over,Household Income,Mentally Unhealthy Days,MHBand
0,12.8,45.01,76.3,42,485,87.7,51.3,74.5,15.1,58.343,4.3,"(3.9, 4.3]"
1,8.9,46.18,78.6,47,358,90.2,51.5,83.0,19.9,56.607,4.2,"(3.9, 4.3]"
2,25.4,46.22,75.8,56,445,73.1,47.2,46.0,18.8,32.49,4.6,"(4.3, 6.0]"
3,14.8,45.18,73.9,58,244,82.1,46.5,74.3,16.0,45.795,4.3,"(3.9, 4.3]"
4,11.5,43.02,74.6,5,132,79.8,50.7,86.9,17.8,48.253,4.7,"(4.3, 6.0]"


In [2456]:
for dataset in data:
    dataset.loc[dataset['Mentally Unhealthy Days'] <= 3.5, 'MHCat'] = 0
    dataset.loc[(dataset['Mentally Unhealthy Days'] > 3.5) & (dataset['Mentally Unhealthy Days'] <= 3.9), 'MHCat'] = 1
    dataset.loc[(dataset['Mentally Unhealthy Days'] > 3.9) & (dataset['Mentally Unhealthy Days'] <= 4.3), 'MHCat'] = 2
    dataset.loc[ dataset['Mentally Unhealthy Days'] > 4.3, 'MHCat'] = 3
    
    
var_df.head()

Unnamed: 0,% with SNAP,Gini Index Estimate,Life Expectancy,Violent Crime Rate,Property Crime Rate,Percent Educated,% Female,% Non-Hispanic White,% 65 and over,Household Income,Mentally Unhealthy Days,MHBand,MHCat
0,12.8,45.01,76.3,42,485,87.7,51.3,74.5,15.1,58.343,4.3,"(3.9, 4.3]",2.0
1,8.9,46.18,78.6,47,358,90.2,51.5,83.0,19.9,56.607,4.2,"(3.9, 4.3]",2.0
2,25.4,46.22,75.8,56,445,73.1,47.2,46.0,18.8,32.49,4.6,"(4.3, 6.0]",3.0
3,14.8,45.18,73.9,58,244,82.1,46.5,74.3,16.0,45.795,4.3,"(3.9, 4.3]",2.0
4,11.5,43.02,74.6,5,132,79.8,50.7,86.9,17.8,48.253,4.7,"(4.3, 6.0]",3.0


In [2457]:
var_df["MHCat"] = var_df["MHCat"].astype(int)
var_df = var_df.drop(["MHBand"], axis=1)
data = [var_df]
var_df.head()

Unnamed: 0,% with SNAP,Gini Index Estimate,Life Expectancy,Violent Crime Rate,Property Crime Rate,Percent Educated,% Female,% Non-Hispanic White,% 65 and over,Household Income,Mentally Unhealthy Days,MHCat
0,12.8,45.01,76.3,42,485,87.7,51.3,74.5,15.1,58.343,4.3,2
1,8.9,46.18,78.6,47,358,90.2,51.5,83.0,19.9,56.607,4.2,2
2,25.4,46.22,75.8,56,445,73.1,47.2,46.0,18.8,32.49,4.6,3
3,14.8,45.18,73.9,58,244,82.1,46.5,74.3,16.0,45.795,4.3,2
4,11.5,43.02,74.6,5,132,79.8,50.7,86.9,17.8,48.253,4.7,3


In [2458]:
var_df["SnapBand"] = pd.qcut(var_df["% with SNAP"], 4)
var_df[["SnapBand", "Mentally Unhealthy Days"]].groupby(["SnapBand"],
        as_index=False).mean().sort_values(by='Mentally Unhealthy Days', ascending=True)

Unnamed: 0,SnapBand,Mentally Unhealthy Days
0,"(-0.001, 9.3]",3.329077
1,"(9.3, 13.3]",3.802375
2,"(13.3, 17.9]",4.132812
3,"(17.9, 56.7]",4.474968


In [2459]:
for dataset in data:
    dataset.loc[dataset['% with SNAP'] <= 9.3, 'SnapCat'] = 0
    dataset.loc[(dataset['% with SNAP'] > 9.3) & (dataset['% with SNAP'] <= 13.3), 'SnapCat'] = 1
    dataset.loc[(dataset['% with SNAP'] > 13.3) & (dataset['% with SNAP'] <= 17.9), 'SnapCat'] = 2
    dataset.loc[dataset['% with SNAP'] > 17.9,'SnapCat'] = 3
    
var_df.head()

Unnamed: 0,% with SNAP,Gini Index Estimate,Life Expectancy,Violent Crime Rate,Property Crime Rate,Percent Educated,% Female,% Non-Hispanic White,% 65 and over,Household Income,Mentally Unhealthy Days,MHCat,SnapBand,SnapCat
0,12.8,45.01,76.3,42,485,87.7,51.3,74.5,15.1,58.343,4.3,2,"(9.3, 13.3]",1.0
1,8.9,46.18,78.6,47,358,90.2,51.5,83.0,19.9,56.607,4.2,2,"(-0.001, 9.3]",0.0
2,25.4,46.22,75.8,56,445,73.1,47.2,46.0,18.8,32.49,4.6,3,"(17.9, 56.7]",3.0
3,14.8,45.18,73.9,58,244,82.1,46.5,74.3,16.0,45.795,4.3,2,"(13.3, 17.9]",2.0
4,11.5,43.02,74.6,5,132,79.8,50.7,86.9,17.8,48.253,4.7,3,"(9.3, 13.3]",1.0


In [2460]:
var_df["SnapCat"] = var_df["SnapCat"].astype(int)
var_df = var_df.drop(["SnapBand"], axis=1)
data = [var_df]
var_df.head()

Unnamed: 0,% with SNAP,Gini Index Estimate,Life Expectancy,Violent Crime Rate,Property Crime Rate,Percent Educated,% Female,% Non-Hispanic White,% 65 and over,Household Income,Mentally Unhealthy Days,MHCat,SnapCat
0,12.8,45.01,76.3,42,485,87.7,51.3,74.5,15.1,58.343,4.3,2,1
1,8.9,46.18,78.6,47,358,90.2,51.5,83.0,19.9,56.607,4.2,2,0
2,25.4,46.22,75.8,56,445,73.1,47.2,46.0,18.8,32.49,4.6,3,3
3,14.8,45.18,73.9,58,244,82.1,46.5,74.3,16.0,45.795,4.3,2,2
4,11.5,43.02,74.6,5,132,79.8,50.7,86.9,17.8,48.253,4.7,3,1


In [2462]:
# var_df["GiniBand"] = pd.qcut(var_df["Gini Index Estimate"], 4)
# var_df[["GiniBand", "Mentally Unhealthy Days"]].groupby(["GiniBand"],
#         as_index=False).mean().sort_values(by='GiniBand', ascending=True)

In [2463]:
# var_df["LifeBand"] = pd.qcut(var_df["Life Expectancy"], 4)
# var_df[["LifeBand", "Mentally Unhealthy Days"]].groupby(["LifeBand"],
#         as_index=False).mean().sort_values(by='LifeBand', ascending=True)

In [2464]:
# var_df["VCrimeBand"] = pd.qcut(var_df["Violent Crime Rate"], 4)
# var_df[["VCrimeBand", "Mentally Unhealthy Days"]].groupby(["VCrimeBand"],
#         as_index=False).mean().sort_values(by='VCrimeBand', ascending=True)

In [2465]:
# var_df["PCrimeBand"] = pd.qcut(var_df["Property Crime Rate"], 4)
# var_df[["PCrimeBand", "Mentally Unhealthy Days"]].groupby(["PCrimeBand"],
#         as_index=False).mean().sort_values(by='PCrimeBand', ascending=True)

In [2466]:
# var_df["EducatedBand"] = pd.qcut(var_df["Percent Educated"], 4)
# var_df[["EducatedBand", "Mentally Unhealthy Days"]].groupby(["EducatedBand"],
#         as_index=False).mean().sort_values(by='EducatedBand', ascending=True)

In [2467]:
# var_df["IncomeBand"] = pd.qcut(var_df["Household Income"], 4)
# var_df[["IncomeBand", "Mentally Unhealthy Days"]].groupby(["IncomeBand"],
#         as_index=False).mean().sort_values(by='IncomeBand', ascending=True)

In [2468]:
# var_df["WhiteBand"] = pd.qcut(var_df["% Non-Hispanic White"], 4)
# var_df[["WhiteBand", "Mentally Unhealthy Days"]].groupby(["WhiteBand"],
#         as_index=False).mean().sort_values(by='WhiteBand', ascending=True)

## Export dataframe to be uploaded to BigML

In [None]:
# exported_df = var_df.drop(["SnapCat", 'Mentally Unhealthy Days'], axis=1)
# exported_df.to_csv("/Users/JustinHolmes/Desktop/Staple Health/County/BigML_Data/MentalHealth.csv",  index=False)

## Check Correlations

In [2469]:
corr_df = pd.DataFrame()

clmn = list(var_df)

x_vars = [x for x in clmn if x != "MHCat" and x != "Mentally Unhealthy Days"]

corr_df["Variables"] = x_vars

corr_vals = []
for item in x_vars:
    corr_vals.append(var_df[item].corr(var_df['Mentally Unhealthy Days']))
corr_df["Correlation with Mentally Unhealthy Days"] = corr_vals

corr_df

Unnamed: 0,Variables,Correlation with Mentally Unhealthy Days
0,% with SNAP,0.690689
1,Gini Index Estimate,0.320514
2,Life Expectancy,-0.625053
3,Violent Crime Rate,0.213575
4,Property Crime Rate,0.211405
5,Percent Educated,-0.481003
6,% Female,0.168825
7,% Non-Hispanic White,-0.140397
8,% 65 and over,-0.021238
9,Household Income,-0.556586


## Observations

##### As SNAP increases, so does poor mental health
##### As Gini Index Increases, so does poor mental health
##### As education increases, poor mental health decreases
##### As household income increases, poor mental health decreases
##### Controls do not see significant change in mental health

## Split the data into training and testing sets

In [2470]:
#®X = var_df[["Percent Educated", "% Female","% Non-Hispanic White",'% 65 and over', 'Household Income','Percent Educated']]
X = var_df[x_vars]
y = var_df["MHCat"]

In [2471]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = None)

### Check to see if split worked

In [2472]:
# X_train.info()

In [2473]:
# X_test.info()

In [2474]:
#X_train.head()

In [2475]:
#y_train.head()

In [2476]:
#X_test.head()

### Normalize Data

In [2477]:
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)

# Creating Models

### Linear Regression

In [2478]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)
# y_pred = linreg.predict(X_test)
acc_lin = round(linreg.score(X_test, y_test) * 100, 2)
acc_lin

53.91

### Logistic Regression

In [2479]:
logreg = LogisticRegression(solver = 'liblinear')
logreg.fit(X_train, y_train)
# y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_test, y_test) * 100, 2)
acc_log

53.26

### Support Vector Machines

In [2480]:
svc = SVC()
svc.fit(X_train, y_train)
# y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_test, y_test) * 100, 2)
acc_svc

27.66

### k-Nearest Neighbors

In [2481]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
# y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_test, y_test) * 100, 2)
acc_knn

42.45

### Gaussian Naive Bayes

In [2482]:
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
# y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_test, y_test) * 100, 2)
acc_gaussian

50.4

### Perceptron

In [2483]:
perceptron = Perceptron()
perceptron.fit(X_train, y_train)
# y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_test, y_test) * 100, 2)
acc_perceptron

31.48

### Linear SVC

In [2484]:
linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
# y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_test, y_test) * 100, 2)
acc_linear_svc

25.76

### Stochastic Gradient Descent

In [2485]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
# y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_test, y_test) * 100, 2)
acc_sgd

34.98

### Decision Tree

In [2486]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
# y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_test, y_test) * 100, 2)
acc_decision_tree

51.99

### Random Forest

In [2487]:
random_forest = RandomForestClassifier(n_estimators=500)
random_forest.fit(X_train, y_train)
# y_pred = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_test, y_test) * 100, 2)
acc_random_forest

58.51

## Model Evaluation

In [2488]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree', "Linear Regression"],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree, acc_lin]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,58.51
9,Linear Regression,53.91
2,Logistic Regression,53.26
8,Decision Tree,51.99
4,Naive Bayes,50.4
1,KNN,42.45
6,Stochastic Gradient Decent,34.98
5,Perceptron,31.48
0,Support Vector Machines,27.66
7,Linear SVC,25.76
