In [18]:
# Import Dependencies
import pandas as pd
import sqlite3
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px

## Connect to Database, query data with a join.

In [19]:
# Connect to SQLite dB
con = sqlite3.connect('/Users/jennadodge/uofo-virt-data-pt-12-2021-u-b/Water_Quality_Analysis/Database/database.sqlite3')

# Create a cursor object
cur = con.cursor()

In [20]:
# Retrieve data

df = pd.read_sql_query("SELECT * FROM Census_Data INNER JOIN Contaminant_Summary on Census_Data.county_FIPS = Contaminant_Summary.county_FIPS",con)
df.head()

Unnamed: 0,county_FIPS,Geographic_Area_Name,County,GEOID,Total_Population,White,%_White,Black,%_Black,Native,...,Simpson_Ethnic_DI,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,county_FIPS.1,Num_Contaminants,Sum_ContaminantFactor,Min_Contaminant_Factor,Max_Contaminant_Factor,Avg_Contaminant_Factor
0,1001,"Autauga County, Alabama",Autauga County,0500000US01001,58805,42160,0.716946,11446,0.194643,218,...,0.06941,0.986667,0.155018,0.4552,1001,6,551,27,148,91.83
1,1003,"Baldwin County, Alabama",Baldwin County,0500000US01003,231767,189399,0.817196,18218,0.078605,1583,...,0.10348,0.868861,0.212231,0.4566,1003,9,1625,28,580,180.56
2,1005,"Barbour County, Alabama",Barbour County,0500000US01005,25223,11317,0.448678,11934,0.47314,117,...,0.112569,1.164057,0.226599,0.5047,1005,3,1414,138,1132,471.33
3,1007,"Bibb County, Alabama",Bibb County,0500000US01007,22293,16555,0.74261,4414,0.197999,61,...,0.064188,0.879973,0.145676,0.45,1007,9,2538,17,1895,282.0
4,1009,"Blount County, Alabama",Blount County,0500000US01009,59134,50663,0.856749,846,0.014306,338,...,0.176138,0.808008,0.319759,0.4685,1009,30,37191,18,5219,1239.7


In [21]:
df.shape

(443, 36)

In [22]:
con.close()

## Data Pre-processing for ML

In [23]:
df.columns

Index(['county_FIPS', 'Geographic_Area_Name', 'County', 'GEOID',
       'Total_Population', 'White', '%_White', 'Black', '%_Black', 'Native',
       '%_Native', 'Asian', '%_Asian', 'Pacific_Islander',
       '%_Pacific_Islander', 'Other', '%_Other', '2+_Races', '%_2+_Races',
       'Non-White', '%_Non-White', 'Hispanic', '%_Hispanic', 'Not_Hispanic',
       '%_Not_Hispanic', 'Simpson_Race_DI', 'Simpson_Ethnic_DI',
       'Shannon_Race_DI', 'Shannon_Ethnic_DI', 'Gini_Index', 'county_FIPS',
       'Num_Contaminants', 'Sum_ContaminantFactor', 'Min_Contaminant_Factor',
       'Max_Contaminant_Factor', 'Avg_Contaminant_Factor'],
      dtype='object')

In [24]:
# Drop duplicate county_FIPS column
df = df[['county_FIPS', 'Geographic_Area_Name', 'County', 'GEOID',
       'Total_Population', 'White', '%_White', 'Black', '%_Black', 'Native',
       '%_Native', 'Asian', '%_Asian', 'Pacific_Islander',
       '%_Pacific_Islander', 'Other', '%_Other', '2+_Races', '%_2+_Races',
       'Non-White', '%_Non-White', 'Hispanic', '%_Hispanic', 'Not_Hispanic',
       '%_Not_Hispanic', 'Simpson_Race_DI', 'Simpson_Ethnic_DI',
       'Shannon_Race_DI', 'Shannon_Ethnic_DI', 'Gini_Index',
       'Num_Contaminants', 'Sum_ContaminantFactor', 'Min_Contaminant_Factor',
       'Max_Contaminant_Factor', 'Avg_Contaminant_Factor']]

In [36]:
df2 = df.loc[:,['Simpson_Race_DI', 'Simpson_Ethnic_DI',
       'Shannon_Race_DI', 'Shannon_Ethnic_DI', 'Gini_Index','Sum_ContaminantFactor']]
df2.head()

Unnamed: 0,Simpson_Race_DI,Simpson_Ethnic_DI,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,Sum_ContaminantFactor
0,0.444747,0.06941,0.986667,0.155018,0.4552,551
1,0.321154,0.10348,0.868861,0.212231,0.4566,1625
2,0.572334,0.112569,1.164057,0.226599,0.5047,1414
3,0.407736,0.064188,0.879973,0.145676,0.45,2538
4,0.258546,0.176138,0.808008,0.319759,0.4685,37191


In [37]:
# Check datatypes to ensure they are numeric
df2.dtypes

Simpson_Race_DI          float64
Simpson_Ethnic_DI        float64
Shannon_Race_DI          float64
Shannon_Ethnic_DI        float64
Gini_Index               float64
Sum_ContaminantFactor      int64
dtype: object

In [38]:
df2.describe()

Unnamed: 0,Simpson_Race_DI,Simpson_Ethnic_DI,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,Sum_ContaminantFactor
count,443.0,443.0,443.0,443.0,443.0,443.0
mean,0.240718,0.073029,0.640573,0.151925,0.442437,14412.09
std,0.143954,0.065864,0.303843,0.101345,0.032996,107169.8
min,0.063408,0.011866,0.202798,0.036514,0.3348,0.0
25%,0.128125,0.032328,0.400065,0.083808,0.4189,2040.5
50%,0.186505,0.049063,0.550279,0.117508,0.4384,5163.0
75%,0.32022,0.089167,0.845691,0.188893,0.46335,12114.5
max,0.715154,0.441067,1.803719,0.632997,0.5509,2215481.0


In [40]:
# Transform data
# data_scaled = StandardScaler().fit_transform(df2)
# print(data_scaled[0:5])

# df2['Total_Population'] = df2['Total_Population']/100_000
df2['Sum_ContaminantFactor'] = df2['Sum_ContaminantFactor']/10_000
df2.head() 


Unnamed: 0,Simpson_Race_DI,Simpson_Ethnic_DI,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,Sum_ContaminantFactor
0,0.444747,0.06941,0.986667,0.155018,0.4552,0.0551
1,0.321154,0.10348,0.868861,0.212231,0.4566,0.1625
2,0.572334,0.112569,1.164057,0.226599,0.5047,0.1414
3,0.407736,0.064188,0.879973,0.145676,0.45,0.2538
4,0.258546,0.176138,0.808008,0.319759,0.4685,3.7191


# K means

In [41]:
# Initializing model with K = 3 
model = KMeans(n_clusters=3, random_state=13)
model

KMeans(n_clusters=3, random_state=13)

In [42]:
# Fitting model
model.fit(df2)

KMeans(n_clusters=3, random_state=13)

In [43]:
# Get predictions
predictions = model.predict(df2)
print(predictions)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [44]:
# Add a new class column to df_iris
df2["class"] = model.labels_
df2.head()

Unnamed: 0,Simpson_Race_DI,Simpson_Ethnic_DI,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,Sum_ContaminantFactor,class
0,0.444747,0.06941,0.986667,0.155018,0.4552,0.0551,1
1,0.321154,0.10348,0.868861,0.212231,0.4566,0.1625,1
2,0.572334,0.112569,1.164057,0.226599,0.5047,0.1414,1
3,0.407736,0.064188,0.879973,0.145676,0.45,0.2538,1
4,0.258546,0.176138,0.808008,0.319759,0.4685,3.7191,1


In [45]:
df2.hvplot.scatter(x="Simpson_Race_DI", y="Gini_Index", by="class")

In [None]:
# PCA

In [46]:
from sklearn.decomposition import PCA

In [47]:
# Transform data
data_scaled = StandardScaler().fit_transform(df2)
print(data_scaled[0:5])

[[ 1.41892358 -0.05501191  1.14034267  0.03054724  0.38723192 -0.12948393
   0.        ]
 [ 0.5593959   0.46285861  0.75218375  0.59572426  0.42970976 -0.11945112
   0.        ]
 [ 2.30622853  0.60099831  1.72482291  0.73765679  1.88912713 -0.12142219
   0.        ]
 [ 1.16153035 -0.13438746  0.78879673 -0.0617325   0.22945707 -0.1109223
   0.        ]
 [ 0.12398314  1.56725567  0.55167888  1.65794185  0.79077144  0.21278999
   0.        ]]


In [49]:
# Applying PCA to reduce dimensions from 4 to 2

# Initialize PCA model
pca = PCA(n_components=2)

# Get two principal components for the data.
data_pca = pca.fit_transform(data_scaled)

In [50]:
data_pca

array([[ 1.30355889e+00, -4.10697066e-01],
       [ 1.23194760e+00, -2.25214659e-01],
       [ 2.90473215e+00, -7.97024712e-01],
       [ 8.92589801e-01, -3.21167820e-01],
       [ 2.04605478e+00,  1.17271576e-01],
       [ 8.84136383e-01, -6.28677297e-01],
       [ 1.68588849e+00, -4.28816689e-01],
       [ 8.51998972e-01, -1.07382835e+00],
       [ 6.26556980e-01, -4.99559320e-01],
       [-1.06253669e+00, -1.89418471e-01],
       [ 1.14401190e+00, -4.32879033e-01],
       [-1.23635558e-01, -4.66022794e-01],
       [ 6.06828226e-01, -8.79122059e-01],
       [ 2.52853513e-02,  2.56250465e-01],
       [ 5.13755047e-02, -5.94410954e-01],
       [ 1.39181631e+00, -7.35295816e-01],
       [ 1.43792095e+00, -3.16812857e-01],
       [-8.72857748e-02, -2.54514881e-01],
       [ 4.84204959e+00,  2.07530764e-01],
       [ 6.86851728e-01, -8.56655756e-01],
       [ 1.94959030e+00, -6.19903382e-01],
       [ 2.80447139e-03, -1.20250501e-01],
       [ 2.79190629e+00, -7.67822252e-01],
       [-7.

In [51]:
# Transform PCA data to a DataFrame
df_pca = pd.DataFrame(
    data=data_pca, columns=["principal component 1", "principal component 2"]
)
df_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,1.303559,-0.410697
1,1.231948,-0.225215
2,2.904732,-0.797025
3,0.89259,-0.321168
4,2.046055,0.117272


In [52]:
pca.explained_variance_ratio_

array([0.4809383 , 0.22711596])

In [None]:
# Elbow