In [10]:
# Import Dependencies
import pandas as pd
import sqlite3
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px

## Connect to Database, query data with a join.

In [11]:
# Connect to SQLite dB
con = sqlite3.connect('/Users/jennadodge/uofo-virt-data-pt-12-2021-u-b/Water_Quality_Analysis/Database/database.sqlite3')

# Create a cursor object
cur = con.cursor()

In [12]:
# Retrieve data

df = pd.read_sql_query("SELECT * FROM Census_Data INNER JOIN Contaminant_Summary on Census_Data.county_FIPS = Contaminant_Summary.county_FIPS",con)
df.head()

Unnamed: 0,county_FIPS,Geographic_Area_Name,County,GEOID,Total_Population,White,Black,Native,Asian,Pacific_Islander,...,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,County_FIPS,Num_Contaminants,Sum_Population_Served,Sum_ContaminantFactor,Min_Contaminant_Factor,Max_Contaminant_Factor,Avg_Contaminant_Factor
0,8069,"Larimer County, Colorado",Larimer County,0500000US08069,359066,295995,3816,2879,8539,326,...,0.948124,0.375569,0.4428,8069,4,264021,4744,984,1519,1186.0
1,10001,"Kent County, Delaware",Kent County,0500000US10001,181851,107685,46999,1150,4430,126,...,1.305087,0.271086,0.4191,10001,9,70580,4987,201,1395,554.11
2,10003,"New Castle County, Delaware",New Castle County,0500000US10003,570719,314231,146545,2157,35201,176,...,1.464587,0.349206,0.4607,10003,20,555453,15583,68,4732,779.15
3,10005,"Sussex County, Delaware",Sussex County,0500000US10005,237378,175847,25358,1844,3071,113,...,1.161943,0.352475,0.4561,10005,11,205901,5456,91,1295,496.0
4,16019,"Bonneville County, Idaho",Bonneville County,0500000US16019,123964,103736,627,1262,1424,155,...,0.919484,0.401124,0.4287,16019,1,2982,42,42,42,42.0


In [13]:
df.shape

(882, 37)

In [14]:
con.close()

## Data Pre-processing for ML

In [15]:
df.columns

Index(['county_FIPS', 'Geographic_Area_Name', 'County', 'GEOID',
       'Total_Population', 'White', 'Black', 'Native', 'Asian',
       'Pacific_Islander', 'Other', 'Two_or_more_Races', 'Hispanic',
       'Not_Hispanic', 'Not_White', 'pct_White', 'pct_Black', 'pct_Native',
       'pct_Asian', 'pct_Pacific_Islander', 'pct_Other', 'pct_Not_White',
       'pct_Hispanic', 'pct_Not_Hispanic', 'pct_Two_or_more_Races',
       'Simpson_Race_DI', 'Simpson_Ethnic_DI', 'Shannon_Race_DI',
       'Shannon_Ethnic_DI', 'Gini_Index', 'County_FIPS', 'Num_Contaminants',
       'Sum_Population_Served', 'Sum_ContaminantFactor',
       'Min_Contaminant_Factor', 'Max_Contaminant_Factor',
       'Avg_Contaminant_Factor'],
      dtype='object')

In [16]:
# Drop duplicate county_FIPS column
df.drop(columns = ['County_FIPS'], inplace=True)
df.head()

Unnamed: 0,county_FIPS,Geographic_Area_Name,County,GEOID,Total_Population,White,Black,Native,Asian,Pacific_Islander,...,Simpson_Ethnic_DI,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,Num_Contaminants,Sum_Population_Served,Sum_ContaminantFactor,Min_Contaminant_Factor,Max_Contaminant_Factor,Avg_Contaminant_Factor
0,8069,"Larimer County, Colorado",Larimer County,0500000US08069,359066,295995,3816,2879,8539,326,...,0.217826,0.948124,0.375569,0.4428,4,264021,4744,984,1519,1186.0
1,10001,"Kent County, Delaware",Kent County,0500000US10001,181851,107685,46999,1150,4430,126,...,0.141942,1.305087,0.271086,0.4191,9,70580,4987,201,1395,554.11
2,10003,"New Castle County, Delaware",New Castle County,0500000US10003,570719,314231,146545,2157,35201,176,...,0.197811,1.464587,0.349206,0.4607,20,555453,15583,68,4732,779.15
3,10005,"Sussex County, Delaware",Sussex County,0500000US10005,237378,175847,25358,1844,3071,113,...,0.200263,1.161943,0.352475,0.4561,11,205901,5456,91,1295,496.0
4,16019,"Bonneville County, Idaho",Bonneville County,0500000US16019,123964,103736,627,1262,1424,155,...,0.237762,0.919484,0.401124,0.4287,1,2982,42,42,42,42.0


In [17]:
df2 = df.loc[:,['Simpson_Race_DI', 'Simpson_Ethnic_DI',
       'Shannon_Race_DI', 'Shannon_Ethnic_DI', 'Gini_Index','Sum_ContaminantFactor']]
df2.head()

Unnamed: 0,Simpson_Race_DI,Simpson_Ethnic_DI,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,Sum_ContaminantFactor
0,0.309413,0.217826,0.948124,0.375569,0.4428,4744
1,0.573395,0.141942,1.305087,0.271086,0.4191,4987
2,0.618709,0.197811,1.464587,0.349206,0.4607,15583
3,0.43091,0.200263,1.161943,0.352475,0.4561,5456
4,0.290288,0.237762,0.919484,0.401124,0.4287,42


In [18]:
# Check datatypes to ensure they are numeric
df2.dtypes

Simpson_Race_DI          float64
Simpson_Ethnic_DI        float64
Shannon_Race_DI          float64
Shannon_Ethnic_DI        float64
Gini_Index               float64
Sum_ContaminantFactor      int64
dtype: object

In [19]:
df2.describe()

Unnamed: 0,Simpson_Race_DI,Simpson_Ethnic_DI,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,Sum_ContaminantFactor
count,882.0,882.0,882.0,882.0,882.0,882.0
mean,0.280229,0.113581,0.753765,0.211073,0.441717,11755.39
std,0.166876,0.108514,0.373162,0.15222,0.033689,77113.64
min,0.063408,0.011866,0.202798,0.036514,0.3157,0.0
25%,0.146299,0.040314,0.457942,0.100287,0.417125,1966.5
50%,0.226679,0.069034,0.669062,0.15434,0.43945,4770.0
75%,0.384804,0.142876,0.980501,0.272424,0.463475,11049.5
max,0.768684,0.499984,1.956877,0.693131,0.5509,2215481.0


In [20]:
# Transform data
# data_scaled = StandardScaler().fit_transform(df2)
# print(data_scaled[0:5])

# df2['Total_Population'] = df2['Total_Population']/100_000
df2['Sum_ContaminantFactor'] = df2['Sum_ContaminantFactor']/10_000
df2.head() 


Unnamed: 0,Simpson_Race_DI,Simpson_Ethnic_DI,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,Sum_ContaminantFactor
0,0.309413,0.217826,0.948124,0.375569,0.4428,0.4744
1,0.573395,0.141942,1.305087,0.271086,0.4191,0.4987
2,0.618709,0.197811,1.464587,0.349206,0.4607,1.5583
3,0.43091,0.200263,1.161943,0.352475,0.4561,0.5456
4,0.290288,0.237762,0.919484,0.401124,0.4287,0.0042


# K means

In [37]:
# Initializing model with K = 3 
model = KMeans(n_clusters=10, random_state=13)
model

KMeans(n_clusters=10, random_state=13)

In [38]:
# Fitting model
model.fit(df2)

KMeans(n_clusters=10, random_state=13)

In [39]:
# Get predictions
predictions = model.predict(df2)
print(predictions)

[7 7 9 7 7 0 3 0 3 0 0 9 0 0 0 7 0 0 0 0 0 0 0 3 0 0 3 5 0 3 0 0 0 0 0 0 0
 7 7 0 0 0 3 9 3 3 7 7 7 7 7 3 7 7 3 3 7 7 7 7 3 7 9 7 7 7 7 7 3 3 9 3 7 7
 7 3 7 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 7 0 0 0 0 7 9 3 7 0 0
 3 0 0 3 0 7 0 0 0 3 0 7 0 0 0 0 0 3 0 0 0 7 0 0 0 7 7 7 7 5 7 9 7 3 9 0 5
 7 5 7 7 9 7 7 7 7 3 9 7 9 5 7 7 7 7 7 9 9 7 3 7 7 0 7 7 7 7 9 7 7 7 9 7 7
 7 9 9 0 9 7 7 9 7 9 7 9 7 7 3 7 7 9 3 7 0 7 7 0 7 0 0 7 0 7 0 0 9 7 7 0 3
 7 7 7 7 3 0 7 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 3 0 3 3 0 0 0 3 0 0 0 3 0 0 0
 0 0 3 0 7 0 0 0 0 0 0 3 0 0 7 0 0 0 0 0 0 0 0 9 0 7 0 0 3 3 0 7 0 0 3 0 0
 0 0 0 9 0 0 0 7 3 0 9 0 3 0 3 0 0 9 7 7 0 7 7 7 7 5 7 7 7 0 0 7 0 7 5 0 7
 7 0 7 7 7 3 3 0 0 7 7 0 7 0 7 7 7 7 7 7 7 7 9 7 3 0 7 9 0 3 9 7 3 0 0 9 0
 7 0 9 3 9 7 0 3 0 3 3 3 0 7 0 0 0 0 3 7 0 0 0 0 7 3 9 7 7 3 9 3 9 9 7 3 0
 3 3 0 0 9 0 3 0 9 3 7 7 3 0 0 0 3 0 7 9 7 0 7 3 0 9 0 9 3 9 7 9 7 3 9 0 0
 3 7 0 3 5 0 3 9 0 3 0 9 3 0 9 0 3 7 3 3 3 3 0 3 7 0 3 9 0 3 7 0 0 3 3 3 0
 0 3 3 3 0 3 7 7 7 9 3 5 

In [40]:
# Add a new class column to df_iris
df2["class"] = model.labels_
df2.head()

Unnamed: 0,Simpson_Race_DI,Simpson_Ethnic_DI,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,Sum_ContaminantFactor,class
0,0.309413,0.217826,0.948124,0.375569,0.4428,0.4744,7
1,0.573395,0.141942,1.305087,0.271086,0.4191,0.4987,7
2,0.618709,0.197811,1.464587,0.349206,0.4607,1.5583,9
3,0.43091,0.200263,1.161943,0.352475,0.4561,0.5456,7
4,0.290288,0.237762,0.919484,0.401124,0.4287,0.0042,7


In [42]:
df2.hvplot.scatter(x="Simpson_Race_DI", y="Sum_ContaminantFactor", by="class")

In [None]:
fig = px.scatter_3d(
    df2,
    x="")

In [None]:
# PCA

In [46]:
from sklearn.decomposition import PCA

In [47]:
# Transform data
data_scaled = StandardScaler().fit_transform(df2)
print(data_scaled[0:5])

[[ 1.41892358 -0.05501191  1.14034267  0.03054724  0.38723192 -0.12948393
   0.        ]
 [ 0.5593959   0.46285861  0.75218375  0.59572426  0.42970976 -0.11945112
   0.        ]
 [ 2.30622853  0.60099831  1.72482291  0.73765679  1.88912713 -0.12142219
   0.        ]
 [ 1.16153035 -0.13438746  0.78879673 -0.0617325   0.22945707 -0.1109223
   0.        ]
 [ 0.12398314  1.56725567  0.55167888  1.65794185  0.79077144  0.21278999
   0.        ]]


In [49]:
# Applying PCA to reduce dimensions from 4 to 2

# Initialize PCA model
pca = PCA(n_components=2)

# Get two principal components for the data.
data_pca = pca.fit_transform(data_scaled)

In [50]:
data_pca

array([[ 1.30355889e+00, -4.10697066e-01],
       [ 1.23194760e+00, -2.25214659e-01],
       [ 2.90473215e+00, -7.97024712e-01],
       [ 8.92589801e-01, -3.21167820e-01],
       [ 2.04605478e+00,  1.17271576e-01],
       [ 8.84136383e-01, -6.28677297e-01],
       [ 1.68588849e+00, -4.28816689e-01],
       [ 8.51998972e-01, -1.07382835e+00],
       [ 6.26556980e-01, -4.99559320e-01],
       [-1.06253669e+00, -1.89418471e-01],
       [ 1.14401190e+00, -4.32879033e-01],
       [-1.23635558e-01, -4.66022794e-01],
       [ 6.06828226e-01, -8.79122059e-01],
       [ 2.52853513e-02,  2.56250465e-01],
       [ 5.13755047e-02, -5.94410954e-01],
       [ 1.39181631e+00, -7.35295816e-01],
       [ 1.43792095e+00, -3.16812857e-01],
       [-8.72857748e-02, -2.54514881e-01],
       [ 4.84204959e+00,  2.07530764e-01],
       [ 6.86851728e-01, -8.56655756e-01],
       [ 1.94959030e+00, -6.19903382e-01],
       [ 2.80447139e-03, -1.20250501e-01],
       [ 2.79190629e+00, -7.67822252e-01],
       [-7.

In [51]:
# Transform PCA data to a DataFrame
df_pca = pd.DataFrame(
    data=data_pca, columns=["principal component 1", "principal component 2"]
)
df_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,1.303559,-0.410697
1,1.231948,-0.225215
2,2.904732,-0.797025
3,0.89259,-0.321168
4,2.046055,0.117272


In [52]:
pca.explained_variance_ratio_

array([0.4809383 , 0.22711596])

In [None]:
# Elbow