## AnalyzeUp Database and Model Connection

In [12]:
# Import Dependencies 
import pandas as pd
from sqlalchemy import create_engine
import sqlalchemy as db
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import sklearn.metrics as metrics 
import pickle

# Connect to database and read the working_table

In [13]:
engine = create_engine('postgresql://postgres:analyzeup@database-analyzeup.c9mmdejuhxq9.us-west-1.rds.amazonaws.com:5432/analyzeup_project', echo=False)


### Read information table

In [60]:
column_names_info = engine.execute("SELECT * FROM info_comp").keys()
column_names_info

RMKeyView(['index', 'id', 'charity_name', 'address', 'City', 'state', 'zip', 'web_url'])

In [61]:
query_info = engine.execute("SELECT * FROM info_comp").fetchall()
query_info[0:2]

[(0, '6581', 'Society of St. Vincent de Paul of San Mateo County', '50 North B Street ', 'San Mateo', 'CA', '94401', 'http://svdpsm.org/'),
 (1, '14575', 'Society of the Cincinnati', '2118 Massachusetts Avenue, NW ', 'Washington', 'DC', '20008', 'http://www.societyofthecincinnati.org')]

In [62]:
info_df = pd.DataFrame(query_info, columns=column_names_info)
info_df.head(2)

Unnamed: 0,index,id,charity_name,address,City,state,zip,web_url
0,0,6581,Society of St. Vincent de Paul of San Mateo Co...,50 North B Street,San Mateo,CA,94401,http://svdpsm.org/
1,1,14575,Society of the Cincinnati,"2118 Massachusetts Avenue, NW",Washington,DC,20008,http://www.societyofthecincinnati.org


In [63]:
info_df = info_df.drop(columns=['index'])
info_df = info_df.set_index(['id'])
info_df.head(3)

Unnamed: 0_level_0,charity_name,address,City,state,zip,web_url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6581,Society of St. Vincent de Paul of San Mateo Co...,50 North B Street,San Mateo,CA,94401,http://svdpsm.org/
14575,Society of the Cincinnati,"2118 Massachusetts Avenue, NW",Washington,DC,20008,http://www.societyofthecincinnati.org
15858,Society of the Four Arts,100 Four Arts Plaza,Palm Beach,FL,33480,http://www.fourarts.org/


### Read working_table

In [3]:
query = engine.execute("SELECT * FROM working_table").fetchall()
query[0:2]

[('10278', ' United Methodist Committee on Relief of Global Ministries', 'Development and Relief Services', '71196129.0', '111327442.0'),
 ('6466', '10,000 Degrees', 'Scholarship and Financial Support', '9258322.0', '13592921.0')]

In [4]:
column_names = engine.execute("SELECT * FROM working_table").keys()
column_names

RMKeyView(['id', 'charity_name', 'cause', 'total_expenses', 'total_net_assets'])

In [5]:
working_df = pd.DataFrame(query, columns=column_names)
working_df.head(2)

Unnamed: 0,id,charity_name,cause,total_expenses,total_net_assets
0,10278,United Methodist Committee on Relief of Globa...,Development and Relief Services,71196129.0,111327442.0
1,6466,"10,000 Degrees",Scholarship and Financial Support,9258322.0,13592921.0


In [6]:
working_df['total_expenses'] = working_df['total_expenses'].astype('float')
# working_df['total_expenses'] = working_df['total_expenses'].astype('int')

working_df['total_net_assets'] = working_df['total_net_assets'].astype('float')
# working_df['total_net_assets'] = working_df['total_net_assets'].astype('int')

working_df = working_df.set_index(['id'])
working_df.head(3)

Unnamed: 0_level_0,charity_name,cause,total_expenses,total_net_assets
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10278,United Methodist Committee on Relief of Globa...,Development and Relief Services,71196129.0,111327442.0
6466,"10,000 Degrees",Scholarship and Financial Support,9258322.0,13592921.0
12098,100 Black Men of America,"Youth Development, Shelter, and Crisis Services",4366870.0,3407241.0


In [None]:
# # add information columns
# add_columns = ['address', 'state', 'zip']

# for addone in 

# Kmeans Clustering Model 

## Preprocessing the Data for PCA

In [7]:
# Create pandas dataframe and find rows and columns 
cdl_df = working_df
cdl_df.shape

(8143, 4)

In [8]:
# # Load the CSV dataset (for testing purposes - before joining to database).
# file_path = "Kmeans_Final_Model.csv"
# cdl_df = pd.read_csv(file_path,index_col=0,encoding='latin1')
# cdl_df.head(10)

In [9]:
# Check the dataypes
cdl_df.dtypes

charity_name         object
cause                object
total_expenses      float64
total_net_assets    float64
dtype: object

In [10]:
# Remove rows that have at null values.
cdl_df.dropna(inplace=True)
print(cdl_df.shape)
cdl_df.head(10)

(8143, 4)


Unnamed: 0_level_0,charity_name,cause,total_expenses,total_net_assets
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10278,United Methodist Committee on Relief of Globa...,Development and Relief Services,71196129.0,111327442.0
6466,"10,000 Degrees",Scholarship and Financial Support,9258322.0,13592921.0
12098,100 Black Men of America,"Youth Development, Shelter, and Crisis Services",4366870.0,3407241.0
12123,100 Club of Arizona,Multipurpose Human Service Organizations,2201743.0,7132258.0
17473,100 Club of Chicago,Social Services,1408074.0,8178182.0
8770,1000 Friends of Oregon,Environmental Protection and Conservation,1564775.0,3966190.0
17318,18Doors,Religious Activities,2463792.0,1747802.0
15235,24 Foundation,Medical Research,1862821.0,639656.0
16289,350.org,Environmental Protection and Conservation,15450256.0,14993196.0
13055,4 Paws for Ability,Social Services,4151432.0,4832614.0


In [11]:
# Check there are no null values 
for column in cdl_df.columns:
    print (f"Column {column} has {cdl_df[column].isnull().sum()}null values")

Column charity_name has 0null values
Column cause has 0null values
Column total_expenses has 0null values
Column total_net_assets has 0null values


In [12]:
# Find duplicate entries
print(f"Duplicate entries: {cdl_df.duplicated().sum()}")

Duplicate entries: 0


In [13]:
# Create a new DataFrame that holds only charities_names.
charity_name_df = pd.DataFrame(cdl_df["charity_name"])
print(charity_name_df.shape)
charity_name_df.head()

(8143, 1)


Unnamed: 0_level_0,charity_name
id,Unnamed: 1_level_1
10278,United Methodist Committee on Relief of Globa...
6466,"10,000 Degrees"
12098,100 Black Men of America
12123,100 Club of Arizona
17473,100 Club of Chicago


In [14]:
# Drop the columns that are not going to be used in the clustering algorithm.
cdl_df = cdl_df.drop(["charity_name"], axis=1)
print(cdl_df.shape)
cdl_df.head(10)

(8143, 3)


Unnamed: 0_level_0,cause,total_expenses,total_net_assets
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10278,Development and Relief Services,71196129.0,111327442.0
6466,Scholarship and Financial Support,9258322.0,13592921.0
12098,"Youth Development, Shelter, and Crisis Services",4366870.0,3407241.0
12123,Multipurpose Human Service Organizations,2201743.0,7132258.0
17473,Social Services,1408074.0,8178182.0
8770,Environmental Protection and Conservation,1564775.0,3966190.0
17318,Religious Activities,2463792.0,1747802.0
15235,Medical Research,1862821.0,639656.0
16289,Environmental Protection and Conservation,15450256.0,14993196.0
13055,Social Services,4151432.0,4832614.0


In [15]:
# Use get_dummies() to create variables for text features.
X = pd.get_dummies(cdl_df, columns=["cause"])
print(X.shape)
X.head(10)

(8143, 49)


Unnamed: 0_level_0,total_expenses,total_net_assets,cause_ 59% are people of color,cause_ businesses and corporations,cause_ community colleges,cause_ emergency shelter/stabilization and community early education and development. <br>FUMCH is accredited by COA (Council on Accreditation),cause_ interest and economic backgrounds,cause_ to share the Gospel of John one-on-one. <br>2. We reach out to Partners and we help these leaders to use their business,cause_ with more than 20 miles of hiking trails,cause_374,...,cause_Scholarship and Financial Support,cause_Social Services,cause_Social and Public Policy Research,cause_Special Education,cause_Treatment and Prevention Services,cause_United Ways,cause_Wildlife Conservation,"cause_Youth Development, Shelter, and Crisis Services",cause_Youth Education Programs and Services,cause_Zoos and Aquariums
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10278,71196129.0,111327442.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6466,9258322.0,13592921.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
12098,4366870.0,3407241.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12123,2201743.0,7132258.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17473,1408074.0,8178182.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8770,1564775.0,3966190.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17318,2463792.0,1747802.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15235,1862821.0,639656.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16289,15450256.0,14993196.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13055,4151432.0,4832614.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [16]:
# Standardize the data with StandardScaler().
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:5])

[[ 1.09259182  0.75318991 -0.01108242 -0.01108242 -0.01108242 -0.01108242
  -0.01108242 -0.01108242 -0.01108242 -0.01108242 -0.01108242 -0.08900434
  -0.2153813  -0.19591597 -0.11862643 -0.20126104 -0.12934905  4.65644191
  -0.16702394 -0.05436938 -0.07700381 -0.19893756 -0.16779918 -0.140212
  -0.17463851 -0.0778066  -0.14602042 -0.09897791 -0.12073783 -0.12934905
  -0.16741196 -0.22638962 -0.06378911 -0.186249   -0.27331255 -0.12786897
  -0.18088769 -0.12435142 -0.13605723 -0.1351179  -0.29869522 -0.14602042
  -0.04437056 -0.14689527 -0.22032916 -0.10864702 -0.26438387 -0.2009305
  -0.09244431]
 [-0.10199882 -0.14883267 -0.01108242 -0.01108242 -0.01108242 -0.01108242
  -0.01108242 -0.01108242 -0.01108242 -0.01108242 -0.01108242 -0.08900434
  -0.2153813  -0.19591597 -0.11862643 -0.20126104 -0.12934905 -0.21475625
  -0.16702394 -0.05436938 -0.07700381 -0.19893756 -0.16779918 -0.140212
  -0.17463851 -0.0778066  -0.14602042 -0.09897791 -0.12073783 -0.12934905
  -0.16741196 -0.22638962 -0

## Reducing Data Dimensions Using PCA

In [17]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
pca_reduce = pca.fit_transform(X_scaled)
pca_reduce

array([[ 1.49935742,  1.39482109,  1.45913619],
       [ 0.24935478, -1.1069393 ,  0.1944422 ],
       [-0.54434279,  0.32209496, -0.30278173],
       ...,
       [ 0.910632  ,  0.7703589 ,  0.55068734],
       [ 0.82910002,  0.77102049,  0.55419858],
       [ 0.99961346,  0.77795942,  0.54468432]])

In [18]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(
    data = pca_reduce, columns=["PC 1", "PC 2", "PC 3"],
    index= X.index
)
print(pcs_df.shape)
pcs_df.head(10)

(8143, 3)


Unnamed: 0_level_0,PC 1,PC 2,PC 3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10278,1.499357,1.394821,1.459136
6466,0.249355,-1.106939,0.194442
12098,-0.544343,0.322095,-0.302782
12123,-0.094648,0.065221,0.669248
17473,-0.48934,-0.566606,-2.338448
8770,-0.580834,-0.959028,-0.34852
17318,-0.647536,0.970587,-0.577008
15235,0.266781,0.613196,-1.55392
16289,-0.356657,-0.911833,-0.37096
13055,-0.478276,-0.549649,-2.343371


## Clustering Charity Data Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [19]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


### Testing Model and Saving the Model with Pickle

#### Running K-Means with `k=4`

In [49]:
# Initialize the K-Means model.
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# save the model to disk using Pickle
filename='Final_Kmeans_Model.sav'
pickle.dump(model, open('Final_Kmeans_Model.sav', 'wb'))

# Predict clusters
predictions = model.predict(pcs_df)
predictions

array([2, 0, 2, ..., 2, 2, 2])

In [50]:
# Create a new DataFrame including predicted clusters and charity features.
# Concatentate the cdl_df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([cdl_df, pcs_df], axis=1)

#  Add a new column, "charity_name" to the clustered_df DataFrame that holds the names of the charities. 
clustered_df['charity_name'] = charity_name_df["charity_name"]

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df["class"] = model.labels_

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(8143, 8)


Unnamed: 0_level_0,cause,total_expenses,total_net_assets,PC 1,PC 2,PC 3,charity_name,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10278,Development and Relief Services,71196129.0,111327442.0,1.499357,1.394821,1.459136,United Methodist Committee on Relief of Globa...,2
6466,Scholarship and Financial Support,9258322.0,13592921.0,0.249355,-1.106939,0.194442,"10,000 Degrees",0
12098,"Youth Development, Shelter, and Crisis Services",4366870.0,3407241.0,-0.544343,0.322095,-0.302782,100 Black Men of America,2
12123,Multipurpose Human Service Organizations,2201743.0,7132258.0,-0.094648,0.065221,0.669248,100 Club of Arizona,2
17473,Social Services,1408074.0,8178182.0,-0.48934,-0.566606,-2.338448,100 Club of Chicago,3
8770,Environmental Protection and Conservation,1564775.0,3966190.0,-0.580834,-0.959028,-0.34852,1000 Friends of Oregon,0
17318,Religious Activities,2463792.0,1747802.0,-0.647536,0.970587,-0.577008,18Doors,2
15235,Medical Research,1862821.0,639656.0,0.266781,0.613196,-1.55392,24 Foundation,3
16289,Environmental Protection and Conservation,15450256.0,14993196.0,-0.356657,-0.911833,-0.37096,350.org,0
13055,Social Services,4151432.0,4832614.0,-0.478276,-0.549649,-2.343371,4 Paws for Ability,3


### Visualizing Prediction Results

#### 3D-Scatter with Clusters

In [51]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    width=800,
    hover_name = "charity_name",
    hover_data = ["cause"],
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


In [52]:
# Create a table with charity information.
clustered_df.hvplot.table()

In [53]:
# Print the total number of rated charities.
print(f' There are {clustered_df["charity_name"].count()} rated charities')

 There are 8143 rated charities


In [54]:
# Scaling data to create the scatter plot with rated charities.
scaling_data = clustered_df.copy()
scaling_data_2 = scaling_data.drop(
    ["cause", "PC 1", "PC 2", "PC 3", "charity_name", "class"], 
    axis=1
)
min_max = MinMaxScaler().fit_transform(scaling_data_2)
min_max

array([[0.0423029 , 0.06728366],
       [0.00546207, 0.01559521],
       [0.00255261, 0.01020835],
       ...,
       [0.01084533, 0.02293513],
       [0.00933338, 0.01830701],
       [0.01319194, 0.0268156 ]])

In [55]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
min_max_df = pd.DataFrame(
    data=min_max,
    columns=["Total Expenses", "Total Net Assests"],
    index=clustered_df.index
)

# Add the "Charity Name" column from the clustered_df DataFrame to the new DataFrame.
plot_df = min_max_df.reindex(columns=["Total Expenses","Total Net Assests"])
plot_df["Charity Name"] = charity_name_df["charity_name"]

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
plot_df["Class"] = clustered_df["class"]


plot_df.head(10)

Unnamed: 0_level_0,Total Expenses,Total Net Assests,Charity Name,Class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10278,0.042303,0.067284,United Methodist Committee on Relief of Globa...,2
6466,0.005462,0.015595,"10,000 Degrees",0
12098,0.002553,0.010208,100 Black Men of America,2
12123,0.001265,0.012178,100 Club of Arizona,2
17473,0.000793,0.012732,100 Club of Chicago,3
8770,0.000886,0.010504,1000 Friends of Oregon,0
17318,0.001421,0.009331,18Doors,2
15235,0.001063,0.008745,24 Foundation,3
16289,0.009145,0.016336,350.org,0
13055,0.002424,0.010962,4 Paws for Ability,3


In [56]:
# Create a hvplot.scatter plot using x="Total Expenses" and y="Total Net Assests".
plot_df.hvplot.scatter(
    x="Total Expenses",
    y="Total Net Assests",
    by="Class",
    hover_cols=["Charity Name"],
)

## Evaluating the Model

In [57]:
# Test the model's clustering performance with Silhouette Coefficient evaluation
kmeans_model = KMeans(n_clusters=4, random_state=1).fit(X)
labels = kmeans_model.labels_
metrics.silhouette_score(X, labels, metric='euclidean')

0.9066127602592019

##### Resource: https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation *2.3.10.5. Silhouette Coefficient*

In [29]:
# # load the model from disk using Pickle (in the future when needed)
# loaded_model = pickle.load(open('Final_Kmeans_Model.sav', 'rb'))
# result = metrics.silhouette_score(X, labels, metric='euclidean')
# print (result)

In [None]:
# jsonstr = plot_df.to_json(orient='columns')

In [66]:
# # convert dataframe to dictionary for json file save (for UI)
# pd.io.json.dumps(plot_df.to_dict(orient='list'))


'{"Total Expenses":[0.0423028998,0.0054620654,0.0025526118,0.0012647863,0.0007927091,0.0008859154,0.001420654,0.0010631943,0.0091450504,0.0024244685,0.0024738901,0.0018529579,0.0009237009,0.0025426958,0.0010874409,0.0010673419,0.001600324,0.003607191,0.0019828167,0.0028480151,0.0019793882,0.0042355392,0.0008523471,0.0034216003,0.0010581979,0.1016211886,0.0007955642,0.0008975195,0.0307379692,0.0016017504,0.0010061859,0.0009402222,0.0039981364,0.001089964,0.0013463595,0.0034276489,0.0017529405,0.0036111357,0.002471119,0.0013267804,0.0007868485,0.0007442172,0.0025690819,0.0016770263,0.0067162678,0.0176318262,0.0013894578,0.0182299316,0.0063817401,0.0013526496,0.026301042,0.0030459075,0.0014733626,0.001491761,0.0065750763,0.0022495022,0.0028033513,0.0031628756,0.0011413569,0.075688198,0.0035081889,0.0029645258,0.0060453536,0.0019875537,0.0007165784,0.0016776372,0.0008729963,0.0064312866,0.0004848364,0.0006713894,0.006319272,0.0134419638,0.0009212498,0.0009022761,0.0016722483,0.0010313122,0

In [67]:
# # From pandas import DataFrame as data.js file for UI.
# plot_df.to_json(r'C:\Users\14698\Desktop\Data Class Folder\UCB-VIRT-DATA-PT-02-2022-U-B\FinalProject\testing_files\Aimee_Decoste\AnalyzeUp_UI\static\js\data.js')
