## AnalyzeUp Database and Model Connection

In [77]:
# Import Dependencies 
import pandas as pd
from sqlalchemy import create_engine
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import sklearn.metrics as metrics 
import pickle

# Connect to database and read the working_table

In [78]:
engine = create_engine('postgresql://postgres:analyzeup@database-analyzeup.c9mmdejuhxq9.us-west-1.rds.amazonaws.com:5432/analyzeup_project', echo=False)


In [79]:
query = engine.execute("SELECT * FROM working_table").fetchall()
query[0:2]

OperationalError: (psycopg2.OperationalError) could not translate host name "database-analyzeup.c9mmdejuhxq9.us-west-1.rds.amazonaws.com" to address: Unknown host

(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [None]:
column_names = engine.execute("SELECT * FROM working_table").keys()
column_names

In [None]:
working_df = pd.DataFrame(query, columns=column_names)
working_df.head(2)

In [None]:
working_df['total_expenses'] = working_df['total_expenses'].astype('float')
# working_df['total_expenses'] = working_df['total_expenses'].astype('int')

working_df['total_net_assets'] = working_df['total_net_assets'].astype('float')
# working_df['total_net_assets'] = working_df['total_net_assets'].astype('int')

working_df = working_df.set_index(['id'])
working_df.head(3)

# Kmeans Clustering Model 

## Preprocessing the Data for PCA

In [None]:
# Load the  database tables, create pandas dataframe and find rows and columns 
cdl_df = working_df
cdl_df.shape

In [None]:
# # Load the CSV dataset (for testing purposes - before joining to database).
# file_path = "Kmeans_Final_Model.csv"
# cdl_df = pd.read_csv(file_path,index_col=0,encoding='latin1')
# cdl_df.head(10)

In [None]:
# Check the dataypes
cdl_df.dtypes

In [None]:
# Remove rows that have at least 1 null value.
cdl_df.dropna(inplace=True)
print(cdl_df.shape)
cdl_df.head(10)

In [None]:
# Check there are no null values 
for column in cdl_df.columns:
    print (f"Column {column} has {cdl_df[column].isnull().sum()}null values")

In [None]:
# Find duplicate entries
print(f"Duplicate entries: {cdl_df.duplicated().sum()}")

In [None]:
# Create a new DataFrame that holds only charities_names.
charity_name_df = pd.DataFrame(cdl_df["charity_name"])
print(charity_name_df.shape)
charity_name_df.head()

In [None]:
# Drop the columns that are not going to be used in the clustering algorithm.
cdl_df = cdl_df.drop(["charity_name"], axis=1)
print(cdl_df.shape)
cdl_df.head(10)

In [None]:
# Use get_dummies() to create variables for text features.
X = pd.get_dummies(cdl_df, columns=["cause"])
print(X.shape)
X.head(10)

In [None]:
# Standardize the data with StandardScaler().
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:5])

## Reducing Data Dimensions Using PCA

In [None]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
pca_reduce = pca.fit_transform(X_scaled)
pca_reduce

In [None]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(
    data = pca_reduce, columns=["PC 1", "PC 2", "PC 3"],
    index= X.index
)
print(pcs_df.shape)
pcs_df.head(10)

## Clustering Charity Data Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [None]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


### Testing Model and Saving the Model with Pickle

#### Running K-Means with `k=5`

In [None]:
# Initialize the K-Means model.
model = KMeans(n_clusters=5, random_state=0)

# Fit the model
model.fit(pcs_df)

# save the model to disk using Pickle
filename='Final_Kmeans_Model.sav'
pickle.dump(model, open('Final_Kmeans_Model.sav', 'wb'))

# Predict clusters
predictions = model.predict(pcs_df)
predictions

In [None]:
# Create a new DataFrame including predicted clusters and charity features.
# Concatentate the cdl_df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([cdl_df, pcs_df], axis=1)

#  Add a new column, "charity_name" to the clustered_df DataFrame that holds the names of the charities. 
clustered_df['charity_name'] = charity_name_df["charity_name"]

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df["class"] = model.labels_

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

### Visualizing Prediction Results

#### 3D-Scatter with Clusters

In [None]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    width=800,
    hover_name = "charity_name",
    hover_data = ["cause"],
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


In [None]:
# Create a table with charity information.
clustered_df.hvplot.table()

In [80]:
# Print the total number of rated charities.
print(f' There are {clustered_df["charity_name"].count()} rated charities')

 There are 8143 rated charities


In [81]:
# Scaling data to create the scatter plot with rated charities.
scaling_data = clustered_df.copy()
scaling_data_2 = scaling_data.drop(
    ["cause", "PC 1", "PC 2", "PC 3", "charity_name", "class"], 
    axis=1
)
min_max = MinMaxScaler().fit_transform(scaling_data_2)
min_max

array([[0.0423029 , 0.06728366],
       [0.00546207, 0.01559521],
       [0.00255261, 0.01020835],
       ...,
       [0.01084533, 0.02293513],
       [0.00933338, 0.01830701],
       [0.01319194, 0.0268156 ]])

In [82]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
min_max_df = pd.DataFrame(
    data=min_max,
    columns=["Total Expenses", "Total Net Assests"],
    index=clustered_df.index
)

# Add the "Charity Name" column from the clustered_df DataFrame to the new DataFrame.
plot_df = min_max_df.reindex(columns=["Total Expenses","Total Net Assests"])
plot_df["Charity Name"] = charity_name_df["charity_name"]

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
plot_df["Class"] = clustered_df["class"]


plot_df.head(10)

Unnamed: 0_level_0,Total Expenses,Total Net Assests,Charity Name,Class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10278,0.042303,0.067284,United Methodist Committee on Relief of Globa...,2
6466,0.005462,0.015595,"10,000 Degrees",4
12098,0.002553,0.010208,100 Black Men of America,0
12123,0.001265,0.012178,100 Club of Arizona,0
17473,0.000793,0.012732,100 Club of Chicago,4
8770,0.000886,0.010504,1000 Friends of Oregon,0
17318,0.001421,0.009331,18Doors,0
15235,0.001063,0.008745,24 Foundation,0
16289,0.009145,0.016336,350.org,0
13055,0.002424,0.010962,4 Paws for Ability,4


In [83]:
# Create a hvplot.scatter plot using x="Total Expenses" and y="Total Net Assests".
plot_df.hvplot.scatter(
    x="Total Expenses",
    y="Total Net Assests",
    by="Class",
    hover_cols=["Charity Name"],
)

## Evaluating the Model

In [84]:
# Test the model's clustering performance with Silhouette Coefficient evaluation
kmeans_model = KMeans(n_clusters=5, random_state=1).fit(X)
labels = kmeans_model.labels_
metrics.silhouette_score(X, labels, metric='euclidean')

0.8679943613709157

##### Resource: https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation *2.3.10.5. Silhouette Coefficient*

In [85]:
# # load the model from disk using Pickle (in the future when needed)
# loaded_model = pickle.load(open('Final_Kmeans_Model.sav', 'rb'))
# result = metrics.silhouette_score(X, labels, metric='euclidean')
# print (result)

In [87]:
#from pandas import DataFrame as data.js file for UI.
plot_df.to_json(r'C:\Users\14698\Desktop\Data Class Folder\UCB-VIRT-DATA-PT-02-2022-U-B\FinalProject\Aimee_Decoste\AnalyzeUp_UI\static\js\data.js')
