# Feature Engineering & Clustering/Segmentation Analysis

## Author: Nika Faraji

### Data Science Capstone

In [None]:
import sys
import numpy as np
import pandas as pd

# Load the aggregated dataset; the path is relative to the notebook's working directory
df = pd.read_csv('data/aggregated_data.csv')
# Bare expression as the last line of the cell so the notebook renders the DataFrame
df

In [27]:
#JH: Good use of a custom function; the rest of the code might be
#easier to read if more of them were used.
def calcSplitRatio(df, p=36):
    """Print and return the training proportion of a train/test split.

    The testing set size is taken as ``len(df) / sqrt(p)``, so the testing
    proportion is ``1 / sqrt(p)`` rounded to two decimals and the training
    proportion is its complement.

    Args:
        df: Any sized collection (only ``len(df)`` is used), e.g. a DataFrame.
        p: Number >= 1 that drives the ratio (presumably the number of model
            parameters/features -- confirm with the author). Defaults to 36.

    Returns:
        The training proportion, rounded to two decimals.

    Raises:
        ValueError: If ``p`` is below 1 (the testing proportion would be
            undefined or exceed 1) or if ``df`` is empty.
    """
    if p < 1:
        raise ValueError(f"p must be >= 1 to give a valid split ratio, got {p!r}")

    n_rows = len(df)
    if n_rows == 0:
        raise ValueError("df must contain at least one row to compute a split ratio")

    ## Calculate ideal number for testing set
    test_N = (1 / np.sqrt(p)) * n_rows

    ## Calculate testing proportion
    test_prop = round(test_N / n_rows, 2)

    ## Calculate training proportion; round again because the subtraction can
    ## introduce float noise (e.g. 1 - 0.58 == 0.42000000000000004)
    train_prop = round(1 - test_prop, 2)

    ## Print the results
    print(f"The ideal split ratio is {train_prop}:{test_prop} training:testing")

    ## Return the size of the training set proportion
    return train_prop

In [None]:
# Example usage of calcSplitRatio: pass p=50 instead of the default p=36.
# The call prints the training:testing ratio and returns the training proportion.
train_prop = calcSplitRatio(df, p=50)
# Bare expression so the notebook displays the returned training proportion
train_prop

In [None]:
# `correlation_matrix` is not assigned in any cell visible in this notebook, so
# displaying it on its own raises NameError. Build it here from the numeric
# columns of `df` (pairwise correlation, pandas default method) and display it.
correlation_matrix = df.select_dtypes(include='number').corr()
correlation_matrix

In [None]:
#JH: This would be a good place to use a custom function. It doesn't need to do much;
#it just seems that you're using the notebook code blocks as substitutes for functions.
#Not a huge problem in a project like this, but it can introduce problems when scaling.
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score


# Feature selection: the columns of `df` used as clustering inputs
features = ['UTIL_RATE', 'median_anxiety', 'median_depression', 'median_PTSD', 'median_ADHD', 'median_bipolar',
            'max_psychiatrists near me', 'max_therapist near me', 'max_mental hospital']
X = df[features]

# Feature scaling: standardize every column so no single feature dominates
# the Euclidean distances K-means relies on
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Determine optimal k: record inertia and silhouette score for each candidate
inertia = []
silhouette_scores = []
k_range = range(2, 11)  # Testing from 2 to 10 clusters

for k in k_range:
    # n_init is pinned because its default differs between scikit-learn
    # releases; an explicit value keeps results comparable across versions
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# Plot Elbow Method
plt.figure(figsize=(10, 5))
plt.plot(k_range, inertia, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Plot the silhouette scores as well: they were collected above but never
# shown, and they give a second view on the choice of k
plt.figure(figsize=(10, 5))
plt.plot(k_range, silhouette_scores, marker='o')
plt.title('Silhouette Scores')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
# Fit K-means with the chosen number of clusters
optimal_k = 3  # Set this to the k suggested by the diagnostics above
# n_init is pinned because its default differs between scikit-learn releases
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels to the original DataFrame
df['Cluster'] = clusters

# Analyze clusters: per-cluster means. numeric_only=True skips any non-numeric
# columns, which would otherwise make mean() raise on pandas >= 2.0
print(df.groupby('Cluster').mean(numeric_only=True))

# Visualize clusters on the first two standardized features. Columns 0 and 1
# of X_scaled correspond to features[0] and features[1], so label the axes
# with the real column names instead of generic placeholders
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap='viridis')
plt.title('Cluster Visualization')
plt.xlabel(f'{features[0]} (standardized)')
plt.ylabel(f'{features[1]} (standardized)')
plt.colorbar(label='Cluster')
plt.show()


In [None]:
#JH: Contains redundant imports; consider moving all imports to the top of the file so duplicates are easier to spot
import seaborn as sns
import matplotlib.pyplot as plt

# Visualize the distribution of a key feature across clusters.
# Change `feature` to compare a different column; it drives both the y-axis
# and the title so the two cannot drift apart.
feature = 'median_anxiety'
# Plot from `df`, which carries the 'Cluster' column added after fitting
# K-means (no `df_with_clusters` frame is created in the cells shown here).
sns.boxplot(x='Cluster', y=feature, data=df)
plt.title(f"Distribution of {feature} Across Clusters")
plt.show()

In [None]:
from pandas.plotting import parallel_coordinates

# One line per row of `df` across the selected feature columns, coloured by
# the row's cluster label
cluster_palette = ['r', 'g', 'b']
coords_ax = parallel_coordinates(df, class_column='Cluster', cols=features, color=cluster_palette)
coords_ax.set_title('Parallel Coordinates Plot for Clusters')
# Rotate the x tick labels by 45 degrees
plt.xticks(rotation=45)
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Project the standardized features onto two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Store each component as its own DataFrame column
component_names = ['PCA1', 'PCA2']
for position, column_name in enumerate(component_names):
    df[column_name] = X_pca[:, position]

# Scatter the two components, coloured by cluster label
sns.scatterplot(data=df, x='PCA1', y='PCA2', hue='Cluster', palette='viridis')
plt.title('Clusters Visualized Using PCA')
plt.show()

# Show how much each original feature contributes to each component
component_loadings = pd.DataFrame(pca.components_, index=component_names, columns=features)
print(component_loadings)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance

# Fit a decision tree that predicts the K-means cluster label from the
# standardized features (fit() returns the fitted estimator itself)
clf = DecisionTreeClassifier(random_state=42).fit(X_scaled, kmeans.labels_)

# Pair every feature name with the tree's importance score, highest first
importances = clf.feature_importances_
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)


In [None]:
import pandas as pd

# Take the centroid coordinates from the fitted `kmeans` model and label the
# columns with the feature names
centroid_values = kmeans.cluster_centers_
cluster_centers = pd.DataFrame(data=centroid_values, columns=features)

# Show the centroid table (one row per cluster)
print(cluster_centers)

import seaborn as sns
import matplotlib.pyplot as plt

# Draw the centroid table as an annotated heatmap (values shown to 2 decimals)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(cluster_centers, annot=True, fmt='.2f', cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Cluster Centers (Centroids)')
plt.show()


In [None]:
# For every centroid row, list its five largest feature values
for cluster_id, centroid in cluster_centers.iterrows():
    print(f"Top features for Cluster {cluster_id}:")
    ranked = centroid.sort_values(ascending=False)
    print(ranked.iloc[:5])  # Top 5 features
    print()
