In [None]:
# 1. Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import umap

In [None]:
# 2. Load Dataset
# Read the pesticide table from CSV and print a quick overview of it.
df = pd.read_csv('pesticides_new.csv')

print(df.head())
print(f"Columns: {df.columns}")
print(f"Total records: {len(df)}")

# 3. Data Preprocessing and Feature Selection
# Columns of `df` that make up the feature space used by every later step.
features = [
    "Molecular_Weight", "Polar_Area", "Complexity", "XLogP",
    "Heavy_Atom_Count", "H-Bond_Donor_Count", "H-Bond_Acceptor_Count",
    "Rotatable_Bond_Count",
]

# Feature matrix as a plain array (one row per record, one column per feature).
X = df.loc[:, features].values
# Record names, kept aside so plots can optionally be annotated later.
names = df.loc[:, 'Name'].values

In [None]:
# Check for NaN values in the selected feature matrix.
#
# The check runs on the raw matrix `X` rather than on `X_scaled`: the scaled
# matrix is only created in a later cell, so referencing it here fails on a
# fresh kernel. StandardScaler keeps NaNs in place, so the counts are the same
# as they would be on the scaled data.
feature_nan_mask = pd.isna(X)  # pd.isna also copes with object-dtype arrays

print(feature_nan_mask.sum())  # Total number of NaNs
print(feature_nan_mask.any())  # Check if any NaNs exist

print(df.isnull().sum())  # Shows the count of NaNs per column

# If you want to list only the columns that have NaNs:
nan_columns = df.columns[df.isnull().any()]
print("Columns with NaN values:", list(nan_columns))

# If there are NaNs, use predictive models to fill them in.
# Here is an example where solubility is predicted using a simple Random Forest
# regression model: https://github.com/insilicosandeep/Solubility-prediction-ML


In [None]:
# Standardize the features: learn the scaling parameters from X, then apply
# them to X. The fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# 4. Dimensionality Reduction: PCA
# Project the standardized features onto the first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Share of the total variance carried by each retained component.
print(f"Explained variance ratio (PCA): {pca.explained_variance_ratio_}")

In [None]:
# 5. Elbow Method (find optimal k for K-means using PCA-reduced data)
# Fit one K-Means model per candidate k and record its inertia.
K_range = range(2, 10)
inertias = []
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_pca)
    inertias.append(km.inertia_)

# Inertia versus k; the "elbow" of this curve suggests a reasonable k.
elbow_fig, elbow_ax = plt.subplots(figsize=(7, 5))
elbow_ax.plot(list(K_range), inertias, 'bo-')
elbow_ax.set_xlabel('Number of clusters (k)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method for Optimal k')
elbow_ax.grid(alpha=0.4)
plt.show()


In [None]:
# 6. K-Means Clustering on PCA-reduced data
# Fit a 4-cluster model on the 2-D PCA scores and store each record's label.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(X_pca)
clusters = kmeans.labels_
df['PCA_Cluster'] = clusters

In [None]:
# 7. UMAP Dimensionality Reduction (for visualization and potential clustering)
# Settings are collected in one place so they are easy to spot and tune.
umap_settings = dict(n_neighbors=10, min_dist=0.3, random_state=42)
reducer = umap.UMAP(**umap_settings)

# Embed the standardized features with the configured reducer.
X_umap = reducer.fit_transform(X_scaled)

In [None]:
# 8. K-Means Clustering (optional: also on UMAP data if you want to compare)
# Same configuration as the PCA-space model, fitted on the UMAP embedding.
kmeans_umap = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans_umap.fit(X_umap)
clusters_umap = kmeans_umap.labels_
df['UMAP_Cluster'] = clusters_umap

In [None]:
# 9. Visualization: PCA space with cluster coloring

# --- Figure 1: cluster scatter in PCA space -------------------------------
plt.figure(figsize=(10,7))
sct = plt.scatter(X_pca[:,0], X_pca[:,1], c=clusters, cmap='Set1', s=120, alpha=0.8, edgecolors='black')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Pesticide Clusters (K-Means, PCA space)')
plt.colorbar(sct, label='Cluster')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('pesticide_clusters_pca.png')
plt.show()

# --- Figure 2: biplot (cluster points + feature loading vectors) -----------
# plt.show() above finishes the first figure, so anything drawn afterwards
# lands on a new, empty figure. The points are therefore redrawn here so that
# the arrows overlay the clusters, as the output file name promises.
plt.figure(figsize=(10,7))
plt.scatter(X_pca[:,0], X_pca[:,1], c=clusters, cmap='Set1', s=120, alpha=0.4, edgecolors='black')

# Loadings are stretched so the arrows are readable at the scale of the
# scores; labels sit slightly beyond each arrow tip.
arrow_scale = 3
label_scale = 3.2

# Iterate over the `features` list used to build X, so the arrow labels always
# match the columns that PCA was fitted on (no duplicated literal list).
for i, feature in enumerate(features):
    pc1_loading = pca.components_[0, i]
    pc2_loading = pca.components_[1, i]
    plt.arrow(0, 0,
              pc1_loading*arrow_scale,
              pc2_loading*arrow_scale,
              head_width=0.1, head_length=0.1, fc='red', ec='red', alpha=0.8)
    plt.text(pc1_loading*label_scale, pc2_loading*label_scale,
             feature, fontsize=10, ha='center', color='red', weight='bold')

plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('PCA Biplot (Arrows = Feature Directions)')
plt.grid(alpha=0.3)
plt.axhline(y=0, color='k', linestyle='--', linewidth=0.5)
plt.axvline(x=0, color='k', linestyle='--', linewidth=0.5)
plt.savefig('pca_cluster_with_vectors.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# 10. Visualization: UMAP space with cluster coloring
umap_fig, umap_ax = plt.subplots(figsize=(10,7))
sct2 = umap_ax.scatter(
    X_umap[:,0], X_umap[:,1],
    c=clusters_umap, cmap='Set1', s=120, alpha=0.8, edgecolors='black',
)
# Optional: label every point with its name (cluttered for large datasets).
#for i, name in enumerate(names):
#    umap_ax.annotate(name, (X_umap[i,0], X_umap[i,1]), fontsize=7, alpha=0.7)
umap_ax.set_xlabel('UMAP Dim 1')
umap_ax.set_ylabel('UMAP Dim 2')
umap_ax.set_title('Pesticide Clusters (K-Means, UMAP space)')
umap_fig.colorbar(sct2, ax=umap_ax, label='Cluster')
umap_ax.grid(alpha=0.3)
umap_fig.tight_layout()
umap_fig.savefig('pesticide_clusters_umap.png')
plt.show()


In [62]:
# 11. (Optional) Print cluster assignments
# Show each record's name next to its PCA-space and UMAP-space cluster labels.
assignment_columns = ['Name', 'PCA_Cluster', 'UMAP_Cluster']
print(df[assignment_columns])

                                              Name  PCA_Cluster  UMAP_Cluster
0                                  f-16(Pesticide)            1             2
1                                  FB7 (pesticide)            2             3
2                                       Glyphosate            0             2
3                                       Permethrin            2             0
4                                     Cypermethrin            2             2
...                                            ...          ...           ...
2320  Trimethylammonium 2,4-dichlorophenoxyacetate            0             1
2321                              Caswell No. 413F            2             0
2322                       Lead(IV) arsenate (3:4)            1             0
2323                                           NaN            3             2
2324                                           NaN            2             2

[2325 rows x 3 columns]


In [None]:
# Save cluster members to a text file
df['Cluster'] = clusters

# Take the cluster ids from the labels themselves instead of hard-coding
# range(4), so this cell stays correct if the number of clusters changes.
cluster_ids = [int(cid) for cid in np.unique(clusters)]

# Look up each cluster's member names once and reuse them for both the
# console listing and the text file.
members_by_cluster = {
    cid: df.loc[df['Cluster'] == cid, 'Name'].tolist()
    for cid in cluster_ids
}

for cid, members in members_by_cluster.items():
    print(f"\nMembers of Cluster {cid}:")
    print(members)

# An explicit UTF-8 encoding keeps names with non-ASCII characters from
# failing on platforms whose default text encoding is not UTF-8.
with open('clusters_summary.txt', 'w', encoding='utf-8') as f:
    for cid, members in members_by_cluster.items():
        f.write(f"\nMembers of Cluster {cid}:\n")
        for name in members:
            # str() because a missing name is a float NaN, not a string.
            f.write(str(name) + '\n')