In [1]:
from itertools import combinations
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
from tqdm import tqdm
# Exhaustive search over fixed-size feature subsets and cluster counts:
# for every subset of `selected_features` of size NUMBER_OF_FEATURES, the
# subset is standardized, KMeans is fitted for each K in K_VALUES, and the
# silhouette score of the resulting labelling is recorded.
#
# Results left in the namespace:
#   combinations_scores  list of (feature tuple, K, silhouette score),
#                        sorted by score, best first
#   best_combo / best_features, best_k, best_score  the top-ranked entry

# --- Configuration ----------------------------------------------------------
DATA_PATH = 'dataset/incident_profile_95_cleaned.csv'
NUMBER_OF_FEATURES = 7      # size of each candidate feature subset
K_VALUES = range(2, 11)     # cluster counts to try: 2..10 inclusive
RANDOM_STATE = 42           # fixed seed so KMeans initialisation is repeatable

df = pd.read_csv(DATA_PATH)

selected_features = ['avg_age', 'avg_crash_severity_score', 'avg_lighting_severity',
           'combined_weather_road_severity', 'fatality_rate', 'speeding_influence',
           'damage_cost_LB', 'weekend_crash_rate', 'pct_neo_patented_drivers']

# Fail fast: a misspelled/missing column would otherwise only surface once the
# loop reaches the first combination containing it, possibly minutes in.
missing_features = [name for name in selected_features if name not in df.columns]
if missing_features:
    raise KeyError(f'Columns not found in {DATA_PATH}: {missing_features}')

# Materialize the combinations once; tqdm takes its total from len() of the
# list, so the combinations no longer have to be enumerated a second time.
feature_combos = list(combinations(selected_features, NUMBER_OF_FEATURES))
if not feature_combos:
    raise ValueError(
        f'No feature combinations of size {NUMBER_OF_FEATURES} can be drawn '
        f'from {len(selected_features)} selected features'
    )

# Store all combinations and their scores
combinations_scores = []

# NOTE(review): this search is expensive (the recorded run took ~36 minutes).
# silhouette_score accepts a `sample_size` argument that would reduce the cost,
# but using it would change the reported scores, so it is left off here.
for combo in tqdm(feature_combos, desc=f'Feature combinations of size {NUMBER_OF_FEATURES}'):
    X = df[list(combo)]

    # Standardize the features so each one has zero mean and unit variance
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Test different values of K
    for k in K_VALUES:
        # NOTE(review): n_init is left at the library default, which differs
        # between scikit-learn releases; pin it if results must be comparable
        # across environments.
        kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE)
        kmeans.fit(X_scaled)
        score = silhouette_score(X_scaled, kmeans.labels_)

        combinations_scores.append((combo, k, score))

# Sort combinations by score in descending order (the sorted list is what the
# next cell exports, so the ordering is kept rather than just taking max()).
combinations_scores.sort(key=lambda x: x[2], reverse=True)

# Report the best combination. `best_features` mirrors `best_combo` so that
# the name actually holds the winning subset instead of staying None.
best_combo, best_k, best_score = combinations_scores[0]
best_features = best_combo
print(f'Best Silhouette Score: {best_score}')
print(f'Best Feature Set: {best_combo}')
print(f'Best K: {best_k}')

Feature combinations of size 7: 100%|██████████| 36/36 [36:38<00:00, 61.07s/it]

Best Silhouette Score: 0.502052683871164
Best Feature Set: ('avg_age', 'avg_crash_severity_score', 'combined_weather_road_severity', 'fatality_rate', 'speeding_influence', 'damage_cost_LB', 'pct_neo_patented_drivers')
Best K: 2





In [2]:
# Persist every (feature set, K, silhouette score) triple collected by the
# search cell, in the order the list currently has.
# The file name is derived from NUMBER_OF_FEATURES so that a run with a
# different subset size does not overwrite these results under a misleading
# name; for the current value of 7 this is still 'combinations_scores7.csv'.
combinations_scores_df = pd.DataFrame(combinations_scores, columns=['Features', 'K', 'Score'])
combinations_scores_df.to_csv(f'combinations_scores{NUMBER_OF_FEATURES}.csv', index=False)