In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sns

# Load the survey data into a DataFrame.
# The path is relative to the notebook's working directory; judging by the file
# name this is the cleaned summer 2024 survey export.
df = pd.read_csv(filepath_or_buffer='data/summer_survey_2024_cleaned.csv')


In [15]:
# Show the DataFrame's column index to see which survey fields are available
print(df.columns)


Index(['time_taken', 'gender', 'age', 'region', 'urbanicity', 'education',
       'would_vote_for', 'political_orientation', 'ethnicity', 'religion',
       ...
       'neophobia_sampling_new_foods_numeric',
       'neophobia_dont_trust_new_foods_numeric',
       'neophobia_dont_try_unknown_food_numeric',
       'neophobia_particular_about_food_numeric',
       'neophobia_like_foods_different_countries_numeric',
       'meat_attachment_total_score', 'dairy_attachment_total_score',
       'neophobia_total_score', 'has_cat', 'has_dog'],
      dtype='object', length=182)


In [None]:
# Data preprocessing
# Select relevant features for clustering
# Focus: food consumption patterns, attitudes towards meat/dairy, and dietary preferences

# Numeric features used for scaling and K-means.
# Every entry must be numeric: the preprocessing cell calls .median() on each
# column and passes the frame to StandardScaler, and both fail on string values.
features = [
    # Consumption patterns
    'red_meat_consumption_3mo_numeric', 'poultry_consumption_3mo_numeric',
    'seafood_consumption_3mo_numeric', 'dairy_consumption_3mo_numeric',
    'egg_consumption_3mo_numeric', 'pb_meat_consumption_3mo_numeric',
    'pb_milk_consumption_3mo_numeric', 'pb_dairy_consumption_3mo_numeric',
    'fruits_vegetables_consumption_3mo_numeric',

    # Attitudes toward meat and dairy
    'ma_fan_of_meat_numeric', 'ma_feel_bad_eating_meat_numeric', 'ma_meat_right_numeric',
    'ma_meat_irreplaceable_numeric', 'ma_fine_meatless_diet_numeric',
    'da_fan_of_dairy_numeric', 'da_feel_bad_eating_dairy_numeric', 'da_dairy_right_numeric',
    'da_dairy_irreplaceable_numeric', 'da_fine_no_dairy_numeric',

    # Other relevant attitudes
    # NOTE(review): the two *_important columns lack the `_numeric` suffix —
    # confirm they are numerically coded.
    'animal_welfare_important', 'reduce_meat_consumption_important',
    'political_orientation_numeric', 'neophobia_total_score',
]

# Raw categorical columns, deliberately kept OUT of `features`.
# 'political_orientation' holds text labels (e.g. "Liberal"); the others are
# presumably text-coded as well — verify before use. To cluster on them, encode
# them first (e.g. pd.get_dummies) and add the encoded columns to `features`.
categorical_features = [
    'urbanicity', 'would_vote_for', 'political_orientation', 'ethnicity', 'religion',
]

In [3]:
# Keep only the rows that have a value for at least 70% of the selected features
X = df[features].dropna(thresh=len(features) * 0.7)

# Impute whatever is still missing with the median of the respective column
X = X.fillna(X.median())

# Standardize the features: learn the scaling on X, then apply it to X
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [4]:
# Function to determine optimal number of clusters using silhouette score
def find_optimal_clusters(data, max_clusters=10):
    """Fit K-means for every k from 2 to ``max_clusters`` and score each fit.

    Prints the silhouette score obtained for each k and returns the scores as
    a list whose first element corresponds to k=2.
    """
    def score_for(n_clusters):
        # Fixed random_state / n_init so repeated runs give the same labels
        labels = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit_predict(data)
        silhouette_avg = silhouette_score(data, labels)
        print(f"For n_clusters = {n_clusters}, silhouette score is {silhouette_avg}")
        return silhouette_avg

    return [score_for(k) for k in range(2, max_clusters + 1)]

# Score k = 2..8 and keep the resulting list for the plotting and selection cells
print("Finding optimal number of clusters...")
silhouette_scores = find_optimal_clusters(data=X_scaled, max_clusters=8)


Finding optimal number of clusters...
For n_clusters = 2, silhouette score is 0.2149891455849484
For n_clusters = 3, silhouette score is 0.11637462247865331
For n_clusters = 4, silhouette score is 0.11186725740042877
For n_clusters = 5, silhouette score is 0.09326144230863068
For n_clusters = 6, silhouette score is 0.07836831835137299
For n_clusters = 7, silhouette score is 0.07885427356117249
For n_clusters = 8, silhouette score is 0.07950977967012789


In [12]:
# Plot silhouette scores
# The first score belongs to k=2 (the smallest k the search evaluates), so the
# x-axis is derived from the number of scores instead of a hard-coded range
# that would silently go stale if max_clusters is changed.
k_values = range(2, 2 + len(silhouette_scores))
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, silhouette_scores, 'bo-')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Score Method for Optimal k')
fig.savefig('silhouette_scores.png')
plt.close(fig)  # figure is only written to disk, not shown inline

In [5]:
# The score list starts at k=2, so the position of the best score is shifted by 2
# to turn it back into a cluster count
optimal_clusters = 2 + np.argmax(silhouette_scores)
print(f"Optimal number of clusters: {optimal_clusters}")

Optimal number of clusters: 2


In [6]:
# Fit the final K-means model with the selected k and attach each row's label
# to a copy of the (unscaled) feature frame
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=42)
df_clustered = X.assign(cluster=kmeans.fit_predict(X_scaled))

In [7]:
# Apply PCA for visualization
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)
# Build the frame on df_clustered's index: that frame keeps the original row
# labels left after dropna, and pandas aligns on index labels when a Series is
# assigned as a column. A default 0..n-1 index here would attach cluster labels
# to the wrong points (or NaN) whenever rows were dropped.
pca_df = pd.DataFrame(
    data=principal_components,
    columns=['PC1', 'PC2'],
    index=df_clustered.index,
)
pca_df['cluster'] = df_clustered['cluster']

# Plot clusters in the plane of the first two principal components
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(x='PC1', y='PC2', hue='cluster', data=pca_df, palette='viridis', s=100, ax=ax)
ax.set_title(f'Clusters of Dietary Patterns (K={optimal_clusters})')
fig.savefig('diet_clusters.png')
plt.close(fig)  # figure is only written to disk, not shown inline

In [8]:
# Analyze clusters: average every feature within each cluster
cluster_profiles = df_clustered.groupby(by='cluster').mean()
print("\nCluster Profiles:", cluster_profiles, sep="\n")

# Persist the per-cluster means to CSV
cluster_profiles.to_csv(path_or_buf='cluster_profiles.csv')



Cluster Profiles:
         red_meat_consumption_3mo_numeric  poultry_consumption_3mo_numeric  \
cluster                                                                      
0                                3.263482                         3.889060   
1                                1.257485                         2.077844   

         seafood_consumption_3mo_numeric  dairy_consumption_3mo_numeric  \
cluster                                                                   
0                               2.781202                       5.163328   
1                               1.934132                       4.332335   

         egg_consumption_3mo_numeric  pb_meat_consumption_3mo_numeric  \
cluster                                                                 
0                           3.656394                         0.531587   
1                           3.056886                         2.350299   

         pb_milk_consumption_3mo_numeric  pb_dairy_consumption_3mo_numeric

In [13]:
# Radar chart to visualize cluster differences
def radar_chart(cluster_profiles, category_names):
    """Draw a radar (polar) chart of per-cluster feature means and save it as a PNG.

    Parameters
    ----------
    cluster_profiles : pandas.DataFrame
        One row per cluster and one column per feature.
    category_names : sequence of str
        Columns of ``cluster_profiles`` to plot; each becomes one spoke.

    Returns
    -------
    None
        The figure is written to 'cluster_radar_chart.png' and then closed.
    """
    # Honour the caller's selection instead of reaching for a notebook global
    categories = list(category_names)
    subset = cluster_profiles[categories]

    # Standardise each feature across the cluster rows so that features on
    # different scales share one radial axis.
    # NOTE: the scaler sees one row per cluster, so with exactly two clusters
    # every feature that differs between them maps to -1 / +1; the chart then
    # shows only the direction of each difference, not its size.
    scaler_radar = StandardScaler()
    subset_scaled = pd.DataFrame(scaler_radar.fit_transform(subset),
                                 index=subset.index,
                                 columns=subset.columns)

    # One evenly spaced angle per feature; repeat the first to close the polygon
    N = len(categories)
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(12, 10), subplot_kw=dict(polar=True))

    # One labelled spoke per feature (the closing angle gets no label of its own)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, size=12)

    # Put the radial tick labels along the 0-degree direction
    ax.set_rlabel_position(0)

    # One outline plus a translucent fill per cluster
    for cluster in subset_scaled.index:
        values = subset_scaled.loc[cluster].values.tolist()
        values += values[:1]  # Close the loop
        ax.plot(angles, values, linewidth=2, linestyle='solid', label=f'Cluster {cluster}')
        ax.fill(angles, values, alpha=0.1)

    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    ax.set_title('Cluster Profiles: Key Dietary and Attitude Features', size=15)
    fig.savefig('cluster_radar_chart.png')
    plt.close(fig)

# Render and save the radar chart for the per-cluster feature means
radar_chart(cluster_profiles, category_names=features)

In [None]:
# Add demographic analysis of clusters
def _most_common(values):
    """Return the most frequent non-null value in ``values``, or NaN if there is none."""
    counts = values.value_counts()
    return counts.index[0] if len(counts) else np.nan


demographic_columns = ['age', 'gender', 'education', 'political_orientation']
missing_demographics = [column for column in demographic_columns if column not in df.columns]

if not missing_demographics:
    # Join original demographic data with clusters.
    # df_clustered keeps df's index labels for the rows that survived dropna,
    # so rows are selected by label (.loc); .iloc would interpret those labels
    # as positions, which is only correct for a default 0..n-1 index.
    demographic_df = df.loc[df_clustered.index, demographic_columns].copy()
    demographic_df['cluster'] = df_clustered['cluster']

    # Analyze demographics by cluster: mean age, most common category otherwise
    demographic_analysis = demographic_df.groupby('cluster').agg({
        'age': 'mean',
        'gender': _most_common,
        'education': _most_common,
        'political_orientation': _most_common,
    })

    print("\nDemographic Analysis by Cluster:")
    print(demographic_analysis)
    demographic_analysis.to_csv('cluster_demographics.csv')
else:
    # Say why nothing was produced instead of failing with a KeyError
    print(f"Skipping demographic analysis; missing columns: {missing_demographics}")


Demographic Analysis by Cluster:
               age  gender                        education  \
cluster                                                       
0        47.108025    male  Bachelor's degree or equivalent   
1        46.582583  female  Bachelor's degree or equivalent   

        political_orientation  
cluster                        
0             Liberal-leaning  
1                     Liberal  

Clustering analysis complete. Results saved to CSV files and images.
