In [None]:
import pandas as pd

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.cluster import KMeans

In [None]:
from sklearn.decomposition import PCA

In [None]:
from sklearn.preprocessing import StandardScaler

# Step 1: Load the dataset

In [None]:
# Read the dataset from 'data.csv' (a path relative to the current working
# directory) into the DataFrame `df` that the rest of the notebook works on.
df = pd.read_csv('data.csv')

In [None]:
# Quick look at the loaded data: the first rows, then the schema summary.
# Header and content are printed from the same cell so they stay together.
print("First 5 rows of the dataset:")
print(df.head())

print("\nInformation about the dataset:")
# DataFrame.info() prints its report itself and returns None, so wrapping the
# call in print() would append a stray "None" line after the summary.
df.info()

# Step 2: Preprocess the data (scaling)

In [None]:
# Names of the columns requested as clustering features. The list is checked
# against the columns actually present in the DataFrame before it is used.
numerical_cols = [
    'Retweets',
    'Likes',
    'Year',
    'Month',
    'Day',
    'Hour',
]

In [None]:
# Split the requested feature names into those present in `df` and those
# absent, keeping the original order in both lists.
available_cols = [col for col in numerical_cols if col in df.columns]
missing_cols = [col for col in numerical_cols if col not in df.columns]
if len(missing_cols) > 0:
    print(f"Warning: The following numerical columns were not found in the dataset: {missing_cols}")
    # Continue with only the columns that really exist in the DataFrame.
    numerical_cols = available_cols


NameError: name 'missing_cols' is not defined

In [None]:
# Stop here when no requested feature column is left: there would be nothing
# to cluster on.
if len(numerical_cols) == 0:
    raise ValueError(
        "No suitable numerical columns found for clustering after initial selection. Please check the dataset columns."
    )

In [None]:
# Feature matrix: every row of `df`, restricted to the selected feature columns.
X = df.loc[:, numerical_cols]

In [None]:
# Scaler used to standardize the feature matrix before clustering. It is kept
# in its own variable so the fitted object stays available to later cells.
scaler = StandardScaler()

In [None]:
# Fit the scaler on X and transform X in one call; the result X_scaled is the
# input to both the K-Means runs and the PCA projection below.
X_scaled = scaler.fit_transform(X)

In [None]:
# Status message marking the end of the preprocessing step.
print("\nData scaled successfully.")

# Step 3: Apply K-Means clustering and determine optimal K (Elbow Method)

In [None]:
# Inertia (within-cluster sum of squares) collected for each candidate K.
inertia = []
# Candidate cluster counts for the elbow method: 1 up to 10. KMeans raises a
# ValueError when n_clusters exceeds the number of samples, so the upper end
# is capped at the row count to keep small datasets from crashing the loop.
max_k = min(10, len(X_scaled))
range_k = range(1, max_k + 1)

In [None]:
# Start from an empty list inside this cell so that re-running it does not
# append a second set of values, which would no longer line up with range_k
# when the elbow curve is plotted.
inertia = []
for k in range_k:
    # n_init is passed explicitly to suppress the warning about its default.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    # inertia_ is the fitted model's within-cluster sum of squares for this K.
    inertia.append(kmeans.inertia_)

In [None]:
# Elbow curve: inertia plotted against the number of clusters, drawn on an
# explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range_k, inertia, marker='o', linestyle='--')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia (Within-Cluster Sum of Squares)')
ax.grid(True)
plt.show()
print("\nElbow Method plot displayed. Look for the 'elbow' point to determine optimal K.")

# Step 4: Apply K-Means clustering with the chosen optimal K

In [None]:
# Hard-coded number of clusters for the final model.
# NOTE(review): presumably read off the elbow plot; revisit if the data changes.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans.fit(X_scaled)
# labels_ holds the cluster index assigned to each row the model was fitted on.
clusters = kmeans.labels_

In [None]:
# Store each row's cluster label on the original DataFrame (this adds or
# overwrites the 'Cluster' column in place) so later cells can group by it.
df['Cluster'] = clusters
print(f"\nK-Means clustering applied with K={optimal_k}. Cluster assignments added to DataFrame.")

# Step 5: Visualize clusters using 2D scatter plots

In [None]:
# Reduce the scaled features to two components so the clusters can be drawn
# on a 2D scatter plot.
pca = PCA(n_components=2)
# Fit the projection and apply it to the same data in a single call.
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Plotting frame: the two PCA coordinates of every row plus its cluster label.
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(Cluster=clusters)

In [None]:
# Carry the sentiment label over to the plotting frame when the dataset has one.
# NOTE(review): this assignment aligns rows by index label, not by position; it
# relies on df and pca_df sharing the same index — confirm if df is ever
# filtered or re-indexed before this point.
if 'Sentiment' in df.columns:
    pca_df['Sentiment'] = df['Sentiment']

In [None]:
# Build the whole figure in a single cell. With the inline backend each cell's
# figure is rendered and closed when the cell finishes, so a title or axis
# label set from a later cell would end up on a new, empty figure instead of
# on the scatter plot.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x='PC1',
    y='PC2',
    hue='Cluster',
    palette='viridis',  # a color palette for clusters
    data=pca_df,
    legend='full',
    alpha=0.7,
    ax=ax,  # draw on the Axes that is titled and labelled below
)
ax.set_title(f'K-Means Clusters (PCA-Reduced Data) with K={optimal_k}')
ax.set_xlabel('Principal Component 1 (PC1)')
ax.set_ylabel('Principal Component 2 (PC2)')
ax.grid(True)
plt.show()
print("\n2D Scatter plot of clusters displayed.")

# Step 6: Interpret the clustering results

In [None]:
# Average of every clustering feature within each cluster, one row per cluster.
features_by_cluster = df.groupby('Cluster')[numerical_cols]
cluster_analysis = features_by_cluster.mean()
print("\nMean values of numerical features per cluster:")
print(cluster_analysis)

In [None]:
# Only computed when the dataset carries a 'Sentiment' column.
if 'Sentiment' in df.columns:
    # Share of each sentiment value inside every cluster (long format) ...
    sentiment_share = df.groupby('Cluster')['Sentiment'].value_counts(normalize=True)
    # ... pivoted to one row per cluster and one column per sentiment value,
    # with 0 for combinations that never occur.
    sentiment_distribution = sentiment_share.unstack(fill_value=0)
    print("\nSentiment distribution (normalized) within each cluster:")
    print(sentiment_distribution)

In [None]:
# Closing status message pointing the reader at the two summaries printed above.
print("\nClustering analysis complete. Review the mean feature values and sentiment distribution for interpretation.")