In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

: 

In [None]:
# Load the dataset from the local data directory and preview the first rows.
# NOTE(review): the public UCI copy of this dataset is ';'-delimited — confirm
# this file is comma-separated, otherwise read_csv needs sep=';'.
df = pd.read_csv('data//dataset_Facebook.csv')
df.head()

In [None]:
# Inspect the column labels as read from the file.
df.columns

In [None]:
# Replace the file's headers with short aliases. The assignment is positional,
# so the order of this list must match the column order of the frame.
short_column_names = [
    'page_TL', 'type', 'category',
    'post_M', 'post_W', 'post_H',
    'paid',
    'LPTR', 'LPTI', 'LEU',
    'LP_Consumers', 'LP_consumptions',
    'LPI_PLpage', 'LPR_PLpage', 'L_PLP_EP',
    'comment', 'like', 'share', 'TI',
]
df.columns = short_column_names
df.head()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Frequency of each post type. Series.value_counts is the documented way to
# count a single column; DataFrame.value_counts expects column labels as
# `subset`, not a Series of values.
df['type'].value_counts()

In [None]:
# Print dtypes and non-null counts for every column.
df.info()

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [None]:
# Summary statistics of the numeric columns after dropping incomplete rows.
df.describe()

In [None]:
# Frequency of each value of the 'paid' column. Series.value_counts is the
# documented way to count a single column; DataFrame.value_counts expects
# column labels as `subset`, not a Series of values.
df['paid'].value_counts()

In [None]:
# Frequency of each post type in the current frame. Series.value_counts is the
# documented way to count a single column; DataFrame.value_counts expects
# column labels as `subset`, not a Series of values.
df['type'].value_counts()

In [None]:
# Encode the post type as an integer code (labels missing from the mapping
# become NaN), then discard the post_M / post_W / post_H columns.
type_codes = {'Link': 0, 'Photo': 1, 'Status': 2, 'Video': 3}
df['type'] = df['type'].map(type_codes)
df = df.drop(columns=['post_M', 'post_W', 'post_H'])

##### ♦️ KMeans clustering

In [None]:
from sklearn.cluster import KMeans
from kneed import KneeLocator

# Elbow search: fit KMeans for every candidate k and record the within-cluster
# sum of squares (inertia), then let KneeLocator pick the elbow of the curve.
k_values = range(1, 31)
wcss = []

for i in k_values:
    km = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=0)
    km.fit(df)
    wcss.append(km.inertia_)

kl = KneeLocator(k_values, wcss, curve='convex', direction='decreasing')
k_point = kl.elbow
print(k_point)

plt.plot(k_values, wcss, marker='o')
# KneeLocator reports None when it cannot find an elbow; only draw the marker
# line when a value is available so the plot is still produced.
if k_point is not None:
    plt.axvline(x=k_point, color='r', linestyle='--')
plt.title('WCSS vs Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Fit the final 5-cluster model and store each row's label in df['cluster'].
# Any existing 'cluster' column is excluded from the features so that
# re-running this cell does not cluster on its own previous labels.
km = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=0)
model = km.fit(df.drop(columns='cluster', errors='ignore'))
df['cluster'] = model.labels_
df['cluster'].value_counts()

In [None]:
# Mean of every feature within each cluster. The grouping key is left out of
# the aggregated columns (selecting df.columns would also average 'cluster'
# itself, adding a redundant row to the transposed table).
cluster_summary = df.groupby('cluster').mean()
cluster_summary.T

In [None]:
# Frequency of each encoded post type inside every cluster.
type_by_cluster = df.groupby('cluster')['type']
cluster_summary_of_type = type_by_cluster.value_counts()
cluster_summary_of_type

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette of the KMeans partition. The 'cluster' column holds the labels
# being evaluated, so it must not be part of the feature matrix: leaving it in
# leaks the labels into the distance computation.
ss_km = silhouette_score(df.drop(columns='cluster'), df['cluster'])
print(ss_km)

In [None]:
# TI of every post, grouped along the x-axis by its KMeans cluster label.
plt.plot(df['cluster'], df['TI'], 'o')
plt.title('TI per cluster')
plt.xlabel('cluster')
plt.ylabel('TI')
plt.show()

In [None]:
# LPTR against LPTI, one point per post, coloured by KMeans cluster label.
fig, ax = plt.subplots()
ax.scatter(df['LPTR'], df['LPTI'], c=df['cluster'], s=50, cmap='viridis')
ax.set_title('Clusters based on LPTR and LPTI')
ax.set_xlabel('LPTR')
ax.set_ylabel('LPTI')
plt.show()

For the green and blue clusters, the lifetime total reach is low but the impressions are high.

In [None]:
# For clusters 1 and 3, print the category counts split by a boolean grouper:
# the False group is every other cluster, the True group is the cluster itself.
for cluster_id in (1, 3):
    in_cluster = df['cluster'].eq(cluster_id)
    print(df.groupby(in_cluster)['category'].value_counts())

In [None]:
from sklearn.decomposition import PCA

# Project the features (everything except the cluster label) onto the first
# two principal components and colour each point by its KMeans cluster.
X = df.drop(columns=['cluster'])
pca = PCA(n_components=2)
pca_scaled = pca.fit_transform(X)

fig, ax = plt.subplots()
ax.scatter(pca_scaled[:, 0], pca_scaled[:, 1], c=df['cluster'], s=50, cmap='viridis')
ax.set_title('PCA of Clusters')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()

Categories 3 and 1 have high impressions and low reach.

In [None]:
# Remove the KMeans label plus the comment/like/share and page_TL columns
# before the remaining features are standardized.
df = df.drop(columns=['cluster', 'comment', 'like', 'share', 'page_TL'])

In [None]:
# Display the remaining feature frame.
df

##### ♦️ Standardize the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every remaining column; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(df)
x_scaled = scaler.transform(df)
x_scaled.shape

In [None]:
from sklearn.decomposition import PCA
# Reduce the standardized features to two principal components for plotting.
pca = PCA(n_components=2)
pca_scaled = pca.fit_transform(x_scaled)
# Expect (n_rows, 2).
pca_scaled.shape

In [None]:
# Standardized data projected on its first two principal components; title and
# axis labels are set so the figure stands on its own, and plt.show() keeps the
# artist repr out of the cell output.
plt.scatter(pca_scaled[:, 0], pca_scaled[:, 1])
plt.title('PCA of Standardized Features')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

##### ♦️ Hierarchical clustering

In [None]:
import scipy.cluster.hierarchy as sc

# Build the Ward-linkage dendrogram from the full standardized feature matrix,
# i.e. the same data AgglomerativeClustering is fitted on and the silhouette is
# scored on, so the tree used to choose k describes the data that actually
# gets clustered rather than its 2-D PCA projection.
plt.figure(figsize=(10, 7))
sc.dendrogram(sc.linkage(x_scaled, method='ward'))
plt.title('Dendrogram')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()

According to the dendrogram, the value of k is 2.

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Two-cluster Ward agglomerative model fitted on the standardized features.
hc_params = {'n_clusters': 2, 'linkage': 'ward'}
hc = AgglomerativeClustering(**hc_params)
hc.fit(x_scaled)

In [None]:
# Cluster label assigned to each row by the agglomerative model.
hc.labels_

In [None]:
# PCA projection of the standardized data, coloured by agglomerative label.
fig, ax = plt.subplots()
ax.scatter(pca_scaled[:, 0], pca_scaled[:, 1], c=hc.labels_, s=50, cmap='viridis')
ax.set_title('Agglomerative Clustering of PCA Components')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette of the agglomerative partition on the standardized features.
hc_labels = hc.labels_
ss_hc = silhouette_score(x_scaled, hc_labels)
print(ss_hc)

In [None]:
# Correlation of every column with L_PLP_EP, strongest positive first.
correlations = df.corr()
correlations['L_PLP_EP'].sort_values(ascending=False)

In [None]:
# L_PLP_EP on the x-axis against LPTR on the y-axis, markers only.
fig, ax = plt.subplots()
ax.plot(df['L_PLP_EP'], df['LPTR'], 'o')
ax.set_title('L_PLP_EP vs LPTR')
ax.set_xlabel('Lifetime People who have liked your Page and engaged with your post')
ax.set_ylabel('Lifetime Post Total Reach')
plt.show()

This plot mostly shows that when the lifetime post reach is low, the number of lifetime people who have liked your Page and engaged with your post is also low.

In [None]:
reach_threshold = 60000
engaged_threshold = 1500

# Colour rule per post:
#   green - reach below its threshold and engagement above its threshold
#   red   - reach above its threshold and engagement below its threshold
#   blue  - everything else (including values exactly on a threshold)
low_reach_high_engagement = (df['LPTR'] < reach_threshold) & (df['L_PLP_EP'] > engaged_threshold)
high_reach_low_engagement = (df['LPTR'] > reach_threshold) & (df['L_PLP_EP'] < engaged_threshold)
colors = np.select(
    [low_reach_high_engagement, high_reach_low_engagement],
    ['green', 'red'],
    default='blue',
).tolist()

fig, ax = plt.subplots()
ax.scatter(df['L_PLP_EP'], df['LPTR'], c=colors, s=50)
ax.set_title('Scatter plot of LPTR and L_PLP_EP with thresholds')
ax.set_xlabel('Lifetime People who have liked your Page and engaged with your post')
ax.set_ylabel('Lifetime Post Total Reach')
plt.show()

🌟 In this graph, green data points have low reach but high engagement, so these are good posts for people or for the organization. Red data points have high reach but low engagement, so these posts do not entertain people. Blue data points are average posts that increase exponentially.

##### ♦️ DBSCAN clustering

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering of the standardized features; `labels` holds one
# label per row.
dbscan = DBSCAN(eps=1.3, min_samples=5).fit(x_scaled)
labels = dbscan.labels_

In [None]:
# Boolean mask of the rows whose DBSCAN label is not -1.
x = labels != -1

In [None]:
from sklearn.decomposition import PCA

# Re-project the standardized data to 2-D, keep only the rows selected by the
# mask `x`, and colour them by their DBSCAN label.
pca = PCA(n_components=2)
pca_scaled = pca.fit_transform(x_scaled)
pca_filtered, labels_filtered = pca_scaled[x], labels[x]

fig, ax = plt.subplots()
ax.scatter(pca_filtered[:, 0], pca_filtered[:, 1], c=labels_filtered, s=50, cmap='viridis')
ax.set_title('DBSCAN Clustering of PCA Components')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# DBSCAN marks noise samples with the label -1. Noise is not a cluster, so it
# is excluded before scoring; the score therefore covers clustered points only.
# silhouette_score needs at least two distinct labels, so fall back to NaN
# instead of raising when DBSCAN found fewer than two clusters.
clustered = labels != -1
if len(set(labels[clustered])) >= 2:
    ss_db = silhouette_score(x_scaled[clustered], labels[clustered])
else:
    ss_db = float('nan')
print(ss_db)

In [None]:
# Side-by-side comparison of the three silhouette scores.
# NOTE(review): ss_km is computed on the unscaled frame while ss_hc and ss_db
# are computed on x_scaled, so the bars are not strictly comparable.
score = [ss_km, ss_hc, ss_db]
method = ['KMeans', 'Agglomerative', 'DBSCAN']

plt.bar(method, score, color=['green', 'gray', 'red'], width=0.4)
# Silhouette scores range from -1 to 1; a lower limit of 0 would hide any
# negative score, so show the full range with a zero reference line.
plt.ylim(-1, 1)
plt.axhline(0, color='black', linewidth=0.8)
plt.title('Silhouette Score of Different Clustering Methods')
plt.xlabel('Clustering Method')
plt.ylabel('Silhouette Score')
plt.show()

🌟 The KMeans method has the highest silhouette score for this data, and DBSCAN has a lower score than the KMeans and agglomerative methods for this particular data.

In [None]:
# Row(s) whose TI equals the maximum TI in the frame.
highest_ti = df['TI'].max()
df.loc[df['TI'].eq(highest_ti)]

Category 2 has the highest interaction and it is paid. In this post, the total reach is lower than the total impressions, so I think it has fake interactions.