In [None]:
# ALL 3 SIMILARITIES

import pandas as pd
import numpy as np
import time
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score
import networkx as nx
import torch
import torch.nn as nn
import torch.optim as optim

# Load the influencer dataset from a hard-coded path. The rest of this cell
# reads its NAME, FOLLOWERS and ER columns.
df = pd.read_csv("/content/sample_data/insta dataset new - Sheet1.csv")  # Ensure correct dataset path

# Convert necessary columns to numeric (handling percentage ER)
def convert_to_numeric(value):
    """Convert a formatted metric string such as ``"1.5M"`` or ``"3.2%"`` to a float.

    Non-string inputs are returned unchanged. For strings:

    * ``"12.5%"`` -> ``12.5`` (the percent sign is dropped; the value is not
      divided by 100);
    * ``"1.5M"`` -> ``1500000.0`` and ``"20K"`` -> ``20000.0``;
    * a plain number such as ``"1234"`` or ``"1,234"`` -> ``1234.0``;
    * ``"-"`` and any text that cannot be parsed -> ``np.nan``.

    Previously a plain numeric string was handed back as a ``str`` and a
    non-numeric string containing ``%``, ``M`` or ``K`` raised ``ValueError``.
    """
    if not isinstance(value, str):
        return value

    text = value.replace(',', '').strip()  # drop thousands separators
    if text == '-':
        return np.nan

    # Same precedence as before: '%' is checked first, then 'M', then 'K'.
    multiplier = 1.0
    for marker, scale in (('%', 1.0), ('M', 1e6), ('K', 1e3)):
        if marker in text:
            text = text.replace(marker, '')
            multiplier = scale
            break

    try:
        return float(text) * multiplier
    except ValueError:
        return np.nan

# Parse the formatted FOLLOWERS / ER strings into numbers. Series.map is applied
# column by column because DataFrame.applymap is deprecated in recent pandas and
# emits a FutureWarning on every run.
df[['FOLLOWERS', 'ER']] = df[['FOLLOWERS', 'ER']].apply(
    lambda column: column.map(convert_to_numeric)
)

# Step 1: Clustering based on engagement metrics
start_time = time.time()
# Missing values are replaced by 0 before the two columns are clustered.
X = df[['FOLLOWERS', 'ER']].fillna(0)
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X)
clustering_time = time.time() - start_time

# Step 2: CNN Model for Feature Extraction
class CNNModel(nn.Module):
    """Single-layer 1-D CNN: Conv1d -> ReLU -> MaxPool1d -> Linear.

    Args:
        input_size: Length of each single-channel input sequence.
        num_classes: Number of outputs produced by the final linear layer.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        conv_channels = 16
        # The convolution keeps the length (kernel 3, padding 1); the pooling
        # layer (kernel 2, stride 2) then halves it.
        pooled_length = input_size // 2
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=conv_channels, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(conv_channels * pooled_length, num_classes)

    def forward(self, x):
        """Map a ``(batch, 1, input_size)`` tensor to ``(batch, num_classes)`` scores."""
        features = self.pool(self.relu(self.conv1(x)))
        flattened = features.view(features.size(0), -1)
        return self.fc1(flattened)

# Training CNN Model
# NOTE(review): every batch is random noise with random labels, and `model` is
# not referenced again in the visible code, so this step appears to contribute
# only its wall-clock time -- confirm whether real features were intended.
start_time = time.time()
input_size, num_classes = 2, 5
model = CNNModel(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(50):
    optimizer.zero_grad()  # clear gradients left over from the previous step
    inputs = torch.randn(100, 1, input_size)
    targets = torch.randint(0, num_classes, (100,))
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
cnn_time = time.time() - start_time

# Step 3: Cosine & Jaccard Similarity Calculation
# The timer started here also covers the Jaccard computation further down.
start_time = time.time()
# TF-IDF vectors are built from the NAME strings only.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['NAME'])

# Cosine Similarity
# Pairwise similarity between all rows; each row's score is its mean
# similarity to every row, itself included.
cosine_sim = cosine_similarity(tfidf_matrix)
df['cosine_score'] = cosine_sim.mean(axis=1)

# Jaccard Similarity
def jaccard_similarity(a, b):
    """Return the Jaccard index of the lower-cased, whitespace-split tokens of two strings.

    The result is 0 when neither string contains a token.
    """
    tokens_a = set(a.lower().split())
    tokens_b = set(b.lower().split())
    combined = tokens_a | tokens_b
    if len(combined) > 0:
        return len(tokens_a & tokens_b) / len(combined)
    return 0

# Mean Jaccard similarity of each name against every name (itself included).
df['jaccard_score'] = df['NAME'].apply(lambda x: np.mean([jaccard_similarity(x, y) for y in df['NAME']]))
similarity_time = time.time() - start_time

# Step 4: Common Neighbors Calculation (Graph-based similarity)
start_time = time.time()
G = nx.Graph()
# Read the NAME column once. The previous version called df.iloc[i]['NAME']
# inside the pair loop, building a row Series for each of the ~n^2 lookups, and
# filtered on `i != j`, which can never fail because j starts at i + 1.
names = df['NAME'].tolist()
# Every unordered pair of rows becomes an edge, in the same order as before.
edges = [(first, second) for i, first in enumerate(names) for second in names[i + 1:]]
G.add_edges_from(edges)

def common_neighbors_score(node):
    """Average, over the neighbours of ``node`` in the module-level graph ``G``,
    of the number of neighbours each one shares with ``node``.

    Returns 0 when ``node`` is not in ``G``.
    """
    if not G.has_node(node):
        return 0
    shared_counts = [
        len(list(nx.common_neighbors(G, node, neighbor)))
        for neighbor in G.neighbors(node)
    ]
    return np.mean(shared_counts)

# Score every name; the function is passed directly instead of through a lambda.
df['common_neighbors'] = df['NAME'].apply(common_neighbors_score)
graph_time = time.time() - start_time

# Step 5: Weighted Score Calculation
# The weighted score is simply the follower count.
start_time = time.time()
df['weighted_score'] = df['FOLLOWERS']
final_score_time = time.time() - start_time

# Step 6: Final Score Calculation & Metrics
start_time = time.time()
# Linear blend of the four per-row signals.
# NOTE(review): weighted_score is the raw FOLLOWERS count, while the two text
# similarity terms are means of values between 0 and 1, so the 0.6 term
# presumably dominates the sum -- confirm whether the inputs should be
# normalised before they are weighted.
df['final_score'] = (
    (df['weighted_score'] * 0.6) +
    (df['cosine_score'] * 0.16) +
    (df['jaccard_score'] * 0.14) +
    (df['common_neighbors'] * 0.1)
)

# Highest score gets rank 1; ties use pandas' default ranking method.
df['predicted_rank'] = df['final_score'].rank(ascending=False)
# The ground-truth rank is the row's index label plus one, i.e. the order in
# which the rows appear in the loaded frame.
df['actual_rank'] = df.index + 1

# A row is "top" when its rank falls within the first 10% of the row count.
top_10_percent = int(len(df) * 0.1)
df['actual_top'] = df['actual_rank'] <= top_10_percent
df['predicted_top'] = df['predicted_rank'] <= top_10_percent

y_true = df['actual_top'].astype(int)
y_pred = df['predicted_top'].astype(int)

# Compute Metrics
# All metrics are expressed as percentages.
precision = precision_score(y_true, y_pred) * 100
accuracy = accuracy_score(y_true, y_pred) * 100
f1 = f1_score(y_true, y_pred) * 100
recall = recall_score(y_true, y_pred) * 100

# Specificity Calculation
# TN / (TN + FP), falling back to 0 when there are no actual negatives.
tn = ((y_true == 0) & (y_pred == 0)).sum()
fp = ((y_true == 0) & (y_pred == 1)).sum()
specificity = (tn / (tn + fp)) * 100 if (tn + fp) > 0 else 0

metrics_time = time.time() - start_time

# Total Execution Time
# Sum of the individually timed steps (seconds); untimed work such as loading
# the CSV is not included.
total_execution_time = clustering_time + cnn_time + similarity_time + graph_time + final_score_time + metrics_time

# Display the final rankings and metrics
print("\nUpdated Rankings - Key Influencers:\n", df[['NAME', 'final_score', 'predicted_rank']].sort_values(by='predicted_rank'))
# The metric print-outs below are currently disabled; the metrics themselves
# are still computed above.
# print(f"\nPrecision: {precision:.2f}%")
# print(f"Accuracy: {accuracy:.2f}%")
# print(f"F1-Score: {f1:.2f}%")
# print(f"Recall: {recall:.2f}%")
# print(f"Specificity: {specificity:.2f}%\n")

# Display execution time breakdown
# NOTE(review): final_score_time is measured around the weighted-score
# assignment (Step 5); the final-score blend itself is timed as part of
# metrics_time, so the "Step 5" label below may be misleading.
print(f"Total Execution Time: {total_execution_time:.4f} seconds")
print(f"Step 1 (Clustering): {clustering_time:.4f} seconds")
print(f"Step 2 (CNN Training): {cnn_time:.4f} seconds")
print(f"Step 3 (Similarity Calculation): {similarity_time:.4f} seconds")
print(f"Step 4 (Graph Analysis): {graph_time:.4f} seconds")
print(f"Step 5 (Final Score Calculation): {final_score_time:.4f} seconds")
print(f"Step 6 (Metrics Calculation): {metrics_time:.4f} seconds")

  df[['FOLLOWERS', 'ER']] = df[['FOLLOWERS', 'ER']].applymap(convert_to_numeric)



Updated Rankings - Key Influencers:
                                  NAME   final_score  predicted_rank
0           Virat Kohli\n@virat.kohli  1.620001e+08             1.0
1        Narendra Modi\n@narendramodi  5.544010e+07             2.0
2           Alia Bhatt 💛\n@aliaabhatt  5.172010e+07             3.0
3          Katrina Kaif\n@katrinakaif  4.824010e+07             4.0
4    दीपिका पादुकोण\n@deepikapadukone  4.824010e+07             5.0
..                                ...           ...             ...
987         Mehak Sayal\n@mehakgupta_  8.400998e+05           994.0
966                 ALOK\n@alokvmenon  8.400998e+05           994.0
970         Sharwanand\n@imsharwanand  8.400998e+05           994.0
972            Itsmecutie\n@nira_jain  8.400998e+05           994.0
971     Actor Soori\n@soorimuthuchamy  8.400998e+05           994.0

[1000 rows x 3 columns]
Total Execution Time: 197.2202 seconds
Step 1 (Clustering): 0.0308 seconds
Step 2 (CNN Training): 0.1039 seconds
Step 3 (

In [None]:
# COMMON NEIGHBOURS
import pandas as pd
import numpy as np
import time
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score, confusion_matrix
import networkx as nx
from numba import njit

# Measure execution time
# This single timer spans the whole cell, including the CSV load below; it is
# read once, just before the results are printed.
start_time = time.time()

# Load dataset
df = pd.read_csv("/content/sample_data/insta dataset new - Sheet1 (2).csv")  # Update file path

# Convert necessary columns to numeric
def convert_to_numeric(value):
    """Convert a formatted metric string to a float.

    Non-string inputs are returned unchanged. For strings, thousands
    separators are removed first, then:

    * ``"12.5%"`` -> ``0.125`` (percentages become fractions);
    * ``"1.5M"`` -> ``1500000.0`` and ``"20K"`` -> ``20000.0``;
    * a plain number such as ``"1,234"`` -> ``1234.0``;
    * ``"-"`` and any text that cannot be parsed -> ``np.nan``.

    The suffix branches used to call ``float()`` outside the ``try`` block, so
    a non-numeric string containing ``%``, ``M`` or ``K`` raised ``ValueError``
    instead of yielding NaN.
    """
    if not isinstance(value, str):
        return value

    text = value.replace(',', '').strip()  # Remove commas
    if text == '-':
        return np.nan  # Handle missing values

    try:
        if '%' in text:
            return float(text.replace('%', '')) / 100  # Convert percentage to decimal
        if 'M' in text:
            return float(text.replace('M', '')) * 1e6
        if 'K' in text:
            return float(text.replace('K', '')) * 1e3
        return float(text)
    except ValueError:
        return np.nan

# Parse the formatted FOLLOWERS / ER strings into numbers. Series.map is applied
# column by column because DataFrame.applymap is deprecated in recent pandas and
# emits a FutureWarning on every run.
df[['FOLLOWERS', 'ER']] = df[['FOLLOWERS', 'ER']].apply(
    lambda column: column.map(convert_to_numeric)
)

# Clustering
# Missing values are replaced by 0 before the two columns are clustered.
X = df[['FOLLOWERS', 'ER']].fillna(0)
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X)

# ---- Optimized Graph-Based Common Neighbors Calculation ----
# Read the NAME column once; the previous loop called df.iloc[...]['NAME'] twice
# for each of the ~n^2 / 2 pairs.
names = df['NAME'].tolist()
num_rows = len(names)

G = nx.Graph()
G.add_nodes_from(names)

# Adjacency matrix of the complete graph: every row is linked to every other
# row, so all off-diagonal entries are 1 and the diagonal is 0. Built in one
# vectorised step instead of a Python double loop; dtype stays float64.
adj_matrix = np.ones((num_rows, num_rows), dtype=np.float64) - np.eye(num_rows, dtype=np.float64)
name_to_index = {name: i for i, name in enumerate(names)}

# Mirror the same pairs into the networkx graph, in the original insertion order.
G.add_edges_from(
    (first, second)
    for position, first in enumerate(names)
    for second in names[position + 1:]
)
# Compute common neighbors using adjacency matrix
@njit
def compute_common_neighbors(matrix):
    # Entry (i, j) of the squared adjacency matrix counts the nodes that are
    # adjacent to both i and j.
    squared = matrix @ matrix
    return squared

common_neighbors_matrix = compute_common_neighbors(adj_matrix)
# Row totals of the squared adjacency matrix, one value per influencer.
df['common_neighbors'] = common_neighbors_matrix.sum(axis=1)

# Weighted Score Calculation
# Followers multiplied by engagement rate; a missing value in either column
# propagates as NaN.
df['weighted_score'] = df['FOLLOWERS'] * df['ER']

# Final Score Calculation
# Equal-weight blend of the two signals.
df['final_score'] = 0.5 * df['weighted_score'] + 0.5 * df['common_neighbors']

# Rank influencers
# Dense ranking: tied scores share a rank and no rank numbers are skipped.
df['rank'] = df['final_score'].rank(method="dense", ascending=False)

# Define Top Influencers (Ground Truth & Prediction)
# Ground truth ranks rows by ER; the prediction ranks them by final_score.
# NOTE(review): with method="dense" tied values share a rank, so
# "rank <= top_10_percent" can select more than 10% of the rows -- confirm
# that this is the intended definition of "top 10%".
top_10_percent = int(len(df) * 0.1)
df['actual_top'] = df['ER'].rank(ascending=False, method="dense") <= top_10_percent
df['predicted_top'] = df['final_score'].rank(ascending=False, method="dense") <= top_10_percent

# Convert to binary labels
y_true = df['actual_top'].astype(int)
y_pred = df['predicted_top'].astype(int)

# Compute Metrics
precision = precision_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# Fix the label order so the matrix is always 2x2. Without `labels`, a run in
# which only one class occurs yields a 1x1 matrix and cm[0, 1] raises IndexError.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
true_negatives, false_positives = cm[0, 0], cm[0, 1]
actual_negatives = true_negatives + false_positives
# TN / (TN + FP); reported as 0 when there are no actual negatives.
specificity = true_negatives / actual_negatives if actual_negatives > 0 else 0

# Measure execution time
execution_time = time.time() - start_time

# Display results
# Rankings are listed best-first; the metrics are fractions rendered as
# percentages by the ".2%" format.
print("\nUpdated Rankings - Key Influencers:\n", df[['NAME', 'final_score', 'rank']].sort_values(by='rank'))
print(f"\nPrecision: {precision:.2%}")
print(f"Accuracy: {accuracy:.2%}")
print(f"F1-Score: {f1:.2%}")
print(f"Recall: {recall:.2%}")
print(f"Specificity: {specificity:.2%}")
print(f"Execution Time: {execution_time:.4f} seconds")

  df[['FOLLOWERS', 'ER']] = df[['FOLLOWERS', 'ER']].applymap(convert_to_numeric)



Updated Rankings - Key Influencers:
                                              NAME  final_score  rank
12                           M S Dhoni\n@mahi7781    4878752.0   1.0
94      Sushant Singh Rajput\n@sushantsinghrajput    1840892.0   2.0
467                    Ritika Sajdeh\n@ritssajdeh    1762692.0   3.0
75   Sidhu Moosewala (ਮੂਸੇ ਆਲਾ)\n@sidhu_moosewala    1695432.0   4.0
49                       𝑨𝒋𝒆𝒚 𝑵𝒂𝒈𝒂𝒓\n@carryminati    1693472.0   5.0
..                                            ...          ...   ...
619         ⚡F I L M Y S T A A A N⚡\n@filmystaaan          NaN   NaN
628                          Sunita\n@sunita___82          NaN   NaN
742                     filmyakzone\n@filmyakzone          NaN   NaN
758                   𝐏𝐑𝐎𝐌𝐎𝐓𝐈𝐎𝐍\n@intense.records          NaN   NaN
874             fitness_track27\n@fitness_track27          NaN   NaN

[999 rows x 3 columns]

Precision: 46.46%
Accuracy: 88.69%
F1-Score: 44.88%
Recall: 43.40%
Specificity: 94.06%
Execution Time: 52.414

In [None]:
# COSINE SIMILARITY
import pandas as pd
import numpy as np
import time
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score

# Load the influencer dataset from a hard-coded path. The rest of this cell
# reads its NAME, FOLLOWERS and ER columns.
df = pd.read_csv("/content/sample_data/insta dataset new - Sheet1 (2).csv")

# Convert necessary columns to numeric
def convert_to_numeric(value):
    """Convert a formatted metric string such as ``"1.5M"`` or ``"3.2%"`` to a float.

    Non-string inputs are returned unchanged. For strings:

    * ``"12.5%"`` -> ``12.5`` (the percent sign is dropped; the value is not
      divided by 100);
    * ``"1.5M"`` -> ``1500000.0`` and ``"20K"`` -> ``20000.0``;
    * a plain number such as ``"1234"`` or ``"1,234"`` -> ``1234.0``;
    * ``"-"`` and any text that cannot be parsed -> ``np.nan``.

    The suffix branches used to call ``float()`` without a guard, so a
    non-numeric string containing ``%``, ``M`` or ``K`` raised ``ValueError``;
    values with thousands separators were silently turned into NaN.
    """
    if not isinstance(value, str):
        return value

    text = value.replace(',', '').strip()  # drop thousands separators
    if text == '-':
        return np.nan

    # Same precedence as before: '%' is checked first, then 'M', then 'K'.
    multiplier = 1.0
    for marker, scale in (('%', 1.0), ('M', 1e6), ('K', 1e3)):
        if marker in text:
            text = text.replace(marker, '')
            multiplier = scale
            break

    try:
        return float(text) * multiplier
    except ValueError:
        return np.nan

# Parse the formatted FOLLOWERS / ER strings into numbers. Series.map is applied
# column by column because DataFrame.applymap is deprecated in recent pandas and
# emits a FutureWarning on every run.
df[['FOLLOWERS', 'ER']] = df[['FOLLOWERS', 'ER']].apply(
    lambda column: column.map(convert_to_numeric)
)

# Step 1: Clustering based on engagement metrics
start_time = time.time()
# Missing values are replaced by 0 before the two columns are clustered.
X = df[['FOLLOWERS', 'ER']].fillna(0)
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X)
clustering_time = (time.time() - start_time) * 1000  # Convert to milliseconds

# Step 2: Cosine Similarity Calculation
start_time = time.time()
# TF-IDF vectors are built from the NAME strings only.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['NAME'])
# Pairwise similarity between all rows; each row's score is its mean
# similarity to every row, itself included.
cosine_sim = cosine_similarity(tfidf_matrix)
df['cosine_score'] = cosine_sim.mean(axis=1)
similarity_time = (time.time() - start_time) * 1000  # milliseconds

# Step 3: Ranking Influencers Based on Cosine Similarity
start_time = time.time()
# The cosine score is used directly as the final score; the highest score gets
# rank 1 and ties use pandas' default ranking method.
df['final_score'] = df['cosine_score']
df['predicted_rank'] = df['final_score'].rank(ascending=False)
ranking_time = (time.time() - start_time) * 1000  # milliseconds

# Step 4: Compare with Actual Dataset Order
start_time = time.time()
# The ground-truth rank is the row's index label plus one, i.e. the order in
# which the rows appear in the loaded frame.
df['actual_rank'] = df.index + 1

# Define top influencers as the top 10%
top_n = int(0.1 * len(df))

# 1 when the rank falls within the first top_n positions, otherwise 0.
df['actual_label'] = (df['actual_rank'] <= top_n).astype(int)
df['predicted_label'] = (df['predicted_rank'] <= top_n).astype(int)

# Compute Metrics
# All metrics are expressed as percentages.
precision = precision_score(df['actual_label'], df['predicted_label']) * 100
accuracy = accuracy_score(df['actual_label'], df['predicted_label']) * 100
f1 = f1_score(df['actual_label'], df['predicted_label']) * 100
recall = recall_score(df['actual_label'], df['predicted_label']) * 100

# Compute Specificity
# TN / (TN + FP), falling back to 0 when there are no actual negatives.
tn = ((df['actual_label'] == 0) & (df['predicted_label'] == 0)).sum()
fp = ((df['actual_label'] == 0) & (df['predicted_label'] == 1)).sum()
specificity = (tn / (tn + fp)) * 100 if (tn + fp) > 0 else 0
metrics_time = (time.time() - start_time) * 1000  # milliseconds

# Total Execution Time
# Sum of the timed steps, in milliseconds; untimed work such as loading the CSV
# is not included.
total_execution_time = clustering_time + similarity_time + ranking_time + metrics_time

# Display Results
print("\nUpdated Rankings - Key Influencers:\n", df[['NAME', 'final_score', 'predicted_rank']].sort_values(by='predicted_rank'))
print(f"\nPrecision: {precision:.2f}%")
print(f"Accuracy: {accuracy:.2f}%")
print(f"F1-Score: {f1:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"Specificity: {specificity:.2f}%\n")

# Display execution time breakdown
print(f"Total Execution Time: {total_execution_time / 1000:.4f} seconds")  # Convert ms to seconds
print(f"Step 1 (Clustering): {clustering_time:.2f} ms")
print(f"Step 2 (Cosine Similarity Calculation): {similarity_time:.2f} ms")
print(f"Step 3 (Ranking Calculation): {ranking_time:.2f} ms")
print(f"Step 4 (Metrics Calculation): {metrics_time:.2f} ms")



Updated Rankings - Key Influencers:
                                            NAME  final_score  predicted_rank
88      Kareena Kapoor Khan\n@kareenakapoorkhan     0.007850             1.5
253  Kareena Kapoor Khan\n@therealkareenakapoor     0.007850             1.5
15                 Ranveer Singh\n@ranveersingh     0.007550             3.0
302                  Ritika Singh\n@ritika_offl     0.007492             4.0
932             Ankita Singh\n@ankitasingh_2910     0.007438             5.0
..                                          ...          ...             ...
210        Mankirt Aulakh (ਔਲਖ)\n@mankirtaulakh     0.001001           850.0
662                 Swami Ramdev\n@swaamiramdev     0.001001           850.0
659                Srishty Rode\n@srishtyrode24     0.001001           850.0
200                Harsh Beniwal\n@harshbeniwal     0.001001           850.0
691                  MANJUL KHATTAR\n@manjullll     0.001001           850.0

[999 rows x 3 columns]

Precision: 17

  df[['FOLLOWERS', 'ER']] = df[['FOLLOWERS', 'ER']].applymap(convert_to_numeric)


In [None]:
# JACCARD SIMILARITY
import pandas as pd
import numpy as np
import time
from sklearn.metrics import jaccard_score, precision_score, accuracy_score, f1_score, recall_score

# Load the influencer dataset from a hard-coded path. The rest of this cell
# reads its NAME, FOLLOWERS and ER columns.
df = pd.read_csv("/content/sample_data/insta dataset new - Sheet1 (2).csv")

# Convert necessary columns to numeric (handling percentage values)
def convert_to_numeric(value):
    """Convert a formatted metric string such as ``"1.5M"`` or ``"3.2%"`` to a float.

    Non-string inputs are returned unchanged. For strings:

    * ``"12.5%"`` -> ``12.5`` (the percent sign is dropped; the value is not
      divided by 100);
    * ``"1.5M"`` -> ``1500000.0`` and ``"20K"`` -> ``20000.0``;
    * a plain number such as ``"1234"`` or ``"1,234"`` -> ``1234.0``;
    * ``"-"`` (hyphen used for missing data) and any text that cannot be
      parsed -> ``np.nan``.

    The suffix branches used to call ``float()`` without a guard, so a
    non-numeric string containing ``%``, ``M`` or ``K`` raised ``ValueError``;
    values with thousands separators were silently turned into NaN.
    """
    if not isinstance(value, str):
        return value

    text = value.replace(',', '').strip()  # drop thousands separators
    if text == '-':  # Handle hyphens as missing values
        return np.nan

    # Same precedence as before: '%' is checked first, then 'M', then 'K'.
    multiplier = 1.0
    for marker, scale in (('%', 1.0), ('M', 1e6), ('K', 1e3)):
        if marker in text:
            text = text.replace(marker, '')
            multiplier = scale
            break

    try:
        return float(text) * multiplier
    except ValueError:
        return np.nan  # Handle other invalid values

# Parse the formatted FOLLOWERS / ER strings into numbers. Series.map is applied
# column by column because DataFrame.applymap is deprecated in recent pandas and
# emits a FutureWarning on every run.
df[['FOLLOWERS', 'ER']] = df[['FOLLOWERS', 'ER']].apply(
    lambda column: column.map(convert_to_numeric)
)

# Step 1: Jaccard Similarity Calculation
start_time = time.time()

def jaccard_similarity(a, b):
    """Return the Jaccard index of the lower-cased, whitespace-split words of two strings.

    The result is 0 when neither string contains a word.
    """
    words_a, words_b = (set(text.lower().split()) for text in (a, b))
    either = words_a | words_b
    if not either:
        return 0
    return len(words_a & words_b) / len(either)

# Mean Jaccard similarity of each name against every name (itself included).
df['jaccard_score'] = df['NAME'].apply(lambda x: np.mean([jaccard_similarity(x, y) for y in df['NAME']]))
similarity_time = (time.time() - start_time) * 1000  # Convert to milliseconds

# Step 2: Assign Labels
start_time = time.time()
threshold_pred = df['jaccard_score'].quantile(0.90)  # Top 10% influencers by Jaccard similarity
df['predicted_label'] = (df['jaccard_score'] >= threshold_pred).astype(int)  # 1 for influencers, 0 otherwise
labeling_time = (time.time() - start_time) * 1000

# Step 3: Create Actual Labels (Ground Truth)
start_time = time.time()
threshold_actual = df['FOLLOWERS'].quantile(0.90)  # Top 10% influencers by Followers
df['actual_label'] = (df['FOLLOWERS'] >= threshold_actual).astype(int)  # 1 for influencers, 0 otherwise
ground_truth_time = (time.time() - start_time) * 1000

# Step 4: Compute Precision, Accuracy, F1-score, Recall, and Specificity
start_time = time.time()
predicted = df['predicted_label']
actual = df['actual_label']

# All metrics are expressed as percentages.
precision = precision_score(actual, predicted) * 100
accuracy = accuracy_score(actual, predicted) * 100
f1 = f1_score(actual, predicted) * 100
recall = recall_score(actual, predicted) * 100

# Compute Specificity
# TN / (TN + FP), falling back to 0 when there are no actual negatives.
tn = ((actual == 0) & (predicted == 0)).sum()
fp = ((actual == 0) & (predicted == 1)).sum()
specificity = (tn / (tn + fp)) * 100 if (tn + fp) > 0 else 0
metrics_time = (time.time() - start_time) * 1000

# Step 5: Ranking influencers
start_time = time.time()
# Highest Jaccard score gets rank 1; ties use pandas' default ranking method.
df['rank'] = df['jaccard_score'].rank(ascending=False)
ranking_time = (time.time() - start_time) * 1000

# Total Execution Time (Convert from ms to seconds)
# Only the timed steps are summed; untimed work such as loading the CSV is not
# included.
total_execution_time = (similarity_time + labeling_time + ground_truth_time + metrics_time + ranking_time) / 1000

# Print Metrics
print("\nUpdated Rankings - Key Influencers:\n", df[['NAME', 'jaccard_score', 'rank']].sort_values(by='rank'))
print(f"\nPrecision: {precision:.2f}%")
print(f"Accuracy: {accuracy:.2f}%")
print(f"F1-score: {f1:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"Specificity: {specificity:.2f}%\n")

# Print Execution Time Breakdown
print(f"Total Execution Time: {total_execution_time:.4f} seconds")
print(f"Step 1 (Jaccard Similarity Calculation): {similarity_time:.2f} ms")
print(f"Step 2 (Assigning Labels): {labeling_time:.2f} ms")
print(f"Step 3 (Creating Ground Truth Labels): {ground_truth_time:.2f} ms")
print(f"Step 4 (Metrics Calculation): {metrics_time:.2f} ms")
print(f"Step 5 (Ranking Calculation): {ranking_time:.2f} ms")


  df[['FOLLOWERS', 'ER']] = df[['FOLLOWERS', 'ER']].applymap(convert_to_numeric)



Updated Rankings - Key Influencers:
                                    NAME  jaccard_score   rank
15         Ranveer Singh\n@ranveersingh       0.008519    1.0
932     Ankita Singh\n@ankitasingh_2910       0.008453    2.0
489  Komal Singh\n@komalsingh__official       0.008319    3.0
837       Shweta Singh\n@shweta_singh6_       0.008319    4.0
637      Prabhjot Singh\n@jatt_prabhjot       0.008219    5.5
..                                  ...            ...    ...
236      Hiphop Tamizha\n@hiphoptamizha       0.001001  819.0
237             MONALISA\n@aslimonalisa       0.001001  819.0
634          Melvin Louis\n@melvinlouis       0.001001  819.0
630      Ruhaanika Dhawann\n@ruhaanikad       0.001001  819.0
694         Mamta Mohandas\n@mamtamohan       0.001001  819.0

[999 rows x 3 columns]

Precision: 18.00%
Accuracy: 83.58%
F1-score: 18.00%
Recall: 18.00%
Specificity: 90.88%

Total Execution Time: 2.2861 seconds
Step 1 (Jaccard Similarity Calculation): 2270.22 ms
Step 2 (Assignin