In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict

# Load dataset from CSV
df = pd.read_csv('/content/instagram_data_india.csv')

# Debug: Print raw dataframe info
print("Raw DataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print(df.head())

# Ensure correct column names: header cells may carry surrounding whitespace,
# which would break the exact-name column lookups used further down.
df.columns = [col.strip() for col in df.columns]
print("Column Names After Stripping:", df.columns)

# Function to clean numerical columns
def clean_numeric_column(col):
    """Convert a column of abbreviated numeric text into floats.

    Handles the formats seen in the dataset ("267.1M", "0.02%") plus a few
    common variants:

    * a trailing ``K``/``M``/``B`` (any case) scales by 1e3/1e6/1e9;
    * ``%`` signs are removed without rescaling, so "2.23%" becomes 2.23;
    * thousands separators (",") and surrounding whitespace are ignored.

    Args:
        col: pandas Series of strings (already-numeric values are accepted).

    Returns:
        A float Series with the same index as ``col``. Cells that cannot be
        parsed (missing values, free text) become NaN instead of raising.
    """
    suffix_scale = {"K": 1e3, "M": 1e6, "B": 1e9}

    # Work on plain text; no expression evaluation is performed on CSV
    # content, so a malformed cell can only ever turn into NaN.
    text = (
        col.astype(str)
        .str.strip()
        .str.replace("%", "", regex=False)
        .str.replace(",", "", regex=False)
        .str.strip()
    )

    # Look up the multiplier implied by the last character, if any.
    scale = text.str[-1:].str.upper().map(suffix_scale)
    has_suffix = scale.notna()

    # Drop the suffix character only where one was recognised.
    number_text = text.where(~has_suffix, text.str[:-1].str.strip())
    values = pd.to_numeric(number_text, errors="coerce")

    return values * scale.fillna(1.0)

# Turn the text-valued metric columns into numbers, one column at a time
for numeric_column in ("FOLLOWERS", "ER", "POTENTIAL REACH"):
    df[numeric_column] = clean_numeric_column(df[numeric_column])

# Debug: preview the frame before rows with missing values are removed
print("Data Before Dropping NaN:")
print(df.head())

# Keep only rows that have every field the later steps rely on
required_columns = ["FOLLOWERS", "ER", "POTENTIAL REACH", "TOPIC OF INFLUENCE"]
df = df.dropna(subset=required_columns)

# Debug: preview the frame after the filter
print("Data After Dropping NaN:")
print(df.head())

# Stop early when the filter removed everything; nothing below can run on an empty frame
if df.empty:
    raise ValueError("Dataset is empty after preprocessing! Check CSV formatting and missing values.")

# Normalize numerical features: per-column min-max scaling into [0, 1]
features = df[["FOLLOWERS", "ER", "POTENTIAL REACH"]].values
feature_min = np.min(features, axis=0)
feature_range = np.max(features, axis=0) - feature_min
# A constant column (always the case for a single-row dataset) has zero range;
# dividing by it would produce NaN features. Use a divisor of 1 there so the
# column is scaled to all zeros instead.
feature_range = np.where(feature_range == 0, 1.0, feature_range)
features = (features - feature_min) / feature_range  # Normalize

# Category Mapping
category_mapping = {
    "Sports": ["Cricket", "Athlete", "Gym", "Sports"],
    "Politics": ["Politics"],
    "Entertainment": ["Celebrity", "Actors", "Actor", "Singers", "Music", "Musician", "Film", "TV Host", "Dance", "Comedian", "Artist", "Producers"],
    "Beauty and Lifestyle": ["Fashion", "Accessories", "Beauty", "Self Care", "Fitness", "Health", "Lifestyle"],
    "Business and Finance": ["Business", "Finance", "Personal Finance"],
    "Technology": ["Geek", "Computer", "Company", "Auto and Vehicles"],
    "Food and Travel": ["Food", "Travel", "Romance and Wedding", "Home and Garden"],
}

# Manual category fixes for well-known influencers (keyed by display name,
# without the Instagram handle)
manual_category_fixes = {
    "M S Dhoni": "Sports",
    "Rohit Sharma": "Sports",
    "Surya Kumar Yadav (SKY)": "Sports",
    "Salman Khan": "Entertainment",
    "Shah Rukh Khan": "Entertainment",
    "Shahid Kapoor": "Entertainment",
    "Sonu Sood": "Entertainment",
    "Deepika Padukone": "Entertainment",
    "Katrina Kaif": "Entertainment",
    "Narendra Modi": "Politics"
}

# Improved Function to Map Topics to Categories
def map_to_category(row):
    """Return the category label for one influencer row.

    Resolution order:
      1. a manual override from ``manual_category_fixes`` (matched on the
         full NAME, or on NAME with the trailing "@handle" removed);
      2. "Sports" when the topic mentions cricket, athlete or sports;
      3. the first ``category_mapping`` entry with a keyword contained in
         the topic (case-insensitive substring match, dict order);
      4. "Entertainment" as the fallback.

    Args:
        row: mapping-like object (DataFrame row or dict) providing the
            "NAME" and "TOPIC OF INFLUENCE" keys.

    Returns:
        The category name as a string.
    """
    topic = row["TOPIC OF INFLUENCE"].strip().lower()

    # A missing NAME (NaN) must not crash the mapping; it simply cannot
    # match any manual override.
    raw_name = row["NAME"]
    name = raw_name.strip() if isinstance(raw_name, str) else ""

    # Dataset names carry the Instagram handle ("Narendra Modi @narendramodi"),
    # while the overrides are keyed by display name only. Compare both forms
    # so the overrides actually apply.
    display_name = name.split("@", 1)[0].strip()
    for candidate in (name, display_name):
        if candidate in manual_category_fixes:
            return manual_category_fixes[candidate]

    # Ensure sports figures are correctly classified, independent of the
    # ordering of category_mapping
    if "cricket" in topic or "athlete" in topic or "sports" in topic:
        return "Sports"

    # Check category mapping
    for category, keywords in category_mapping.items():
        if any(keyword.lower() in topic for keyword in keywords):
            return category

    # If no match, classify as "Entertainment" instead of "Other"
    return "Entertainment"

# Apply category mapping
# map_to_category is called once per row (axis=1); its return value becomes
# that row's CATEGORY label.
df["CATEGORY"] = df.apply(map_to_category, axis=1)

# Create a graph for influencer relationships
# One node is added per NAME value, carrying the category and the three
# metrics as node attributes.
# NOTE(review): rows sharing the same NAME would map onto the same node —
# confirm names are unique in the dataset.
G = nx.Graph()
for index, row in df.iterrows():
    G.add_node(row['NAME'], category=row['CATEGORY'], followers=row['FOLLOWERS'], er=row['ER'], reach=row['POTENTIAL REACH'])

# Find maximal cliques
# NOTE(review): no edges are added to G in the visible code, so unless edges
# are added elsewhere each maximal clique is presumably a single isolated
# node — confirm whether relationship edges were meant to be built first.
cliques = list(nx.find_cliques(G))

# Rank influencers per category based on clique participation
# Maps category -> list of (name, influence_score) pairs.
category_ranking = defaultdict(list)
# A node can belong to several maximal cliques. Score each node once so the
# same influencer cannot occupy more than one slot of a category's top 5.
ranked_nodes = set()
for clique in cliques:
    for node in clique:
        if node in ranked_nodes:
            continue
        ranked_nodes.add(node)
        node_attrs = G.nodes[node]
        category = node_attrs['category']
        # Score is the plain product of the three raw metrics.
        influence_score = node_attrs['followers'] * node_attrs['er'] * node_attrs['reach']
        category_ranking[category].append((node, influence_score))

# Sort and get top 5 per category (highest score first; only values are
# replaced, so iterating the dict while assigning is safe)
for category in category_ranking:
    category_ranking[category] = sorted(category_ranking[category], key=lambda x: x[1], reverse=True)[:5]

# Display results
print("Top 5 Influencers per Category:")
for category, influencers in category_ranking.items():
    print(f"\nCategory: {category}")
    for rank, (name, score) in enumerate(influencers, start=1):
        print(f"{rank}. {name} - Influence Score: {score:.2f}")

# Influence score per row: product of the three metric columns
df["INFLUENCE_SCORE"] = df["FOLLOWERS"].mul(df["ER"]).mul(df["POTENTIAL REACH"])

# Model input: add a length-1 sequence axis -> (samples, seq_len, features)
data = torch.tensor(features, dtype=torch.float32)[:, None, :]

# Regression targets as a column vector, one row per sample in `data`
# NOTE(review): the features above are min-max normalized, but these targets
# are built from the raw metric columns — confirm that is intended for the
# MSE training further down.
score_values = df["INFLUENCE_SCORE"].values
targets = torch.tensor(score_values, dtype=torch.float32).reshape(-1, 1)

# PPT-LSTM Model Definition
def parametric_tanh(x, alpha):
    """Apply tanh to ``x`` after scaling it by the slope factor ``alpha``."""
    scaled = alpha * x
    return torch.tanh(scaled)

class PPT_LSTM(nn.Module):
    """Single-layer LSTM followed by a learnable-slope tanh and a linear head.

    The forward pass runs the batch-first LSTM, passes its output through
    ``parametric_tanh`` with the trainable ``param_alpha`` (initialised to
    0.5), and feeds the last time step into the linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Layers are created in the same order as before so random
        # initialisation consumes the RNG identically.
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        # nn.Parameter enables gradients by default.
        self.param_alpha = nn.Parameter(torch.tensor(0.5))
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the linear head's output for the final time step of ``x``."""
        sequence_out, _state = self.lstm(x)
        activated = parametric_tanh(sequence_out, self.param_alpha)
        final_step = activated[:, -1, :]
        return self.fc(final_step)

# Initialize Model
# Input width follows the feature matrix; the hidden size and the single
# output unit are fixed here.
input_dim, hidden_dim, output_dim = features.shape[1], 10, 1

model = PPT_LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

# Training Function
def train_ppt_lstm(model, data, targets, epochs=50, lr=0.001, *, verbose=False):
    """Fit ``model`` to ``targets`` with full-batch Adam and MSE loss.

    Every epoch performs one forward/backward pass over the whole ``data``
    tensor and one optimizer step. The model is updated in place.

    Args:
        model: the ``nn.Module`` to train.
        data: input tensor passed to ``model`` unchanged.
        targets: tensor compared against ``model(data)`` by ``nn.MSELoss``.
        epochs: number of optimisation steps.
        lr: Adam learning rate.
        verbose: when True, print the loss after every epoch.

    Returns:
        None.
    """
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Make sure train-time behaviour is active even if the caller left the
    # model in eval mode.
    model.train()

    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        if verbose:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Train Model
# Fit on the full dataset using the function's default epoch count and learning rate
train_ppt_lstm(model=model, data=data, targets=targets)

Raw DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   #                   100 non-null    int64 
 1   NAME                100 non-null    object
 2   FOLLOWERS           100 non-null    object
 3   ER                  100 non-null    object
 4   COUNTRY             100 non-null    object
 5   TOPIC OF INFLUENCE  100 non-null    object
 6   POTENTIAL REACH     100 non-null    object
dtypes: int64(1), object(6)
memory usage: 5.6+ KB
None
   #                             NAME FOLLOWERS     ER COUNTRY  \
0  1         Virat Kohli @virat.kohli    267.1M  0.02%   India   
1  2      Narendra Modi @narendramodi     87.3M  2.23%   India   
2  3         Alia Bhatt 💛 @aliaabhatt     83.7M  0.02%   India   
3  4        Katrina Kaif @katrinakaif     79.7M  0.87%   India   
4  5  दीपिका पादुकोण @deepikapadukone     78.9M  2.18%   Ind