In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Ask NLTK for the stopword corpus (a cached copy is reported as up-to-date)
nltk.download('stopwords')

# Read the Instagram influencer table into `df`.
# Point `dataset_path` at your own copy of the CSV when running elsewhere.
dataset_path = '/content/sample_data/top_insta_influencers_data.csv'
df = pd.read_csv(dataset_path)

# Function to convert values like '3.3k' to 3300
def convert_to_numeric(value):
    """Convert an abbreviated count such as ``'3.3k'`` into a float.

    A string ending in ``k``, ``m`` or ``b`` (case-insensitive, surrounding
    whitespace ignored) is scaled by 1e3, 1e6 or 1e9 respectively. Any other
    string is parsed directly with ``float``.

    Args:
        value: Raw cell value. Non-string values are passed through unchanged.

    Returns:
        The parsed float, ``np.nan`` when a string cannot be parsed, or
        ``value`` itself when it is not a string.
    """
    if not isinstance(value, str):
        return value  # Already numeric (or missing): return as is.

    multipliers = {'k': 10 ** 3, 'm': 10 ** 6, 'b': 10 ** 9}
    text = value.strip().lower()

    # Only a *trailing* letter counts as a magnitude suffix. Checking for the
    # letter anywhere in the string would misread words such as 'unknown'.
    suffix = text[-1:]
    if suffix in multipliers:
        number, multiplier = text[:-1], multipliers[suffix]
    else:
        number, multiplier = text, 1

    # A single guard covers every branch, so malformed input always yields
    # NaN rather than an uncaught ValueError.
    try:
        return float(number) * multiplier
    except ValueError:
        return np.nan

# Columns whose raw values go through convert_to_numeric (e.g. '3.3k' -> 3300)
abbreviated_columns = [
    'posts',
    'followers',
    'avg_likes',
    'new_post_avg_like',
    'total_likes',
]
for col in abbreviated_columns:
    raw_values = df[col]
    df[col] = raw_values.apply(convert_to_numeric)


# Normalize numerical columns
def normalize_features(df, columns):
    """Rescale the selected columns of ``df`` with a ``MinMaxScaler``.

    The scaler is fitted on ``df[columns]`` and the scaled values are written
    back into those same columns, so the frame passed in is modified.

    Args:
        df: DataFrame holding the columns to rescale.
        columns: Column labels to rescale.

    Returns:
        The same ``df`` object, with ``columns`` replaced by scaled values.
    """
    scaled_values = MinMaxScaler().fit_transform(df[columns])
    df[columns] = scaled_values
    return df

# Min-max scale every numeric metric
numerical_columns = ['posts', 'followers', 'avg_likes', 'new_post_avg_like', 'total_likes']
df = normalize_features(df, numerical_columns)

# Text features: TF-IDF over the channel names (at most 5000 terms)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
channel_names = df['channel_info'].astype(str)
tfidf_matrix = tfidf_vectorizer.fit_transform(channel_names)

# Unweighted sum of the three scaled metrics, stored as its own column
df['combined_features'] = df['followers'].add(df['avg_likes']).add(df['total_likes'])

# Feature matrix: TF-IDF columns followed by the three scaled metrics
metric_values = df[['followers', 'avg_likes', 'total_likes']].values
combined_matrix = np.concatenate((tfidf_matrix.toarray(), metric_values), axis=1)

# Pairwise cosine similarity between all rows of the feature matrix
cosine_sim_matrix = cosine_similarity(combined_matrix)

# Influence score = row-wise total of the similarity matrix
row_totals = cosine_sim_matrix.sum(axis=1)
df['influence_score'] = row_totals

# Highest score gets rank 1
df['rank'] = df['influence_score'].rank(ascending=False)

# Show the best-ranked influencers
report_columns = ['channel_info', 'influence_score', 'rank', 'country']
top_influencers = df.sort_values(by='rank')[report_columns]
print(top_influencers.head())


    channel_info  influence_score  rank        country
0      cristiano        22.395805   1.0          Spain
1    kyliejenner        21.659762   2.0  United States
140          j.m        21.161378   3.0            NaN
3    selenagomez        18.335557   4.0  United States
2       leomessi        18.284884   5.0            NaN


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
