In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Make sure the NLTK stopword corpus is available locally
nltk.download("stopwords")

# Read the Instagram influencer dataset into a DataFrame
# (replace the path below with your actual dataset path)
df = pd.read_csv(
    filepath_or_buffer="/content/sample_data/top_insta_influencers_data.csv",
)

# Function to convert abbreviated numbers to floats
def convert_abbreviated_numbers(value):
    """Convert an abbreviated count such as ``'3.3k'`` or ``'1.2M'`` to a float.

    A trailing ``k``, ``m`` or ``b`` (case-insensitive) multiplies the leading
    number by 1e3, 1e6 or 1e9 respectively. Strings without a suffix that
    parse as a number (``'1200'``, ``'1,200'``) are converted to float as well.

    Args:
        value: The cell value to convert. Non-string values are passed through.

    Returns:
        The numeric value as a float when ``value`` is a parseable string;
        otherwise ``value`` itself, unchanged (e.g. ``'1.39%'``, names, NaN).
    """
    if not isinstance(value, str):
        return value

    multipliers = {'k': 1_000, 'm': 1_000_000, 'b': 1_000_000_000}

    # Normalise case/whitespace and drop thousands separators before parsing.
    text = value.strip().lower().replace(',', '')
    if not text:
        return value

    # Only a *trailing* letter counts as a suffix; a 'k' elsewhere in the
    # string (as in a name) must not be treated as a multiplier.
    factor = multipliers.get(text[-1])
    number_part = text[:-1] if factor else text

    try:
        return float(number_part) * (factor or 1)
    except ValueError:
        # Not a number after all: leave the original value untouched.
        return value

# Convert abbreviated counts (e.g. '3.3k') to floats in every count column.
# Each column is listed exactly once so no column is processed twice.
for column in ['posts', 'followers', 'avg_likes', 'new_post_avg_like', 'total_likes']:
    df[column] = df[column].apply(convert_abbreviated_numbers)

# Min-max scaling of selected DataFrame columns
def normalize_features(df, columns):
    """Rescale ``columns`` of ``df`` with a freshly fitted ``MinMaxScaler``.

    The scaled values are written back into ``df`` itself (the frame is
    modified in place), and the same frame is returned for convenience.

    Args:
        df: DataFrame holding the columns to scale.
        columns: List of column labels to fit and transform together.

    Returns:
        ``df``, with ``columns`` replaced by their scaled values.
    """
    scaled_values = MinMaxScaler().fit_transform(df[columns])
    df[columns] = scaled_values
    return df

# Numeric columns that get min-max scaled before modelling
numerical_columns = [
    'posts',
    'followers',
    'avg_likes',
    'new_post_avg_like',
    'total_likes',
]
df = normalize_features(df, columns=numerical_columns)

# ... (rest of the code remains the same) ...

# Define input features and target variable.
# 'total_likes' is the regression target, so it is deliberately NOT part of the
# inputs: feeding the target in as a feature lets the model copy it straight
# through and makes the reported loss/MAE meaningless.
X_data = df[['posts', 'followers', 'avg_likes', 'new_post_avg_like']].values
y_data = df['total_likes'].values  # Using total likes as a proxy for influence ranking

# Split data into training and test sets. The positional row indices are split
# alongside X/y so the test predictions can be matched back to rows of df later.
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
    X_data, y_data, np.arange(len(df)), test_size=0.2, random_state=42
)

# Define the TH-DCNN model architecture
def create_th_dcnn_model(input_shape):
    """Build the (uncompiled) 1-D convolutional regression model.

    Args:
        input_shape: Sequence whose first element is the number of steps
            (features per sample). Only ``input_shape[0]`` is read; the
            channel dimension is always 1.

    Returns:
        A ``Sequential`` model ending in a single linear unit, suitable for
        regressing one scalar score per sample.
    """
    model = models.Sequential()

    # Declare the input with an explicit Input layer instead of passing
    # ``input_shape`` to the first Conv1D, which Keras warns against for
    # Sequential models.
    model.add(layers.Input(shape=(input_shape[0], 1)))

    # DCNN layers
    # NOTE: with the default 'valid' padding, kernel_size=3 followed by
    # pool_size=2 needs at least 4 input steps to produce a non-empty output.
    model.add(layers.Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(layers.MaxPooling1D(pool_size=2))
    # Small kernel with 'same' padding so the (already short) sequence
    # length is preserved.
    model.add(layers.Conv1D(filters=128, kernel_size=2, activation='relu', padding='same'))
    # pool_size=1 keeps the length unchanged and avoids a negative dimension.
    model.add(layers.MaxPooling1D(pool_size=1))
    model.add(layers.Flatten())
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(1, activation='linear'))  # For regression (influence score)

    return model

# Add a trailing channel axis for Conv1D: (samples, steps) -> (samples, steps, 1)
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]

# Build the model for the per-sample shape (steps, 1)
input_shape = X_train.shape[1:]
model = create_th_dcnn_model(input_shape)

# Configure training: Adam optimizer, MSE loss, MAE reported as a metric
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mae'],
)

# Fit for 10 epochs in mini-batches of 32; the test split is passed as
# validation data so its loss/MAE are reported after every epoch
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)

# Measure final loss and MAE on the test split
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

# Predict an influence score for every row of the test split
y_pred = model.predict(X_test)

# Combine the results with the original dataset for ranking.
# .copy() gives an independent DataFrame, so adding columns below does not
# trigger pandas' SettingWithCopyWarning or depend on view-vs-copy behaviour.
df_test = df.iloc[test_indices].copy()
# Flatten the predictions to one value per row before storing them as a column
# (presumably shape (n, 1) from the single-output model).
df_test['predicted_influence'] = y_pred.ravel()

# Rank influencers by the predicted influence score (rank 1 = highest score)
df_test['rank'] = df_test['predicted_influence'].rank(ascending=False)

# Display top-ranked influencers based on the predicted influence score.
# Sort by rank first: head() on the unsorted frame would only show the first
# rows of the test split, not the best-ranked ones.
print(
    df_test.sort_values('rank')[
        ['channel_info', 'predicted_influence', 'rank', 'country']
    ].head()
)


Epoch 1/10


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 80ms/step - loss: 0.0156 - mae: 0.0657 - val_loss: 0.0031 - val_mae: 0.0489
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 0.0081 - mae: 0.0581 - val_loss: 0.0021 - val_mae: 0.0363
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 0.0066 - mae: 0.0404 - val_loss: 0.0021 - val_mae: 0.0315
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 0.0056 - mae: 0.0395 - val_loss: 0.0016 - val_mae: 0.0291
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.0052 - mae: 0.0375 - val_loss: 0.0014 - val_mae: 0.0266
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 0.0075 - mae: 0.0383 - val_loss: 0.0016 - val_mae: 0.0284
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 0.0034 - mae: 0.0310 - val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predicted_influence'] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['rank'] = df_test['predicted_influence'].rank(ascending=False)
