<a href="https://colab.research.google.com/github/jpcompartir/BertopicR/blob/main/campaign_prediction_neural_net.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

We're building a model to predict campaign metrics on social media data. The current approach uses a truncated mean; its RMSE is ~15k. So far, with traditional ML approaches, we haven't been able to re-create such a model. This could be for many reasons - bad features, data leakage in the current approach (?), highly skewed data, and some erroneous, confounding data points in our training, test and validation splits.

So, we're going to turn to a Neural Network (NN) approach. NNs have a reputation for being data hungry and expensive to train. The great strength of NNs is their ability to extract features - features which otherwise take a long time to create and evaluate. Plus, hardware accelerators have made it much more feasible to train NNs.



In [37]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import os
from tqdm import tqdm
import logging

In [2]:
# Send INFO-and-above records to a log file; filemode='w' starts the file afresh on every run.
logging.basicConfig(
    filename='engagement_training_log.log',
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [3]:
# Use a CUDA GPU when one is present (e.g. on Colab); Apple's "mps" backend is not considered, so everything else runs on CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
def median_absolute_error(y_true, y_pred):
    """Return the median of the element-wise absolute errors |y_pred - y_true|.

    The median is less sensitive to a few very large errors than the mean,
    which makes it a robust companion metric to MSE.
    """
    absolute_errors = (y_pred - y_true).abs()
    return absolute_errors.median()

In [5]:
def mean_percentage_deviation(y_true, y_pred):
    """Return the mean absolute deviation of y_pred from y_true, as a percentage.

    A small constant is added to the denominator so that targets equal to
    zero do not cause a division by zero.
    """
    epsilon = 1e-8
    relative_error = (y_pred - y_true) / (y_true + epsilon)
    return relative_error.abs().mean() * 100

In [70]:
# Target vector: the engagement counts as a NumPy array.
# NOTE: `df` is only created by the CSV-loading cell further down the notebook,
# so that cell has to be run before this one.
y = df["engagements"].values
y

array([ 299, 1983,   65, ..., 1891, 1268,  380])

In [84]:
# Feature frame: every column except the target.
X = df.drop(columns=["engagements"])


In [85]:
# Columns grouped by the preprocessing they receive.
numeric_features = ['audience']
categorical_features = ['channel', 'type', 'country']

# Numeric columns are standardised; categorical columns are one-hot encoded
# (categories not seen while fitting are ignored rather than raising).
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fitted on the whole feature frame, i.e. before any train/test split.
X_prepared = preprocessor.fit_transform(X)

In [86]:
# Hold out 40% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train,  y_test = train_test_split(X_prepared, y, test_size = 0.4, random_state = 42)

Unnamed: 0,0
0,"(0, 0)\t-0.7026896973252617\n (0, 2)\t1.0\n..."
1,"(0, 0)\t-0.6727465991452916\n (0, 2)\t1.0\n..."
2,"(0, 0)\t1.5630047316258058\n (0, 2)\t1.0\n ..."
3,"(0, 0)\t3.296852702427881\n (0, 2)\t1.0\n ..."
4,"(0, 0)\t-0.6344907265658346\n (0, 1)\t1.0\n..."
...,...
90818,"(0, 0)\t-0.6898569409624173\n (0, 2)\t1.0\n..."
90819,"(0, 0)\t-0.7169483155061998\n (0, 2)\t1.0\n..."
90820,"(0, 0)\t1.7237421343795212\n (0, 1)\t1.0\n ..."
90821,"(0, 0)\t1.112432397108161\n (0, 2)\t1.0\n ..."


In [87]:
# Densify the sparse training matrix so it can later be turned into a tensor.
X_train_np = X_train.toarray()

In [88]:
# Inspect the dense training matrix (NumPy elides the middle of large arrays).
X_train_np

array([[-0.7026897 ,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.6727466 ,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.56300473,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.72374213,  1.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.1124324 ,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.56295524,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ]])

In [101]:
# Re-display the target array as a sanity check.
y

array([ 299, 1983,   65, ..., 1891, 1268,  380])

In [77]:
# (rows, features) of the dense training matrix.
X_train_np.shape

(90823, 69)

In [78]:
 # Convert the dense training matrix into a float32 tensor.
 X_train_tensor = torch.tensor(X_train_np, dtype=torch.float32)

In [102]:
def prepare_data(df, test_size=0.2, val_size=0.25, stratify_col='audience'):
    """
    Split the data into train/validation/test sets and preprocess them.

    The frame is split first and the preprocessing (standard scaling of the
    numeric column, one-hot encoding of the categorical columns) is fitted on
    the training split only, then applied to the validation and test splits.
    This keeps validation/test statistics out of the fitted transformers.

    With the defaults (test_size=0.2, val_size=0.25) the resulting proportions
    are 60% train, 20% validation and 20% test.

    Parameters:
        df (DataFrame): Must contain the target column 'engagements' plus the
            feature columns 'audience', 'channel', 'type' and 'country'.
        test_size (float): Proportion of the whole dataset held out as the test split.
        val_size (float): Proportion of the remaining (non-test) rows held out
            as the validation split.
        stratify_col (str): Accepted for interface compatibility but currently
            NOT used - the splits are plain random splits.
            TODO: bin this column and pass it as `stratify=` once a binning
            strategy has been agreed.

    Returns:
        Tuple (train_dataset, val_dataset, test_dataset) of TensorDatasets.
        Each item is (features, target): features is a float32 vector (scaled
        numeric column followed by the one-hot columns), target is a float32 scalar.
    """
    target_col = 'engagements'
    numeric_features = ['audience']
    categorical_features = ['channel', 'type', 'country']

    # Separate target variable
    y = df[target_col].values
    X = df.drop(columns=[target_col])

    # Split BEFORE fitting any transformer so nothing leaks from val/test into train.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_size, random_state=42)

    # Create transformers for numerical and categorical data
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        # Categories that only occur in val/test are ignored instead of raising.
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine transformers into a single ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # fit_transform on train only; transform (never fit) on val/test.
    X_train_prepared = preprocessor.fit_transform(X_train)
    X_val_prepared = preprocessor.transform(X_val)
    X_test_prepared = preprocessor.transform(X_test)

    def _features_to_tensor(matrix):
        """Densify a (possibly sparse) feature matrix and wrap it in a float32 tensor."""
        # ColumnTransformer returns a sparse matrix or a dense array depending on
        # the overall density of its output, so both cases are handled here.
        dense = matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)
        return torch.tensor(dense, dtype=torch.float32)

    def _target_to_tensor(values):
        """Wrap a target array in a float32 tensor."""
        return torch.tensor(values, dtype=torch.float32)

    # Create TensorDatasets
    train_dataset = TensorDataset(_features_to_tensor(X_train_prepared), _target_to_tensor(y_train))
    val_dataset = TensorDataset(_features_to_tensor(X_val_prepared), _target_to_tensor(y_val))
    test_dataset = TensorDataset(_features_to_tensor(X_test_prepared), _target_to_tensor(y_test))

    return train_dataset, val_dataset, test_dataset

In [7]:
class EngagementNN(nn.Module):
    """
    Feed-forward regressor over numeric features plus three embedded categoricals.

    Channel, post type and country are each mapped to an `emb_dim`-sized
    embedding, concatenated with the numeric features and passed through four
    dense blocks (Linear -> BatchNorm1d -> ReLU -> Dropout) and a final linear
    layer that produces one value per sample.
    """

    def __init__(self, num_channels, num_post_types, num_countries, emb_dim=10, num_numeric_features=1):
        """
        Parameters:
            num_channels (int): Number of distinct channel indices.
            num_post_types (int): Number of distinct post type indices.
            num_countries (int): Number of distinct country indices.
            emb_dim (int): Size of each categorical embedding. Default: 10
            num_numeric_features (int): Width of the numeric input. Default: 1
                (audience size only).
        """
        super().__init__()
        self.num_numeric_features = num_numeric_features

        # Embeddings for categorical variables
        self.channel_embedding = nn.Embedding(num_channels, emb_dim)
        self.post_type_embedding = nn.Embedding(num_post_types, emb_dim)
        self.country_embedding = nn.Embedding(num_countries, emb_dim)

        # Width of the concatenated input: numeric features plus three embeddings
        total_feature_dim = num_numeric_features + 3 * emb_dim

        # Dense layers with create_layer func
        self.layer1 = self.create_layer(total_feature_dim, 128)
        self.layer2 = self.create_layer(128, 64)
        self.layer3 = self.create_layer(64, 32)
        self.layer4 = self.create_layer(32, 16)

        # Output layer drops the batch norm + activation
        self.output = nn.Linear(16, 1)

    def create_layer(self, in_features, out_features, dropout_rate=0.1):
        """Helper method to create a dense block: Linear, batch normalisation, ReLU, then dropout."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

    def forward(self, x_numeric, x_channel, x_post_type, x_country):
        """
        Parameters:
            x_numeric (Tensor): Numeric features, shape [batch_size, num_numeric_features].
                A 1-D tensor of shape [batch_size] is also accepted and treated
                as a single numeric feature.
            x_channel (Tensor): Tensor of channel indices, expected shape [batch_size].
            x_post_type (Tensor): Tensor of post type indices, expected shape [batch_size].
            x_country (Tensor): Tensor of country indices, expected shape [batch_size].

        Returns:
            Tensor of shape [batch_size, 1] with the predicted value per sample.
        """
        # torch.cat along dim=1 needs a 2-D numeric input; lift [batch] to [batch, 1].
        if x_numeric.dim() == 1:
            x_numeric = x_numeric.unsqueeze(1)

        # Embedding layers
        channel_embedded = self.channel_embedding(x_channel)
        post_type_embedded = self.post_type_embedding(x_post_type)
        country_embedded = self.country_embedding(x_country)

        # Concatenate all features
        x = torch.cat((x_numeric, channel_embedded, post_type_embedded, country_embedded), dim=1)

        # Pass through dense layers
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return self.output(x)

In [8]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    Call the instance once per epoch with the epoch's validation loss and the
    model. Whenever the loss improves on the best value seen so far by at
    least `delta`, the model's state dict is saved to `path`; otherwise a
    counter is incremented and `early_stop` becomes True once it reaches
    `patience`.
    """
    def __init__(self, patience=5, verbose=True, delta=0, path='checkpoint_model.pth'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 5
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: True
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint_model.pth'
        """
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.path = path
        self.best_score = None
        self.early_stop = False
        # np.inf rather than the np.Inf alias, which was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.counter = 0

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss, checkpointing `model` on improvement."""
        # Negate so that a larger score always means a better (lower) loss.
        score = -val_loss

        if self.best_score is None:
            # First epoch: always counts as the best so far.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Not better than the best by at least delta: count towards patience.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Saves model when validation loss decrease."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} to {val_loss:.6f}).  Saving model...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [9]:
def train_model(model, train_loader, val_loader, criterion, optimiser, device, epochs=20, path="engagement_model_best_path.pth"):
    """
    Train `model` with early stopping, then reload the best checkpoint into it.

    Each batch from the loaders is expected to be a sequence of tensors whose
    last element is the target; every preceding tensor is passed to the model
    as a positional input. A two-element batch (features, target) therefore
    calls `model(features)`, while a multi-input batch such as
    (numeric, channel, post_type, country, target) calls the model with four inputs.

    Parameters:
        model (nn.Module): Model to train; moved to `device` in place.
        train_loader (DataLoader): Batches used for optimisation.
        val_loader (DataLoader): Batches used for the per-epoch validation loss.
        criterion (callable): Loss function called as criterion(output, target).
        optimiser (Optimizer): Optimiser over the model's parameters.
        device (torch.device): Device the model and batches are moved to.
        epochs (int): Maximum number of epochs. Default: 20
        path (str): Where the best state dict is checkpointed and reloaded from.

    Returns:
        None. On return `model` holds the weights of the best checkpoint.
    """
    model.to(device)
    early_stopping = EarlyStopping(patience=5, verbose=True, delta=0.01, path=path)

    def _batch_loss(batch):
        """Move one batch to `device`, run the model and return the loss tensor."""
        *features, target = (tensor.to(device) for tensor in batch)
        output = model(*features)
        # The model emits [batch, 1] while targets are typically [batch]. Without
        # aligning the shapes the loss would broadcast to [batch, batch] and
        # compare every prediction with every target.
        if target.shape != output.shape and target.numel() == output.numel():
            target = target.reshape(output.shape)
        return criterion(output, target)

    for epoch in range(epochs):
        model.train()
        train_losses = []
        loop = tqdm(enumerate(train_loader), total=len(train_loader), leave=False, desc=f"Epoch {epoch+1}/{epochs}")
        for batch_idx, batch in loop:
            optimiser.zero_grad()
            loss = _batch_loss(batch)
            loss.backward()
            optimiser.step()
            train_losses.append(loss.item())

            # Refresh the progress-bar loss every 100 batches with a running mean.
            if (batch_idx + 1) % 100 == 0:
                current_loss = np.mean(train_losses[-100:])
                loop.set_postfix(loss=current_loss)

        avg_train_loss = np.mean(train_losses)
        logging.info(f'Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss}')

        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for batch in val_loader:
                val_losses.append(_batch_loss(batch).item())

        avg_val_loss = np.mean(val_losses)
        logging.info(f'Epoch {epoch+1}/{epochs}, Validation Loss: {avg_val_loss}')
        print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

        early_stopping(avg_val_loss, model)
        if early_stopping.early_stop:
            logging.info("Early stopping triggered")
            print("Early stopping triggered")
            break

    # Load the best model
    model.load_state_dict(torch.load(path))
    logging.info("Training complete. Best model loaded.")

In [40]:
# Colab-only: mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
# Load the campaign data from the mounted Drive and display it.
df = pd.read_csv("/content/drive/My Drive/colab_data/nn_outliers_removed_10k_sample.csv")
df

Unnamed: 0,audience,channel,type,country,engagements
0,11420,instagram,post,unknown,299
1,14203,instagram,post,ES,1983
2,33181,instagram,story,unknown,65
3,53196,instagram,post,unknown,515
4,418,instagram,story,unknown,9
...,...,...,...,...,...
151367,132220,instagram,post,MX,341
151368,177200,tiktok,video,CO,2264
151369,99855,instagram,reel,PT,1891
151370,13911,instagram,post,ES,1268


In [103]:
# Build the train/validation/test TensorDatasets from the loaded frame.
train_dataset, val_dataset, test_dataset = prepare_data(df, test_size=0.2, val_size=0.25)

In [104]:
# Confirm that a TensorDataset object was returned.
train_dataset

<torch.utils.data.dataset.TensorDataset at 0x7fa6075c5930>

In [105]:
# Mini-batch loaders; only the training split has shuffle=True.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size) for split in (val_dataset, test_dataset)
)



In [106]:
# Initialize model, loss, and optimiser.
# Embedding-table sizes are taken from the data instead of being hard-coded:
# the earlier one-hot encoding produced 69 feature columns (1 numeric + 68
# categories), which is more than the 5 + 10 + 20 = 35 categories previously
# assumed, so at least one table was too small for its column.
model = EngagementNN(
    num_channels=df["channel"].nunique(),
    num_post_types=df["type"].nunique(),
    num_countries=df["country"].nunique(),
)
model.to(device)

optimiser = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
# Assuming criterion is initialized as MSE for loss during training and validation
# If not, you can directly use torch.nn.MSELoss for computation
mse_criterion = torch.nn.MSELoss()

In [108]:
# NOTE(review): this call currently fails with the TypeError shown below. The
# datasets built by prepare_data yield a single feature tensor per sample,
# whereas EngagementNN.forward requires four separate inputs
# (x_numeric, x_channel, x_post_type, x_country).
train_model(model, train_loader, val_loader, criterion, optimiser, device, epochs=20, path="engagement_model_best_path.pth")



TypeError: EngagementNN.forward() missing 3 required positional arguments: 'x_channel', 'x_post_type', and 'x_country'