# Crowdfunding Campaign Success Prediction: End-to-End Pipeline
This unified Jupyter notebook integrates the full pipeline for predicting the success of crowdfunding campaigns using:
- Preprocessing and tokenization
- BERT-based embedding extraction
- CBAM-powered autoencoder for embedding compression
- Meta-heuristic feature selection (Genetic Algorithm)
- Final classification using LSTM and GBM


> ⚠️ **Missing sections:** `Kickstarter_Preprocessing_Notebook.ipynb` and `BERT_Embedding_Extraction_Notebook.ipynb` could not be loaded when this notebook was assembled (`[Errno 2] No such file or directory` under `/mnt/data/`). The preprocessing/tokenization and BERT embedding-extraction steps are therefore not included below; the next section starts from an existing `bert_embeddings.pkl` file.

# CBAM-Powered Autoencoder for BERT Embedding Compression
This notebook compresses high-dimensional BERT embeddings using a convolutional autoencoder with CBAM attention modules.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import pickle
import numpy as np
from tqdm import tqdm


In [None]:
# Load the pickled BERT embeddings and their labels.
# NOTE: pickle.load can execute arbitrary code stored in the file - only open trusted files.
with open("bert_embeddings.pkl", "rb") as f:
    data = pickle.load(f)

X_train = np.array(data['train_embeddings'])
y_train = np.array(data['y_train'])

# Rearrange into the channels-first layout (N, 4, 64, 768) consumed by the Conv2d encoder.
# NOTE(review): the reshape assumes each sample holds 64*768*4 values stored in
# (64, 768, 4) order - confirm against the embedding-extraction step.
X_train = np.transpose(X_train.reshape((-1, 64, 768, 4)), (0, 3, 1, 2))
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)

In [None]:
# Define CBAM modules
class ChannelAttention(nn.Module):
    """Channel-attention gate of a CBAM block.

    The spatial dimensions are squeezed with both average- and max-pooling,
    each pooled descriptor is passed through the same two-layer 1x1-conv
    bottleneck, the two results are summed and a sigmoid is applied.

    Args:
        in_planes: Number of channels of the input feature map.
        ratio: Reduction factor of the bottleneck. The hidden width is
            ``in_planes // ratio`` but never less than 1, so inputs with fewer
            channels than ``ratio`` still get a usable bottleneck.

    Shape:
        Input ``(N, in_planes, H, W)`` -> output ``(N, in_planes, 1, 1)``.
    """
    def __init__(self, in_planes, ratio=8):
        super().__init__()
        # Clamp to 1: a plain floor division yields 0 hidden channels when
        # in_planes < ratio, which leaves the gate without any capacity.
        hidden_planes = max(1, in_planes // ratio)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        self.fc = nn.Sequential(
            nn.Conv2d(in_planes, hidden_planes, 1, bias=False),
            nn.ReLU(),
            nn.Conv2d(hidden_planes, in_planes, 1, bias=False)
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-channel gates for ``x``; the caller multiplies them in."""
        avg_out = self.fc(self.avg_pool(x))
        max_out = self.fc(self.max_pool(x))
        return self.sigmoid(avg_out + max_out)

class SpatialAttention(nn.Module):
    """Spatial-attention gate of a CBAM block.

    The channel axis is collapsed into two maps (mean and max over channels),
    which are stacked and passed through a single 7x7 convolution followed by
    a sigmoid.

    Shape:
        Input ``(N, C, H, W)`` -> output ``(N, 1, H, W)``.
    """
    def __init__(self):
        super().__init__()
        # padding=3 with a 7x7 kernel keeps the spatial size unchanged.
        self.conv = nn.Conv2d(2, 1, kernel_size=7, padding=3, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one gate value per spatial location of ``x``."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        pooled = torch.cat((channel_mean, channel_max), dim=1)
        return self.sigmoid(self.conv(pooled))

class CBAM(nn.Module):
    """Applies channel attention, then spatial attention, to a feature map.

    Args:
        in_planes: Number of channels of the input feature map; forwarded to
            ``ChannelAttention``.

    The output has the same shape as the input.
    """
    def __init__(self, in_planes):
        super().__init__()
        self.ca = ChannelAttention(in_planes)
        self.sa = SpatialAttention()

    def forward(self, x):
        """Scale ``x`` by its channel gates, then by the resulting spatial gates."""
        channel_refined = x * self.ca(x)
        # The spatial gate is computed from the channel-refined map, not the raw input.
        return channel_refined * self.sa(channel_refined)

In [None]:
# Define CBAM Autoencoder
class CBAMAutoencoder(nn.Module):
    """Convolutional autoencoder with a CBAM block after every encoder stage.

    Encoder: four stride-2 stages (Conv2d -> BatchNorm2d -> ReLU -> CBAM) that
    widen the channels 4 -> 8 -> 16 -> 32 -> 64.
    Decoder: four stride-2 transposed convolutions that mirror the channel
    widths back to 4; the first three are followed by ReLU, the last by Sigmoid.

    The decoder doubles the spatial size four times, so the reconstruction has
    the same size as the input only when H and W are divisible by 16.

    NOTE(review): the final Sigmoid limits reconstructions to (0, 1). If the
    input embeddings are not scaled to that range the MSE target cannot be
    reached - confirm the scaling applied upstream.
    """
    def __init__(self):
        super().__init__()
        widths = (4, 8, 16, 32, 64)

        # Layers are created in the same order as they are indexed in the
        # Sequential so state_dict keys (encoder.0, encoder.1, ...) stay stable.
        encoder_layers = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            encoder_layers.append(nn.Conv2d(c_in, c_out, 3, stride=2, padding=1))
            encoder_layers.append(nn.BatchNorm2d(c_out))
            encoder_layers.append(nn.ReLU())
            encoder_layers.append(CBAM(c_out))
        self.encoder = nn.Sequential(*encoder_layers)

        reversed_widths = widths[::-1]
        last_stage = len(reversed_widths) - 2
        decoder_layers = []
        for stage, (c_in, c_out) in enumerate(zip(reversed_widths[:-1], reversed_widths[1:])):
            decoder_layers.append(
                nn.ConvTranspose2d(c_in, c_out, 3, stride=2, padding=1, output_padding=1)
            )
            decoder_layers.append(nn.Sigmoid() if stage == last_stage else nn.ReLU())
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, latent)`` for the input batch ``x``."""
        latent = self.encoder(x)
        return self.decoder(latent), latent

In [None]:
# Train the autoencoder to reconstruct its own input (MSE reconstruction loss)
model = CBAMAutoencoder()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
# Input and target are the same tensor: the network learns an identity mapping
# through the 64-channel bottleneck.
loader = DataLoader(TensorDataset(X_train_tensor, X_train_tensor), batch_size=8, shuffle=True)

model.train()
for epoch in range(10):
    batch_losses = []
    for inputs, targets in loader:
        optimizer.zero_grad()
        reconstruction, _ = model(inputs)
        batch_loss = criterion(reconstruction, targets)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    # Mean of the per-batch losses for this epoch.
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(loader):.4f}")

In [None]:
# Extract latent features in mini-batches so peak memory stays bounded by the
# batch size instead of growing with the number of samples.
model.eval()  # BatchNorm uses its running statistics, so batching does not change the result
latent_chunks = []
with torch.no_grad():
    for (x_batch,) in DataLoader(TensorDataset(X_train_tensor), batch_size=32, shuffle=False):
        _, latent_batch = model(x_batch)
        latent_chunks.append(latent_batch)
latent_features = torch.cat(latent_chunks, dim=0)

# Flatten each sample's (C, H, W) latent map into a single feature vector.
compressed_features = latent_features.reshape(latent_features.size(0), -1).numpy()

# Save compressed features together with the labels for the feature-selection stage
with open("cbam_compressed_features.pkl", "wb") as f:
    pickle.dump({"compressed_features": compressed_features, "y_train": y_train}, f)

# Meta-Heuristic Feature Selection for Compressed BERT Embeddings
This section applies a Genetic Algorithm (GA) to select an optimal feature subset from the CBAM-compressed BERT embeddings. Jaya and the Rabbit Optimization Algorithm (ROA) are possible extensions but are not implemented here.

In [None]:
import numpy as np
import pickle
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
import random


In [None]:
# Read back the compressed feature matrix and labels written by the autoencoder stage
with open("cbam_compressed_features.pkl", "rb") as f:
    data = pickle.load(f)

X, y = data['compressed_features'], data['y_train']

In [None]:
# Fitness evaluation: train a GBM on the selected columns and return its F1 score
# on a single fixed hold-out split (not an average over folds).
def evaluate_fitness(X, y, feature_mask):
    """Score a feature subset by the hold-out F1 of a gradient-boosting model.

    Args:
        X: 2-D feature matrix, one row per sample.
        y: Binary labels aligned with the rows of ``X``.
        feature_mask: Sequence of 0/1 flags, one per column of ``X``; columns
            whose flag equals 1 are used. Lists and arrays are both accepted.

    Returns:
        The F1 score on a 30% hold-out split, or ``0.0`` when no column is
        selected.
    """
    # Coerce first: comparing a plain list with 1 yields a scalar False, which
    # would silently select nothing.
    selected_cols = np.asarray(feature_mask) == 1
    if not selected_cols.any():
        return 0.0
    selected = X[:, selected_cols]
    # Fixed seeds for both the split and the model keep the fitness deterministic,
    # so differences between individuals reflect the mask rather than noise.
    X_fit, X_val, y_fit, y_val = train_test_split(selected, y, test_size=0.3, random_state=42)
    clf = GradientBoostingClassifier(random_state=42)
    clf.fit(X_fit, y_fit)
    return f1_score(y_val, clf.predict(X_val))

In [None]:
# Genetic Algorithm
def genetic_algorithm(X, y, pop_size=20, generations=20, fitness_fn=None, seed=None):
    """Search for a binary feature mask with a simple elitist genetic algorithm.

    Each generation the population is scored and sorted, the two best masks are
    carried over unchanged, and the remaining slots are filled with children
    produced by single-point crossover of two parents drawn from the top of the
    ranking, with a 10% chance of flipping one random bit.

    Args:
        X: 2-D feature matrix, one row per sample.
        y: Labels aligned with the rows of ``X``.
        pop_size: Number of masks per generation (at least 2).
        generations: Number of scoring rounds.
        fitness_fn: Callable ``(X, y, mask) -> score`` to maximise. Defaults to
            ``evaluate_fitness``.
        seed: Optional seed for the random generator, for reproducible runs.

    Returns:
        ``(best_mask, best_score)``: the highest-scoring mask seen and its score.

    Raises:
        ValueError: If ``pop_size`` is smaller than 2.
    """
    if pop_size < 2:
        raise ValueError("pop_size must be at least 2")
    if fitness_fn is None:
        fitness_fn = evaluate_fitness

    rng = np.random.default_rng(seed)
    dim = X.shape[1]
    n_elite = 2
    # Parents come from the top 10 masks, or from all of them in a smaller population.
    n_parents = min(10, pop_size)
    population = rng.integers(0, 2, size=(pop_size, dim))

    best_mask, best_score = None, -np.inf
    for gen in range(generations):
        fitness = np.array([fitness_fn(X, y, ind) for ind in population], dtype=float)
        order = np.argsort(fitness)[::-1]
        population = population[order]
        if fitness[order[0]] > best_score:
            best_mask, best_score = population[0].copy(), float(fitness[order[0]])

        # Offspring of the final round would never be scored, so skip breeding them.
        if gen == generations - 1:
            break

        # A Python list (not an ndarray slice) so children can be appended.
        next_gen = [ind.copy() for ind in population[:n_elite]]
        while len(next_gen) < pop_size:
            i1, i2 = rng.integers(0, n_parents, size=2)
            p1, p2 = population[i1], population[i2]
            if dim > 1:
                # Cut point in [1, dim-1] so both parents contribute at least one gene.
                cross = rng.integers(1, dim)
                child = np.concatenate([p1[:cross], p2[cross:]])
            else:
                child = p1.copy()
            if dim > 0 and rng.random() < 0.1:
                child[rng.integers(0, dim)] ^= 1
            next_gen.append(child)
        population = np.array(next_gen)

    if best_mask is None:
        # generations == 0: nothing was scored, fall back to the first random mask.
        best_mask = population[0].copy()
        best_score = fitness_fn(X, y, best_mask)
    return best_mask, best_score

In [None]:
# Run the GA search over the compressed features and report the winning mask
ga_result = genetic_algorithm(X, y)
best_features_ga, best_score_ga = ga_result
print(f"GA - Best F1 Score: {best_score_ga:.4f}, Selected Features: {np.sum(best_features_ga)}")

> 📝 The Jaya and ROA algorithms can be implemented similarly. Add them here if needed or use external packages like `mealpy`.

In [None]:
# Persist the GA-selected feature mask for the classification stage
mask_record = {"mask": best_features_ga}
with open("selected_features_ga.pkl", "wb") as f:
    pickle.dump(mask_record, f)

# Classification with LSTM and GBM on Selected Features
This notebook loads features selected by the Genetic Algorithm and evaluates classification performance using LSTM and GBM models.

In [None]:
import numpy as np
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input


In [None]:
# Load the compressed features/labels and the GA mask, then keep only the selected columns
with open("cbam_compressed_features.pkl", "rb") as f:
    data = pickle.load(f)
X, y = data['compressed_features'], data['y_train']

with open("selected_features_ga.pkl", "rb") as f:
    selection = pickle.load(f)
mask = selection['mask']

X_selected = X[:, mask == 1]

# NOTE(review): the GA fitness function scored masks on hold-out splits of this same
# data, so rows in the test split below were visible during feature selection and the
# reported scores are likely optimistic - consider splitting before feature selection.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

## Gradient Boosting Classifier (GBM)

In [None]:
# Fixed random_state so repeated runs of this cell report the same metrics.
gbm = GradientBoostingClassifier(random_state=42)
gbm.fit(X_train, y_train)
y_pred_gbm = gbm.predict(X_test)
print("GBM Accuracy:", accuracy_score(y_test, y_pred_gbm))
print("GBM F1 Score:", f1_score(y_test, y_pred_gbm))

## Long Short-Term Memory (LSTM) Network

In [None]:
# Add a singleton time axis: the LSTM expects [samples, timesteps, features], timesteps = 1
# NOTE(review): with a single timestep the LSTM never carries state between steps,
# so it sees each sample as one flat vector - confirm this is the intended design.
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Define LSTM model: one 64-unit LSTM layer feeding a single sigmoid output
lstm_layers = [
    Input(shape=(1, X_train.shape[1])),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
]
model = Sequential(lstm_layers)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_lstm, y_train, epochs=10, batch_size=16, validation_split=0.1, verbose=1)

# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels
lstm_probabilities = model.predict(X_test_lstm)
y_pred_lstm = (lstm_probabilities > 0.5).astype(int).flatten()
print("LSTM Accuracy:", accuracy_score(y_test, y_pred_lstm))
print("LSTM F1 Score:", f1_score(y_test, y_pred_lstm))