<a href="https://colab.research.google.com/github/jpcompartir/BertopicR/blob/main/campaign_preds_tf_keras_larger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, median_absolute_error
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import Callback
from tqdm.notebook import tqdm
from google.colab import drive


In [4]:
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
file_path = '/content/drive/My Drive/colab_data/nn_outliers_removed_10k_sample.csv' # Didn't actually sample this to 10k so it's all of the data
data = pd.read_csv(file_path)

data.head(), data.describe()

(   audience    channel   type  country  engagements
 0     11420  instagram   post  unknown          299
 1     14203  instagram   post       ES         1983
 2     33181  instagram  story  unknown           65
 3     53196  instagram   post  unknown          515
 4       418  instagram  story  unknown            9,
             audience    engagements
 count  151372.000000  151372.000000
 mean    60981.752861    1571.414436
 std     70133.254881    2212.216245
 min         8.000000       1.000000
 25%     14300.000000     239.000000
 50%     30600.000000     714.000000
 75%     80800.000000    1862.000000
 max    335341.000000   13338.000000)

In [6]:
features = data.drop('engagements', axis=1)
target = data['engagements']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['audience']),
        ('cat', OneHotEncoder(), ['channel', 'type', 'country'])
    ]
)

In [7]:
X = preprocessor.fit_transform(features)
y = target.values

X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

In [8]:
class TQDMProgressBar(Callback):
    def __init__(self, epoch):
        super().__init__()
        self.epoch = epoch
        self.progress_bar = None

    def on_epoch_begin(self, epoch, logs=None):
        self.progress_bar = tqdm(total=self.epoch, desc='Training Progress')

    def on_epoch_end(self, epoch, logs=None):
        self.progress_bar.set_postfix(logs)
        self.progress_bar.update(1)

    def on_train_end(self, logs=None):
        self.progress_bar.close()

In [9]:
class MetricsCallback(Callback):
    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        val_pred = self.model.predict(self.validation_data[0])
        val_true = self.validation_data[1]
        median_ae = median_absolute_error(val_true, val_pred)
        mse = mean_squared_error(val_true, val_pred)  # Calculating MSE
        percentage_error = np.mean(np.abs((val_true - val_pred) / val_true)) * 100
        # Adding MSE, median absolute error, and percentage error to logs
        logs['median_ae'] = median_ae
        logs['mse'] = mse
        logs['percentage_error'] = percentage_error


In [13]:
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.1),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

In [14]:
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Instantiate the progress bar callback
progress_bar = TQDMProgressBar(epoch=200)

In [None]:
# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    verbose=0,
    callbacks=[progress_bar]
)

print(history.history.keys())

# Evaluate the model
test_loss = model.evaluate(X_test, y_test)
print(f'Test loss: {test_loss}')

Training Progress:   0%|          | 0/200 [00:00<?, ?it/s]

Training Progress:   0%|          | 0/200 [00:00<?, ?it/s]

Training Progress:   0%|          | 0/200 [00:00<?, ?it/s]

Training Progress:   0%|          | 0/200 [00:00<?, ?it/s]

Training Progress:   0%|          | 0/200 [00:00<?, ?it/s]

Training Progress:   0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
import matplotlib.pyplot as plt

# Example of plotting MSE and Median AE
plt.plot(history.history['mse'], label='MSE')
plt.plot(history.history['median_ae'], label='Median AE')
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.title('Model Error Progression')
plt.legend()
plt.show()

In [None]:
model.save('/content/drive/My Drive/colab_models/keras_prediction_model.h5')
model.save_weights('my_model_weights.h5')

Loading the model back later to predict:

In [None]:
from tensorflow.keras.models import load_model
model = load_model('path_to_my_model.h5')

In [None]:
predictions = model.predict(X_new)
print(predictions)