In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd

# Load the labelled posts from the CSV that sits next to the notebook.
df = pd.read_csv("./labelled_posts.csv")

# A repost is a row with zero engagement AND zero comments; keep everything
# else. The copy detaches the result from `df`.
is_repost = (df['engagement'] == 0) & (df['comments'] == 0)
no_reposts_df = df[~is_repost].copy()

# Text column and label column handed to the classifier below.
texts = no_reposts_df["content"]
labels = no_reposts_df["personal_exp"]


In [None]:
import keras_tuner as kt
from sklearn.model_selection import KFold
from tensorflow.keras import layers, Sequential, optimizers


class LIPostClassifier:
    """Tune and cross-validate an embedding + dense binary text classifier.

    The model is an ``Embedding`` layer, a ``Flatten`` layer, one to three
    ReLU ``Dense`` layers and a single sigmoid output unit. KerasTuner picks
    the embedding width, the number of hidden layers and each layer's width;
    K-fold cross-validation then scores the selected configuration.
    """

    def __init__(self, inputs, targets, kfold_num=5) -> None:
        """Store the data and derive the vectorisation sizes.

        Args:
            inputs: pandas Series of strings (``.str`` and ``.to_numpy()``
                are called on it).
            targets: pandas Series of labels aligned with ``inputs``.
            kfold_num: number of cross-validation folds.
        """
        self.MAX_EPOCHS = 50

        # The character length of the longest text doubles as the vocabulary
        # cap. Cast to a plain int so Keras never receives a NumPy/float
        # scalar as a layer size.
        self.max_features = int(inputs.str.len().max())
        # Token sequences are padded/truncated to 1% of that length; clamp to
        # at least one token so short corpora do not yield an empty sequence.
        self.sequence_length = max(1, int(0.01 * self.max_features))
        self.vectorizer = None

        self.inputs = inputs.to_numpy()
        self.targets = targets.to_numpy()
        self.kfolder = KFold(n_splits=kfold_num, shuffle=True, random_state=42)

    def create_hypermodel(self, hp):
        """Build and compile one candidate model for KerasTuner.

        Args:
            hp: KerasTuner ``HyperParameters`` object used to sample
                ``output_dim``, ``num_layers`` and ``units_<i>``.

        Returns:
            A compiled ``Sequential`` model (binary cross-entropy loss, SGD
            with learning rate 0.1, accuracy metric).
        """
        embedding_dim = hp.Int('output_dim', min_value=16,
                               max_value=80, step=16)

        model = Sequential()
        # +1 on the vocabulary size leaves room beyond the vectorizer's
        # ``max_tokens`` cap.
        model.add(layers.Embedding(self.max_features + 1,
                  embedding_dim, input_length=self.sequence_length))
        model.add(layers.Flatten())

        # Hidden layers: the width of each layer is tuned separately.
        for i in range(hp.Int("num_layers", 1, 3)):
            model.add(
                layers.Dense(
                    units=hp.Int(f"units_{i}", min_value=16,
                                 max_value=128, step=16),
                    activation="relu"
                )
            )

        # Output layer: one sigmoid unit for the binary label.
        model.add(layers.Dense(1, activation='sigmoid'))

        model.compile(loss="binary_crossentropy",
                      optimizer=optimizers.legacy.SGD(
                          learning_rate=0.1),
                      metrics=["accuracy"])

        return model

    def _describe_architecture(self, hp_values):
        """Summarise the network that ``create_hypermodel`` builds.

        Only the first ``num_layers`` hidden layers are considered: the
        hyperparameter dict can still carry ``units_<i>`` entries for layers
        that are not part of the selected model.

        Args:
            hp_values: mapping with ``output_dim``, ``num_layers`` and the
                ``units_<i>`` entries of the active hidden layers.

        Returns:
            ``(units, total_params)`` where ``units`` lists the active hidden
            layer widths as strings and ``total_params`` is the number of
            weights and biases in the embedding, hidden and output layers.
        """
        embedding_dim = hp_values['output_dim']
        widths = [
            hp_values[f"units_{i}"]
            for i in range(hp_values['num_layers'])
            if f"units_{i}" in hp_values
        ]

        # Embedding table: one row per vocabulary index.
        total_params = (self.max_features + 1) * embedding_dim
        # Flatten feeds sequence_length * embedding_dim values to the first
        # dense layer.
        fan_in = self.sequence_length * embedding_dim
        for width in widths:
            total_params += fan_in * width + width  # weights + biases
            fan_in = width
        # Single-unit output layer.
        total_params += fan_in + 1

        return [str(width) for width in widths], total_params

    def model_eval(self):
        """Run one tuning + K-fold evaluation pass.

        For every fold the vectorizer is adapted on the training split only,
        the tuner is searched, the best configuration is trained for
        ``MAX_EPOCHS`` to find the epoch with the highest validation
        accuracy, and a fresh model trained for that many epochs is scored
        on the held-out split.

        Returns:
            dict with the mean and standard deviation of the per-fold
            accuracy (in percent), the mean loss, the chosen ``output_dim``
            and ``num_layers``, the active hidden layer widths (``units``)
            and the parameter count of that architecture (``total_params``).
        """
        tuner = kt.BayesianOptimization(self.create_hypermodel,
                                        objective='val_accuracy',
                                        directory="archive",
                                        project_name="bayes",
                                        overwrite=True
                                        )

        # Per-fold score containers.
        acc_per_fold = []
        loss_per_fold = []

        # NOTE(review): one tuner instance is shared by all folds; confirm
        # whether `search` still runs new trials after the first fold has
        # used up the trial budget - TODO verify.
        for train, val in self.kfolder.split(self.inputs, self.targets):
            # Learn the vocabulary from the training split only.
            self.vectorizer = layers.TextVectorization(
                max_tokens=self.max_features,
                output_mode='int',
                output_sequence_length=self.sequence_length)
            self.vectorizer.adapt(self.inputs[train])

            # Vectorise up front: the models take integer token ids.
            X_train = self.vectorizer(self.inputs[train])
            y_train = self.targets[train]
            tuner.search(X_train, y_train,
                         validation_split=0.2, verbose=False)

            best_hps = tuner.get_best_hyperparameters()[0]

            # Train the best configuration for the full epoch budget to find
            # the epoch with the highest validation accuracy.
            model = tuner.hypermodel.build(best_hps)
            history = model.fit(
                X_train, y_train, epochs=self.MAX_EPOCHS, validation_split=0.2, verbose=False)

            val_acc_per_epoch = history.history['val_accuracy']
            best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1

            # Retrain from scratch for exactly that many epochs.
            hypermodel = tuner.hypermodel.build(best_hps)
            hypermodel.fit(X_train, y_train, epochs=best_epoch,
                           validation_split=0.2, verbose=False)

            # Score on the held-out split: scores[0] is the loss, scores[1]
            # the accuracy metric set in `compile`.
            X_val = self.vectorizer(self.inputs[val])
            y_val = self.targets[val]
            scores = hypermodel.evaluate(
                X_val, y_val, verbose=False)

            acc_per_fold.append(scores[1] * 100)
            loss_per_fold.append(scores[0])

        # Report the best hyperparameters with their accuracy/loss metrics.
        best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
        units, total_params = self._describe_architecture(best_hps.values)

        return {
            "accuracy": round(np.mean(acc_per_fold), 3),
            "accuracy_std": round(np.std(acc_per_fold), 3),
            "loss": round(np.mean(loss_per_fold), 3),
            'output_dim': best_hps.values['output_dim'],
            'num_layers': best_hps.values['num_layers'],
            "units": units,
            "total_params": total_params,
        }

    def find_best_model(self, tries):
        """Run ``model_eval`` ``tries`` times and print a comparison table.

        Args:
            tries: number of independent tuning/evaluation passes.
        """
        data = [self.model_eval() for _ in range(tries)]

        print("Accuracy (± std)\t", "Loss\t", "Embedding Dim.\t",
              "No. of Layers\t", "Total Params\t", "Hidden Layers\t")
        for hp_data in data:
            print(f"{hp_data['accuracy']} (±{hp_data['accuracy_std']})\t\t",
                  f"{hp_data['loss']}\t",
                  f"{hp_data['output_dim']}\t\t",
                  f"{hp_data['num_layers']}\t\t",
                  f"{hp_data['total_params']}\t\t",
                  f"{' → '.join(hp_data['units'])}\t",
                  )


In [None]:
# Build the classifier on the filtered posts, then run ten independent
# tuning + cross-validation passes and print the comparison table.
classifier = LIPostClassifier(inputs=texts, targets=labels)
classifier.find_best_model(tries=10)

{'learning_rate': 0.1, 'output_dim': 32, 'num_layers': 1, 'units_0': 128, 'units_1': 32, 'units_2': 64, 'tuner/epochs': 50, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}

{'learning_rate': 0.1, 'output_dim': 32, 'num_layers': 1, 'units_0': 128, 'units_1': 32, 'units_2': 64, 'tuner/epochs': 50, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}

```
{'learning_rate': 0.1, 'output_dim': 48, 'num_layers': 2, 'units_0': 80, 'units_1': 112, 'units_2': 96, 'tuner/epochs': 50, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 29, 48)            139584    
                                                                 
 flatten (Flatten)           (None, 1392)              0         
                                                                 
 dense (Dense)               (None, 80)                111440    
                                                                 
 dense_1 (Dense)             (None, 112)               9072      
                                                                 
 dense_2 (Dense)             (None, 1)                 113       
                                                                 
=================================================================
Total params: 260209 (1016.44 KB)
Trainable params: 260209 (1016.44 KB)
Non-trainable params: 0 (0.00 Byte)
```

```
{'learning_rate': 0.05, 'output_dim': 80, 'num_layers': 2, 'units_0': 128, 'units_1': 32, 'units_2': 80, 'tuner/epochs': 50, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 29, 80)            232640    
                                                                 
 flatten (Flatten)           (None, 2320)              0         
                                                                 
 dense (Dense)               (None, 128)               297088    
                                                                 
 dense_1 (Dense)             (None, 32)                4128      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
=================================================================
Total params: 533889 (2.04 MB)
Trainable params: 533889 (2.04 MB)
Non-trainable params: 0 (0.00 Byte)
```


---

```
Average scores for all folds:
> Accuracy: 75.868079662323 (+- 4.182610074413319)
> Loss: 0.6991891145706177
------------------------------------------------------------------------
{'learning_rate': 0.1, 'output_dim': 48, 'num_layers': 3, 'units_0': 112, 'units_1': 32, 'units_2': 128, 'tuner/epochs': 50, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 29, 48)            139584    
                                                                 
 flatten (Flatten)           (None, 1392)              0         
                                                                 
 dense (Dense)               (None, 112)               156016    
                                                                 
 dense_1 (Dense)             (None, 32)                3616      
                                                                 
 dense_2 (Dense)             (None, 128)               4224      
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
=================================================================
Total params: 303569 (1.16 MB)
Trainable params: 303569 (1.16 MB)
```

---
```
Average scores for all folds:
> Accuracy: 74.80519533157349 (+- 1.2396930942487876)
> Loss: 0.6298975884914398
------------------------------------------------------------------------
{'learning_rate': 0.1, 'output_dim': 80, 'num_layers': 3, 'units_0': 80, 'units_1': 64, 'units_2': 64, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0068'}
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 29, 80)            232640    
                                                                 
 flatten (Flatten)           (None, 2320)              0         
                                                                 
 dense (Dense)               (None, 80)                185680    
                                                                 
 dense_1 (Dense)             (None, 64)                5184      
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
=================================================================
Total params: 427729 (1.63 MB)
Trainable params: 427729 (1.63 MB)
Non-trainable params: 0 (0.00 Byte)
```
---

---
```
Average scores for all folds:
> Accuracy: 77.69993305206299 (+- 4.263463988494678)
> Loss: 0.527130925655365
{'output_dim': 16, 'num_layers': 1, 'units_0': 64}
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 29, 16)            46528     
                                                                 
 flatten (Flatten)           (None, 464)               0         
                                                                 
 dense (Dense)               (None, 64)                29760     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
=================================================================
Total params: 76353 (298.25 KB)
Trainable params: 76353 (298.25 KB)
```