In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Read the data into a dataframe
df = pd.read_csv("cleaned_engagement_data.csv")

# Separate the features and target variable
X = df[
    [
        "total_direct_mentions",
        "total_indirect_mentions",
        "total_likes",
        "total_retweets",
        "total_project_followers",
        "total_indirect_followers",
        "soft_cap",
    ]
]
y = df["ico_success"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Neural Network

To get started, we use a very simple classification problem and a very simple multi-layer perceptron architecture.

## Imports

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.datasets import make_classification
from torch import nn

from skorch import NeuralNetClassifier
from skorch.helper import SkorchDoctor

In [3]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)

In [4]:
plt.style.use("seaborn-v0_8")

## Load data

In [5]:
X, y = X_train.to_numpy().astype(np.float32), y_train.to_numpy().astype(np.int64)
X_test, y_test = X_test.to_numpy().astype(np.float32), y_test.to_numpy().astype(
    np.int64
)

In [6]:
X.shape, y.shape, y.mean()

((652, 7), (652,), 0.7285276073619632)

### Definition of the `PyTorch` classification `module`

This is just an MLP with two hidden layers.

In [7]:
class ClassifierModule(nn.Module):

    def __init__(
        self,
        num_features=7,
        num_units=1024,
        n_classes=2,
        nonlin=F.relu,
        dropout=0.1,
        depth=2,
        batchnorm=True,
    ):
        super(ClassifierModule, self).__init__()
        self.num_features = num_features
        self.num_units = num_units
        self.n_classes = n_classes
        self.nonlin = nonlin
        self.batchnorm = batchnorm
        self.depth = depth

        self.dense0 = nn.Linear(self.num_features, self.num_units)
        self.nonlin = self.nonlin
        self.dropout = nn.Dropout(dropout)

        layers = []
        for i in range(1, self.depth):
            layers.append(nn.Linear(self.num_units, self.num_units))
        self.dense1 = nn.Sequential(*layers)

        self.output = nn.Linear(self.num_units, self.n_classes)
        self.bn = nn.BatchNorm1d(self.n_classes)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = self.dropout(X)
        X = self.nonlin(self.dense1(X))
        X = self.output(X)

        if self.batchnorm:
            X = self.bn(X)

        X = F.softmax(X, dim=-1)
        return X

In [8]:
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)  # use gpu

device

device(type='cuda')

In [9]:
best_params = {
    "batch_size": 20,
    "max_epochs": 10,
    "module__depth": 6,
    "module__dropout": 0.4,
    "module__num_units": 66,
}

In [10]:
import os
import shutil
from pathlib import Path

checkpoints_dir = "./checkpoints"
path = Path(checkpoints_dir)

if path.exists():
    print(f"checkpoints_dir: {checkpoints_dir} exists")
    for root, dirs, files in os.walk(checkpoints_dir):
        for file in files:
            checkpoint = f"{root}/{file}"
            print(f"deleting file: {checkpoint}")
            os.unlink(checkpoint)
        for dir in dirs:
            checkpoint = f"{root}/{dir}"
            print(f"deleting dir: {checkpoint}")
            shutil.rmtree(checkpoint)
else:
    print(f"checkpoints_dir: {checkpoints_dir} doesn't exist. creating it ...")
    path.parent.mkdir(parents=True, exist_ok=True)

checkpoints_dir: ./checkpoints exists
deleting dir: ./checkpoints/cp1
deleting dir: ./checkpoints/cp2
deleting dir: ./checkpoints/cp3


In [11]:
models = {}

In [12]:
from skorch.callbacks import Checkpoint, TrainEndCheckpoint
from skorch import NeuralNetClassifier

cp = Checkpoint(dirname=f"{checkpoints_dir}/cp1")

optimal = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    lr=0.001,
    batch_size=20,
    module__depth=6,
    module__num_units=66,
    module__dropout=0.4,
    device=device,
    callbacks=[cp],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

models[cp] = optimal

In [13]:
%%time

optimal.fit(X[:640], y[:640])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.9041[0m       [32m0.2578[0m        [35m0.7963[0m     +  0.1596
      2        [36m0.7319[0m       [32m0.2969[0m        [35m0.7785[0m     +  0.0464
      3        [36m0.7088[0m       [32m0.5312[0m        [35m0.6882[0m     +  0.0473
      4        [36m0.7076[0m       [32m0.6406[0m        [35m0.6692[0m     +  0.0497
      5        [36m0.6969[0m       [32m0.6953[0m        0.6698        0.0489
      6        [36m0.6869[0m       [32m0.7031[0m        [35m0.6613[0m     +  0.0492
      7        [36m0.6810[0m       [32m0.7188[0m        0.6674        0.0571
      8        0.6890       0.7188        0.6826        0.0445
      9        [36m0.6755[0m       0.7188        0.7006        0.0466
     10        0.6803       [32m0.7422[0m        0.6628        0.0460
CPU times: user 894 ms, sys: 137 ms, total: 1.03 s
Wall

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (dense0): Linear(in_features=7, out_features=66, bias=True)
    (dropout): Dropout(p=0.4, inplace=False)
    (dense1): Sequential(
      (0): Linear(in_features=66, out_features=66, bias=True)
      (1): Linear(in_features=66, out_features=66, bias=True)
      (2): Linear(in_features=66, out_features=66, bias=True)
      (3): Linear(in_features=66, out_features=66, bias=True)
      (4): Linear(in_features=66, out_features=66, bias=True)
    )
    (output): Linear(in_features=66, out_features=2, bias=True)
    (bn): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  ),
)

In [14]:
from skorch.callbacks import LoadInitState

# load_state = LoadInitState(cp)
cp2 = Checkpoint(dirname=f"{checkpoints_dir}/cp2")

optimal2 = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    lr=0.001,
    batch_size=20,
    module__depth=7,
    module__num_units=66,
    module__dropout=0.4,
    device=device,
    callbacks=[cp2],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

models[cp2] = optimal2

In [15]:
%%time

_ = optimal2.fit(X[:640], y[:640])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.7735[0m       [32m0.6328[0m        [35m0.6499[0m     +  0.0553
      2        0.7787       0.6328        [35m0.6485[0m     +  0.0494
      3        [36m0.7332[0m       [32m0.7266[0m        0.7040        0.0537
      4        [36m0.7080[0m       [32m0.7578[0m        0.6541        0.0517
      5        [36m0.6854[0m       0.7422        0.6716        0.0531
      6        0.6942       0.7266        0.7321        0.0490
      7        [36m0.6797[0m       0.7266        0.6976        0.0482
      8        [36m0.6742[0m       0.7422        0.6642        0.0458
      9        [36m0.6625[0m       0.7344        0.6539        0.0520
     10        0.6732       0.7266        0.7404        0.0508
CPU times: user 468 ms, sys: 82 ms, total: 550 ms
Wall time: 536 ms


In [16]:
# load_state = LoadInitState(cp2)
cp3 = Checkpoint(dirname=f"{checkpoints_dir}/cp3")

optimal3 = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    lr=0.001,
    batch_size=20,
    module__depth=6,
    module__num_units=80,
    module__dropout=0.4,
    device=device,
    callbacks=[cp3],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

models[cp3] = optimal3

In [17]:
%%time

_ = optimal3.fit(X[:640], y[:640])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.7625[0m       [32m0.7266[0m        [35m0.6981[0m     +  0.0518
      2        [36m0.6957[0m       0.7266        0.7133        0.0498
      3        [36m0.6952[0m       0.7266        0.7275        0.0495
      4        [36m0.6930[0m       0.7266        [35m0.6954[0m     +  0.0525
      5        0.6944       0.6328        0.7495        0.0535
      6        [36m0.6860[0m       0.7266        0.7094        0.0488
      7        0.6862       0.7266        [35m0.6757[0m     +  0.0495
      8        [36m0.6766[0m       [32m0.7422[0m        [35m0.6673[0m     +  0.0470
      9        [36m0.6735[0m       0.7266        0.6758        0.0498
     10        [36m0.6681[0m       0.7266        0.6845        0.0514
CPU times: user 437 ms, sys: 109 ms, total: 546 ms
Wall time: 534 ms


In [18]:
%%time
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

checkpoints = [cp, cp2, cp3]

best_f1 = 0
best_y_pred = None
best_net = None

for checkpoint in models:
    net = models[checkpoint]
    net.initialize()
    net.load_params(checkpoint=checkpoint)

    y_pred = net.predict(X_test)
    f_measure = f1_score(y_test, y_pred)
    if f_measure > best_f1:
        best_f1 = f_measure
        best_net = net
        best_cp = checkpoint

best_f1, best_cp.dirname

Re-initializing module because the following parameters were re-set: depth, dropout, num_units.
Re-initializing criterion.
Re-initializing optimizer.
Re-initializing module because the following parameters were re-set: depth, dropout, num_units.
Re-initializing criterion.
Re-initializing optimizer.
Re-initializing module because the following parameters were re-set: depth, dropout, num_units.
Re-initializing criterion.
Re-initializing optimizer.
CPU times: user 35.8 ms, sys: 0 ns, total: 35.8 ms
Wall time: 34.4 ms


(0.8235294117647058, './checkpoints/cp2')

In [19]:
%%time

y_pred = best_net.predict(X_test)

CPU times: user 6.95 ms, sys: 0 ns, total: 6.95 ms
Wall time: 6.24 ms


In [20]:
result = {
    "Model": ["Naïve Bayes", "SVM", "Logistic Regression", "Random Forest"],
    "Accuracy": [
        0.6463414634146342,
        0.6524390243902439,
        0.6524390243902439,
        0.7682926829268293,
    ],
    "Precision": [
        0.6540880503144654,
        0.6524390243902439,
        0.8289473684210527,
        0.7804878048780488,
    ],
    "Recall": [0.9719626168224299, 1.0, 0.5887850467289719, 0.897196261682243],
    "F1 Score": [
        0.7819548872180451,
        0.7896678966789668,
        0.6885245901639344,
        0.8347826086956521,
    ],
}

In [21]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

model_name = "Neural Network"
result["Model"].append(model_name)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f_measure = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", f_measure)

result["Accuracy"].append(accuracy)
result["Precision"].append(precision)
result["Recall"].append(recall)
result["F1 Score"].append(f_measure)

Accuracy: 0.7621951219512195
Precision: 0.7982456140350878
Recall: 0.8504672897196262
F-measure: 0.8235294117647058


In [22]:
result

{'Model': ['Naïve Bayes',
  'SVM',
  'Logistic Regression',
  'Random Forest',
  'Neural Network'],
 'Accuracy': [0.6463414634146342,
  0.6524390243902439,
  0.6524390243902439,
  0.7682926829268293,
  0.7621951219512195],
 'Precision': [0.6540880503144654,
  0.6524390243902439,
  0.8289473684210527,
  0.7804878048780488,
  0.7982456140350878],
 'Recall': [0.9719626168224299,
  1.0,
  0.5887850467289719,
  0.897196261682243,
  0.8504672897196262],
 'F1 Score': [0.7819548872180451,
  0.7896678966789668,
  0.6885245901639344,
  0.8347826086956521,
  0.8235294117647058]}

In [23]:
from datasets import Dataset

perf_ds2 = Dataset.from_dict(result).to_pandas()
res2 = perf_ds2.set_index(perf_ds2.columns[0]).mul(100)
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,64.6%,65.4%,97.2%,78.2%
SVM,65.2%,65.2%,100.0%,79.0%
Logistic Regression,65.2%,82.9%,58.9%,68.9%
Random Forest,76.8%,78.0%,89.7%,83.5%
Neural Network,76.2%,79.8%,85.0%,82.4%
