In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Read the data into a dataframe
df = pd.read_csv("cleaned_data.csv")

# Separate the features and target variable
X = df[
    [
        "total_direct_mentions",
        "total_indirect_mentions",
        "total_likes",
        "total_retweets",
        "total_project_followers",
        "total_indirect_followers",
        "total_positive_direct_mentions",
        "total_negative_direct_mentions",
        "total_positive_indirect_mentions",
        "total_negative_indirect_mentions",
        "soft_cap",
    ]
]
y = df["ico_success"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [2]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   total_direct_mentions             816 non-null    int64
 1   total_indirect_mentions           816 non-null    int64
 2   total_likes                       816 non-null    int64
 3   total_retweets                    816 non-null    int64
 4   total_project_followers           816 non-null    int64
 5   total_indirect_followers          816 non-null    int64
 6   total_positive_direct_mentions    816 non-null    int64
 7   total_negative_direct_mentions    816 non-null    int64
 8   total_positive_indirect_mentions  816 non-null    int64
 9   total_negative_indirect_mentions  816 non-null    int64
 10  soft_cap                          816 non-null    int64
dtypes: int64(11)
memory usage: 70.3 KB


# Neural Network

To get started, we use a very simple classification problem and a very simple multi-layer perceptron architecture.

## Imports

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.datasets import make_classification
from torch import nn

from skorch import NeuralNetClassifier
from skorch.helper import SkorchDoctor

In [4]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)

In [5]:
plt.style.use("seaborn-v0_8")

## Load data

In [6]:
X, y = X_train.to_numpy().astype(np.float32), y_train.to_numpy().astype(np.int64)
X_test, y_test = X_test.to_numpy().astype(np.float32), y_test.to_numpy().astype(
    np.int64
)

In [7]:
X.shape, y.shape, y.mean()

((652, 11), (652,), 0.7285276073619632)

### Definition of the `PyTorch` classification `module`

This is just an MLP with two hidden layers.

In [8]:
class ClassifierModule(nn.Module):

    def __init__(
        self,
        num_features=11,
        num_units=1024,
        n_classes=2,
        nonlin=F.relu,
        dropout=0.1,
        depth=2,
        batchnorm=True,
    ):
        super(ClassifierModule, self).__init__()
        self.num_features = num_features
        self.num_units = num_units
        self.n_classes = n_classes
        self.nonlin = nonlin
        self.batchnorm = batchnorm
        self.depth = depth

        self.dense0 = nn.Linear(self.num_features, self.num_units)
        self.nonlin = self.nonlin
        self.dropout = nn.Dropout(dropout)

        layers = []
        for i in range(1, self.depth):
            layers.append(nn.Linear(self.num_units, self.num_units))
        self.dense1 = nn.Sequential(*layers)

        self.output = nn.Linear(self.num_units, self.n_classes)
        self.bn = nn.BatchNorm1d(self.n_classes)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = self.dropout(X)
        X = self.nonlin(self.dense1(X))
        X = self.output(X)

        if self.batchnorm:
            X = self.bn(X)

        X = F.softmax(X, dim=-1)
        return X

In [9]:
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)  # use gpu

device

device(type='cuda')

In [10]:
best_params = {
    "batch_size": 20,
    "max_epochs": 10,
    "module__depth": 6,
    "module__dropout": 0.4,
    "module__num_units": 66,
}

In [11]:
import os
import shutil
from pathlib import Path

checkpoints_dir = "./checkpoints"
path = Path(checkpoints_dir)

if path.exists():
    print(f"checkpoints_dir: {checkpoints_dir} exists")
    for root, dirs, files in os.walk(checkpoints_dir):
        for file in files:
            checkpoint = f"{root}/{file}"
            print(f"deleting file: {checkpoint}")
            os.unlink(checkpoint)
        for dir in dirs:
            checkpoint = f"{root}/{dir}"
            print(f"deleting dir: {checkpoint}")
            shutil.rmtree(checkpoint)
else:
    print(f"checkpoints_dir: {checkpoints_dir} doesn't exist. creating it ...")
    path.parent.mkdir(parents=True, exist_ok=True)

checkpoints_dir: ./checkpoints exists
deleting dir: ./checkpoints/cp1
deleting dir: ./checkpoints/cp2
deleting dir: ./checkpoints/cp3


In [12]:
models = {}

In [13]:
from skorch.callbacks import Checkpoint, TrainEndCheckpoint
from skorch import NeuralNetClassifier

cp = Checkpoint(dirname=f"{checkpoints_dir}/cp1")

optimal = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    lr=0.001,
    batch_size=20,
    module__depth=6,
    module__num_units=66,
    module__dropout=0.4,
    device=device,
    callbacks=[cp],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

models[cp] = optimal

In [14]:
%%time

optimal.fit(X[:640], y[:640])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.7773[0m       [32m0.5938[0m        [35m0.6842[0m     +  0.1405
      2        [36m0.7016[0m       [32m0.6172[0m        [35m0.6762[0m     +  0.0466
      3        [36m0.6943[0m       [32m0.6250[0m        0.6814        0.0486
      4        0.6954       0.4062        0.6893        0.0507
      5        [36m0.6801[0m       [32m0.6406[0m        [35m0.6761[0m     +  0.0459
      6        0.6894       [32m0.7109[0m        [35m0.6691[0m     +  0.0486
      7        0.6807       [32m0.7188[0m        [35m0.6666[0m     +  0.0455
      8        [36m0.6724[0m       [32m0.7500[0m        0.6709        0.0459
      9        [36m0.6649[0m       0.7422        [35m0.6661[0m     +  0.0467
     10        0.6723       0.7266        0.7268        0.0447
CPU times: user 778 ms, sys: 174 ms, total: 953 ms
Wall time: 993 ms


<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (dense0): Linear(in_features=11, out_features=66, bias=True)
    (dropout): Dropout(p=0.4, inplace=False)
    (dense1): Sequential(
      (0): Linear(in_features=66, out_features=66, bias=True)
      (1): Linear(in_features=66, out_features=66, bias=True)
      (2): Linear(in_features=66, out_features=66, bias=True)
      (3): Linear(in_features=66, out_features=66, bias=True)
      (4): Linear(in_features=66, out_features=66, bias=True)
    )
    (output): Linear(in_features=66, out_features=2, bias=True)
    (bn): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  ),
)

In [15]:
from skorch.callbacks import LoadInitState

# load_state = LoadInitState(cp)
cp2 = Checkpoint(dirname=f"{checkpoints_dir}/cp2")

optimal2 = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    lr=0.001,
    batch_size=20,
    module__depth=7,
    module__num_units=66,
    module__dropout=0.4,
    device=device,
    callbacks=[cp2],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

models[cp2] = optimal2

In [16]:
%%time

_ = optimal2.fit(X[:640], y[:640])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.7567[0m       [32m0.7188[0m        [35m0.6974[0m     +  0.0541
      2        [36m0.7026[0m       0.7109        0.7229        0.0506
      3        [36m0.6958[0m       [32m0.7266[0m        [35m0.6961[0m     +  0.0533
      4        0.6993       [32m0.7344[0m        [35m0.6742[0m     +  0.0511
      5        [36m0.6912[0m       [32m0.7422[0m        [35m0.6618[0m     +  0.0526
      6        [36m0.6862[0m       0.7266        0.6728        0.0523
      7        [36m0.6830[0m       0.7266        0.6795        0.0459
      8        [36m0.6782[0m       0.7344        0.6704        0.0479
      9        [36m0.6749[0m       0.7266        0.6724        0.0477
     10        [36m0.6719[0m       0.7266        0.6687        0.0476
CPU times: user 507 ms, sys: 38.4 ms, total: 545 ms
Wall time: 532 ms


In [17]:
# load_state = LoadInitState(cp2)
cp3 = Checkpoint(dirname=f"{checkpoints_dir}/cp3")

optimal3 = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    lr=0.001,
    batch_size=20,
    module__depth=6,
    module__num_units=80,
    module__dropout=0.4,
    device=device,
    callbacks=[cp3],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

models[cp3] = optimal3

In [18]:
%%time

_ = optimal3.fit(X[:640], y[:640])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.7863[0m       [32m0.7500[0m        [35m0.6352[0m     +  0.0507
      2        [36m0.7045[0m       0.6484        0.6816        0.0528
      3        [36m0.6982[0m       0.4531        0.8045        0.0485
      4        [36m0.6894[0m       0.6172        0.6710        0.0465
      5        [36m0.6822[0m       0.7266        0.6908        0.0516
      6        0.6850       0.7266        0.6836        0.0521
      7        0.6838       0.7266        0.7210        0.0491
      8        [36m0.6757[0m       0.7422        0.6679        0.0463
      9        [36m0.6711[0m       0.7500        0.6626        0.0518
     10        0.6744       0.7500        0.6556        0.0502
CPU times: user 457 ms, sys: 76.8 ms, total: 534 ms
Wall time: 521 ms


In [19]:
%%time
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

checkpoints = [cp, cp2, cp3]

best_f1 = 0
best_y_pred = None
best_net = None

for checkpoint in models:
    net = models[checkpoint]
    net.initialize()
    net.load_params(checkpoint=checkpoint)

    y_pred = net.predict(X_test)
    f_measure = f1_score(y_test, y_pred)
    if f_measure > best_f1:
        best_f1 = f_measure
        best_net = net
        best_cp = checkpoint

best_f1, best_cp.dirname

Re-initializing module because the following parameters were re-set: depth, dropout, num_units.
Re-initializing criterion.
Re-initializing optimizer.
Re-initializing module because the following parameters were re-set: depth, dropout, num_units.
Re-initializing criterion.
Re-initializing optimizer.
Re-initializing module because the following parameters were re-set: depth, dropout, num_units.
Re-initializing criterion.
Re-initializing optimizer.
CPU times: user 35.6 ms, sys: 0 ns, total: 35.6 ms
Wall time: 34.6 ms


(0.7969348659003831, './checkpoints/cp3')

In [20]:
%%time

y_pred = best_net.predict(X_test)

CPU times: user 7.8 ms, sys: 0 ns, total: 7.8 ms
Wall time: 6.62 ms


In [21]:
result = {
    "Model": ["Naïve Bayes", "SVM", "Logistic Regression", "Random Forest"],
    "Accuracy": [
        0.6402439024390244,
        0.6524390243902439,
        0.6463414634146342,
        0.7804878048780488,
    ],
    "Precision": [
        0.6518987341772152,
        0.6524390243902439,
        0.855072463768116,
        0.7933884297520661,
    ],
    "Recall": [0.9626168224299065, 1.0, 0.5514018691588785, 0.897196261682243],
    "F1 Score": [
        0.7773584905660378,
        0.7896678966789668,
        0.6704545454545454,
        0.8421052631578947,
    ],
}

In [22]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

model_name = "Neural Network"
result["Model"].append(model_name)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f_measure = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", f_measure)

result["Accuracy"].append(accuracy)
result["Precision"].append(precision)
result["Recall"].append(recall)
result["F1 Score"].append(f_measure)

Accuracy: 0.676829268292683
Precision: 0.6753246753246753
Recall: 0.9719626168224299
F-measure: 0.7969348659003831


In [23]:
result

{'Model': ['Naïve Bayes',
  'SVM',
  'Logistic Regression',
  'Random Forest',
  'Neural Network'],
 'Accuracy': [0.6402439024390244,
  0.6524390243902439,
  0.6463414634146342,
  0.7804878048780488,
  0.676829268292683],
 'Precision': [0.6518987341772152,
  0.6524390243902439,
  0.855072463768116,
  0.7933884297520661,
  0.6753246753246753],
 'Recall': [0.9626168224299065,
  1.0,
  0.5514018691588785,
  0.897196261682243,
  0.9719626168224299],
 'F1 Score': [0.7773584905660378,
  0.7896678966789668,
  0.6704545454545454,
  0.8421052631578947,
  0.7969348659003831]}

In [24]:
from datasets import Dataset

perf_ds2 = Dataset.from_dict(result).to_pandas()
res2 = perf_ds2.set_index(perf_ds2.columns[0]).mul(100)
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,64.0%,65.2%,96.3%,77.7%
SVM,65.2%,65.2%,100.0%,79.0%
Logistic Regression,64.6%,85.5%,55.1%,67.0%
Random Forest,78.0%,79.3%,89.7%,84.2%
Neural Network,67.7%,67.5%,97.2%,79.7%
