In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Read the data into a dataframe
df = pd.read_csv("cleaned_sentiment_data.csv")

# Separate the features and target variable
X = df[
    [
        "total_positive_direct_mentions",
        "total_negative_direct_mentions",
        "total_positive_indirect_mentions",
        "total_negative_indirect_mentions",
        "soft_cap",
    ]
]
y = df["ico_success"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [2]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 5 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   total_positive_direct_mentions    816 non-null    int64  
 1   total_negative_direct_mentions    816 non-null    int64  
 2   total_positive_indirect_mentions  816 non-null    int64  
 3   total_negative_indirect_mentions  816 non-null    int64  
 4   soft_cap                          816 non-null    float64
dtypes: float64(1), int64(4)
memory usage: 32.0 KB


# Neural Network

To get started, we use a very simple classification problem and a very simple multi-layer perceptron architecture.

## Imports

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.datasets import make_classification
from torch import nn

from skorch import NeuralNetClassifier
from skorch.helper import SkorchDoctor

In [4]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)

In [5]:
plt.style.use("seaborn-v0_8")

## Load data

In [6]:
X, y = X_train.to_numpy().astype(np.float32), y_train.to_numpy().astype(np.int64)
X_test, y_test = X_test.to_numpy().astype(np.float32), y_test.to_numpy().astype(
    np.int64
)

In [7]:
X.shape, y.shape, y.mean()

((652, 5), (652,), 0.7285276073619632)

### Definition of the `PyTorch` classification `module`

This is just an MLP with two hidden layers.

In [8]:
class ClassifierModule(nn.Module):

    def __init__(
        self,
        num_features=5,
        num_units=1024,
        n_classes=2,
        nonlin=F.relu,
        dropout=0.1,
        depth=2,
        batchnorm=True,
    ):
        super(ClassifierModule, self).__init__()
        self.num_features = num_features
        self.num_units = num_units
        self.n_classes = n_classes
        self.nonlin = nonlin
        self.batchnorm = batchnorm
        self.depth = depth

        self.dense0 = nn.Linear(self.num_features, self.num_units)
        self.nonlin = self.nonlin
        self.dropout = nn.Dropout(dropout)

        layers = []
        for i in range(1, self.depth):
            layers.append(nn.Linear(self.num_units, self.num_units))
        self.dense1 = nn.Sequential(*layers)

        self.output = nn.Linear(self.num_units, self.n_classes)
        self.bn = nn.BatchNorm1d(self.n_classes)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = self.dropout(X)
        X = self.nonlin(self.dense1(X))
        X = self.output(X)

        if self.batchnorm:
            X = self.bn(X)

        X = F.softmax(X, dim=-1)
        return X

In [9]:
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)  # use gpu

device

device(type='cuda')

In [10]:
best_params = {
    "batch_size": 20,
    "max_epochs": 10,
    "module__depth": 6,
    "module__dropout": 0.4,
    "module__num_units": 66,
}

In [11]:
import os
import shutil
from pathlib import Path

checkpoints_dir = "./checkpoints"
path = Path(checkpoints_dir)

if path.exists():
    print(f"checkpoints_dir: {checkpoints_dir} exists")
    for root, dirs, files in os.walk(checkpoints_dir):
        for file in files:
            checkpoint = f"{root}/{file}"
            print(f"deleting file: {checkpoint}")
            os.unlink(checkpoint)
        for dir in dirs:
            checkpoint = f"{root}/{dir}"
            print(f"deleting dir: {checkpoint}")
            shutil.rmtree(checkpoint)
else:
    print(f"checkpoints_dir: {checkpoints_dir} doesn't exist. creating it ...")
    path.parent.mkdir(parents=True, exist_ok=True)

checkpoints_dir: ./checkpoints exists
deleting dir: ./checkpoints/cp1
deleting dir: ./checkpoints/cp2
deleting dir: ./checkpoints/cp3


In [12]:
models = {}

In [13]:
from skorch.callbacks import Checkpoint, TrainEndCheckpoint
from skorch import NeuralNetClassifier

cp = Checkpoint(dirname=f"{checkpoints_dir}/cp1")

optimal = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    lr=0.001,
    batch_size=20,
    module__depth=6,
    module__num_units=66,
    module__dropout=0.4,
    device=device,
    callbacks=[cp],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

models[cp] = optimal

In [14]:
%%time

optimal.fit(X[:640], y[:640])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.7821[0m       [32m0.4141[0m        [35m0.7576[0m     +  0.1524
      2        [36m0.7059[0m       [32m0.7578[0m        [35m0.6522[0m     +  0.0509
      3        0.7086       0.7031        0.7766        0.0555
      4        [36m0.6942[0m       0.7266        0.7259        0.0515
      5        [36m0.6812[0m       0.7266        0.6882        0.0493
      6        0.6875       0.7266        0.6895        0.0452
      7        [36m0.6811[0m       0.7266        0.6820        0.0461
      8        [36m0.6751[0m       0.7266        0.6691        0.0467
      9        [36m0.6740[0m       0.7266        0.6853        0.0430
     10        [36m0.6727[0m       0.7266        0.6829        0.0423
CPU times: user 865 ms, sys: 111 ms, total: 976 ms
Wall time: 1.03 s


<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (dense0): Linear(in_features=5, out_features=66, bias=True)
    (dropout): Dropout(p=0.4, inplace=False)
    (dense1): Sequential(
      (0): Linear(in_features=66, out_features=66, bias=True)
      (1): Linear(in_features=66, out_features=66, bias=True)
      (2): Linear(in_features=66, out_features=66, bias=True)
      (3): Linear(in_features=66, out_features=66, bias=True)
      (4): Linear(in_features=66, out_features=66, bias=True)
    )
    (output): Linear(in_features=66, out_features=2, bias=True)
    (bn): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  ),
)

In [15]:
from skorch.callbacks import LoadInitState

# load_state = LoadInitState(cp)
cp2 = Checkpoint(dirname=f"{checkpoints_dir}/cp2")

optimal2 = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=11,
    lr=0.001,
    batch_size=10,
    module__depth=7,
    module__num_units=66,
    module__dropout=0.4,
    device=device,
    callbacks=[cp2],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

models[cp2] = optimal2

In [16]:
%%time

_ = optimal2.fit(X[:640], y[:640])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.7052[0m       [32m0.7266[0m        [35m0.7569[0m     +  0.1045
      2        [36m0.7018[0m       0.7266        [35m0.7114[0m     +  0.0928
      3        [36m0.6860[0m       0.7266        [35m0.6895[0m     +  0.0870
      4        [36m0.6758[0m       0.7266        [35m0.6745[0m     +  0.0860
      5        [36m0.6729[0m       0.7266        0.7048        0.0945
      6        [36m0.6655[0m       0.7266        0.6869        0.0983
      7        [36m0.6626[0m       0.7266        [35m0.6596[0m     +  0.1055
      8        [36m0.6584[0m       0.7266        0.6844        0.1019
      9        [36m0.6537[0m       0.7266        0.6647        0.0926
     10        [36m0.6502[0m       [32m0.7344[0m        [35m0.6344[0m     +  0.0865
     11        [36m0.6470[0m       0.7344        0.6369        0.0934
CPU times:

In [17]:
# load_state = LoadInitState(cp2)
cp3 = Checkpoint(dirname=f"{checkpoints_dir}/cp3")

optimal3 = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    lr=0.001,
    batch_size=20,
    module__depth=6,
    module__num_units=80,
    module__dropout=0.4,
    device=device,
    callbacks=[cp3],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

models[cp3] = optimal3

In [18]:
%%time

_ = optimal3.fit(X[:640], y[:640])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.8001[0m       [32m0.2734[0m        [35m0.6922[0m     +  0.0554
      2        [36m0.7031[0m       [32m0.6172[0m        [35m0.6740[0m     +  0.0482
      3        0.7090       [32m0.6328[0m        0.6757        0.0499
      4        [36m0.6982[0m       [32m0.7422[0m        0.6748        0.0458
      5        [36m0.6886[0m       [32m0.7500[0m        [35m0.6731[0m     +  0.0511
      6        [36m0.6849[0m       0.7344        [35m0.6683[0m     +  0.0461
      7        [36m0.6813[0m       0.7422        [35m0.6669[0m     +  0.0496
      8        0.6861       0.7266        0.7003        0.0447
      9        [36m0.6710[0m       0.7422        [35m0.6663[0m     +  0.0450
     10        0.6713       0.7422        [35m0.6600[0m     +  0.0460
CPU times: user 468 ms, sys: 66.4 ms, total: 534 ms
Wall time: 521 ms


In [19]:
%%time
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

checkpoints = [cp, cp2, cp3]

best_f1 = 0
best_y_pred = None
best_net = None

for checkpoint in models:
    net = models[checkpoint]
    net.initialize()
    net.load_params(checkpoint=checkpoint)

    y_pred = net.predict(X_test)
    f_measure = f1_score(y_test, y_pred)
    if f_measure > best_f1:
        best_f1 = f_measure
        best_net = net
        best_cp = checkpoint

best_f1, best_cp.dirname

Re-initializing module because the following parameters were re-set: depth, dropout, num_units.
Re-initializing criterion.
Re-initializing optimizer.
Re-initializing module because the following parameters were re-set: depth, dropout, num_units.
Re-initializing criterion.
Re-initializing optimizer.
Re-initializing module because the following parameters were re-set: depth, dropout, num_units.
Re-initializing criterion.
Re-initializing optimizer.
CPU times: user 41.2 ms, sys: 865 µs, total: 42.1 ms
Wall time: 40.8 ms


(0.7985074626865671, './checkpoints/cp1')

In [20]:
%%time

y_pred = best_net.predict(X_test)

CPU times: user 4.52 ms, sys: 3.93 ms, total: 8.45 ms
Wall time: 7.42 ms


In [21]:
result = {
    "Model": ["Naïve Bayes", "SVM", "Logistic Regression", "Random Forest"],
    "Accuracy": [
        0.6585365853658537,
        0.6524390243902439,
        0.6646341463414634,
        0.7378048780487805,
    ],
    "Precision": [
        0.660377358490566,
        0.6524390243902439,
        0.8170731707317073,
        0.753968253968254,
    ],
    "Recall": [0.9813084112149533, 1.0, 0.6261682242990654, 0.8878504672897196],
    "F1 Score": [
        0.7894736842105263,
        0.7896678966789668,
        0.708994708994709,
        0.8154506437768241,
    ],
}

In [22]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

model_name = "Neural Network"
result["Model"].append(model_name)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f_measure = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", f_measure)

result["Accuracy"].append(accuracy)
result["Precision"].append(precision)
result["Recall"].append(recall)
result["F1 Score"].append(f_measure)

Accuracy: 0.6707317073170732
Precision: 0.6645962732919255
Recall: 1.0
F-measure: 0.7985074626865671


In [23]:
result

{'Model': ['Naïve Bayes',
  'SVM',
  'Logistic Regression',
  'Random Forest',
  'Neural Network'],
 'Accuracy': [0.6585365853658537,
  0.6524390243902439,
  0.6646341463414634,
  0.7378048780487805,
  0.6707317073170732],
 'Precision': [0.660377358490566,
  0.6524390243902439,
  0.8170731707317073,
  0.753968253968254,
  0.6645962732919255],
 'Recall': [0.9813084112149533,
  1.0,
  0.6261682242990654,
  0.8878504672897196,
  1.0],
 'F1 Score': [0.7894736842105263,
  0.7896678966789668,
  0.708994708994709,
  0.8154506437768241,
  0.7985074626865671]}

In [24]:
from datasets import Dataset

perf_ds2 = Dataset.from_dict(result).to_pandas()
res2 = perf_ds2.set_index(perf_ds2.columns[0]).mul(100)
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,65.9%,66.0%,98.1%,78.9%
SVM,65.2%,65.2%,100.0%,79.0%
Logistic Regression,66.5%,81.7%,62.6%,70.9%
Random Forest,73.8%,75.4%,88.8%,81.5%
Neural Network,67.1%,66.5%,100.0%,79.9%
