In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# Read the data into a dataframe
df = pd.read_csv('cleaned_engagement_data.csv')

# Separate the features and target variable
X = df[['total_direct_mentions', 
        'total_indirect_mentions', 
        'total_likes', 
        'total_retweets', 
        'total_project_followers', 
        'total_indirect_followers', 
        'soft_cap']]
y = df['ico_success']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Perform Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Define the parameter grids for grid search
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5, 10]
}

lr_param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

nb_param_grid = {}

svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
    'kernel': ['rbf']
}

catboost_param_grid = {"iterations": [1000], "learning_rate": [0.03], "depth": [10]}

# Define the models
models = {
    "Naïve Bayes": (GaussianNB(), nb_param_grid),
    "SVM": (SVC(random_state=42), svm_param_grid),
    "Logistic Regression": (LogisticRegression(random_state=42), lr_param_grid),
    "Random Forest": (RandomForestClassifier(random_state=42), rf_param_grid),
    "CatBoost": (CatBoostClassifier(), catboost_param_grid),
}

result = {
    "Model": [],
    "Best Parameters": [],
    "Average Accuracy": [],
    "Average Precision": [],
    "Average Recall": [],
    "Average F1 Score": [],
}

best_estimators = {}

# Perform grid search and cross-validation for each model
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()

    result["Best Parameters"].append(grid_search.best_params_)

    best_estimators[model_name] = grid_search.best_estimator_

    # Perform 5-fold cross-validation with the best model
    cv_results = cross_validate(grid_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])
    print("Cross-Validation Results:")
    print("=" * 50)
    overfitted_folds = 0  # Counter for overfitted folds
    for fold_idx, fold_result in enumerate(cv_results['test_accuracy']):
        if fold_result == 1.0:  # Check for overfitted fold
            overfitted_folds += 1
            continue  # Skip overfitted fold

        print(f"Fold {fold_idx+1}:")
        print(f"Accuracy: {fold_result}")
        print(f"Precision: {cv_results['test_precision'][fold_idx]}")
        print(f"Recall: {cv_results['test_recall'][fold_idx]}")
        print(f"F1 Score: {cv_results['test_f1'][fold_idx]}")
        print()

    # Calculate average results across non-overfitted folds
    num_folds = len(cv_results['test_accuracy'])
    num_non_overfitted_folds = num_folds - overfitted_folds
    avg_accuracy = (sum(cv_results['test_accuracy']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_precision = (sum(cv_results['test_precision']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_recall = (sum(cv_results['test_recall']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_f1 = (sum(cv_results['test_f1']) - (overfitted_folds*1)) / num_non_overfitted_folds

    # Print the average results
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1 Score: {avg_f1}")
    print()

    result["Average Accuracy"].append(avg_accuracy)
    result["Average Precision"].append(avg_precision)
    result["Average Recall"].append(avg_recall)
    result["Average F1 Score"].append(avg_f1)

result

Model: Naïve Bayes
Best Parameters:  {}

Cross-Validation Results:
Fold 1:
Accuracy: 0.45263157894736844
Precision: 0.3333333333333333
Recall: 0.09473684210526316
F1 Score: 0.14754098360655737

Fold 2:
Accuracy: 0.5
Precision: 0.5
Recall: 0.968421052631579
F1 Score: 0.6594982078853047

Fold 3:
Accuracy: 0.5
Precision: 0.5
Recall: 0.9473684210526315
F1 Score: 0.6545454545454545

Fold 4:
Accuracy: 0.4789473684210526
Precision: 0.48863636363636365
Recall: 0.9052631578947369
F1 Score: 0.6346863468634686

Fold 5:
Accuracy: 0.49473684210526314
Precision: 0.4968553459119497
Recall: 0.8315789473684211
F1 Score: 0.6220472440944882

Average Accuracy: 0.4852631578947369
Average Precision: 0.4637650085763293
Average Recall: 0.7494736842105263
Average F1 Score: 0.5436636473990546

Model: SVM
Best Parameters:  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8473684210526315
Precision: 0.7661290322580645
Recall: 1.0
F1 Score: 0.867579908675799

Fold 2:
Accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters:  {'C': 10, 'penalty': 'l2'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.6473684210526316
Precision: 0.6521739130434783
Recall: 0.631578947368421
F1 Score: 0.6417112299465241

Fold 2:
Accuracy: 0.6789473684210526
Precision: 0.717948717948718
Recall: 0.5894736842105263
F1 Score: 0.6473988439306358

Fold 3:
Accuracy: 0.6947368421052632
Precision: 0.7466666666666667
Recall: 0.5894736842105263
F1 Score: 0.6588235294117647

Fold 4:
Accuracy: 0.6157894736842106
Precision: 0.6486486486486487
Recall: 0.5052631578947369
F1 Score: 0.5680473372781065

Fold 5:
Accuracy: 0.6578947368421053
Precision: 0.7083333333333334
Recall: 0.5368421052631579
F1 Score: 0.6107784431137725

Average Accuracy: 0.6589473684210526
Average Precision: 0.694754255928169
Average Recall: 0.5705263157894737
Average F1 Score: 0.6253518767361608

Model: Random Forest
Best Parameters:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}

Cross-Validation Results:
Fold 1:
Accuracy: 0.85789473684210

{'Model': ['Naïve Bayes',
  'SVM',
  'Logistic Regression',
  'Random Forest',
  'CatBoost'],
 'Best Parameters': [{},
  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'},
  {'C': 10, 'penalty': 'l2'},
  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200},
  {'depth': 10, 'iterations': 1000, 'learning_rate': 0.03}],
 'Average Accuracy': [0.4852631578947369,
  0.8684210526315788,
  0.6589473684210526,
  0.8989473684210527,
  0.9168421052631579],
 'Average Precision': [0.4637650085763293,
  0.792547274749722,
  0.694754255928169,
  0.9278313459549296,
  0.9494078736797184],
 'Average Recall': [0.7494736842105263,
  1.0,
  0.5705263157894737,
  0.8694736842105264,
  0.8863157894736842],
 'Average F1 Score': [0.5436636473990546,
  0.884026921162544,
  0.6253518767361608,
  0.8962616835231767,
  0.9147852483186316]}

In [2]:
from datasets import Dataset

perf_ds = Dataset.from_dict(result).to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
perf_ds

Unnamed: 0,Model,Best Parameters,Average Accuracy,Average Precision,Average Recall,Average F1 Score
0,Naïve Bayes,"{'C': None, 'depth': None, 'gamma': None, 'ite...",0.485263,0.463765,0.749474,0.543664
1,SVM,"{'C': 1.0, 'depth': None, 'gamma': 0.01, 'iter...",0.868421,0.792547,1.0,0.884027
2,Logistic Regression,"{'C': 10.0, 'depth': None, 'gamma': None, 'ite...",0.658947,0.694754,0.570526,0.625352
3,Random Forest,"{'C': None, 'depth': None, 'gamma': None, 'ite...",0.898947,0.927831,0.869474,0.896262
4,CatBoost,"{'C': None, 'depth': 10.0, 'gamma': None, 'ite...",0.916842,0.949408,0.886316,0.914785


In [4]:
res2 = perf_ds.drop(columns=["Best Parameters"])

In [5]:
res2 = res2.set_index(res2.columns[0]).mul(100)
res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,48.526316,46.376501,74.947368,54.366365
SVM,86.842105,79.254727,100.0,88.402692
Logistic Regression,65.894737,69.475426,57.052632,62.535188
Random Forest,89.894737,92.783135,86.947368,89.626168
CatBoost,91.684211,94.940787,88.631579,91.478525


In [6]:
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,48.5%,46.4%,74.9%,54.4%
SVM,86.8%,79.3%,100.0%,88.4%
Logistic Regression,65.9%,69.5%,57.1%,62.5%
Random Forest,89.9%,92.8%,86.9%,89.6%
CatBoost,91.7%,94.9%,88.6%,91.5%


In [7]:
best_estimators

{'Naïve Bayes': GaussianNB(),
 'SVM': SVC(C=1, gamma=0.01, random_state=42),
 'Logistic Regression': LogisticRegression(C=10, random_state=42),
 'Random Forest': RandomForestClassifier(max_depth=10, n_estimators=200, random_state=42),
 'CatBoost': <catboost.core.CatBoostClassifier at 0x16e49f050>}

In [8]:
%%time
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

result = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": [],
}

for model_name in best_estimators:
    model = best_estimators[model_name]
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)
    
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_measure = f1_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F-measure:", f_measure)

    result["Accuracy"].append(accuracy)
    result["Precision"].append(precision)
    result["Recall"].append(recall)
    result["F1 Score"].append(f_measure)

result

Model: Naïve Bayes
Accuracy: 0.6463414634146342
Precision: 0.6540880503144654
Recall: 0.9719626168224299
F-measure: 0.7819548872180451
Model: SVM
Accuracy: 0.6524390243902439
Precision: 0.6524390243902439
Recall: 1.0
F-measure: 0.7896678966789668
Model: Logistic Regression
Accuracy: 0.6524390243902439
Precision: 0.8289473684210527
Recall: 0.5887850467289719
F-measure: 0.6885245901639344
Model: Random Forest
Accuracy: 0.7682926829268293
Precision: 0.7804878048780488
Recall: 0.897196261682243
F-measure: 0.8347826086956521
Model: CatBoost
Accuracy: 0.7621951219512195
Precision: 0.765625
Recall: 0.9158878504672897
F-measure: 0.8340425531914893
CPU times: user 22.6 ms, sys: 1.62 ms, total: 24.2 ms
Wall time: 23.1 ms


{'Model': ['Naïve Bayes',
  'SVM',
  'Logistic Regression',
  'Random Forest',
  'CatBoost'],
 'Accuracy': [0.6463414634146342,
  0.6524390243902439,
  0.6524390243902439,
  0.7682926829268293,
  0.7621951219512195],
 'Precision': [0.6540880503144654,
  0.6524390243902439,
  0.8289473684210527,
  0.7804878048780488,
  0.765625],
 'Recall': [0.9719626168224299,
  1.0,
  0.5887850467289719,
  0.897196261682243,
  0.9158878504672897],
 'F1 Score': [0.7819548872180451,
  0.7896678966789668,
  0.6885245901639344,
  0.8347826086956521,
  0.8340425531914893]}

In [9]:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from skorch import NeuralNetClassifier


class ClassifierModule(nn.Module):

    def __init__(
        self,
        num_features=7,
        num_units=1024,
        n_classes=2,
        nonlin=F.relu,
        dropout=0.1,
        depth=2,
        batchnorm=True,
    ):
        super(ClassifierModule, self).__init__()
        self.num_features = num_features
        self.num_units = num_units
        self.n_classes = n_classes
        self.nonlin = nonlin
        self.batchnorm = batchnorm
        self.depth = depth

        self.dense0 = nn.Linear(self.num_features, self.num_units)
        self.nonlin = self.nonlin
        self.dropout = nn.Dropout(dropout)

        layers = []
        for i in range(1, self.depth):
            layers.append(nn.Linear(self.num_units, self.num_units))
        self.dense1 = nn.Sequential(*layers)

        self.output = nn.Linear(self.num_units, self.n_classes)
        self.bn = nn.BatchNorm1d(self.n_classes)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = self.dropout(X)
        X = self.nonlin(self.dense1(X))
        X = self.output(X)

        if self.batchnorm:
            X = self.bn(X)

        X = F.softmax(X, dim=-1)
        return X

In [10]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)

In [11]:
device = (
    torch.device("cuda")
    if torch.cuda.is_available()
    else (
        torch.device("mps")
        if torch.backends.mps.is_available()
        else torch.device("cpu")
    )
)

device

device(type='mps')

In [12]:
import os
import shutil
from pathlib import Path

checkpoints_dir = "./checkpoints/table_6"
path = Path(checkpoints_dir)

if path.exists():
    print(f"checkpoints_dir: {checkpoints_dir} exists")
    for root, dirs, files in os.walk(checkpoints_dir):
        for file in files:
            checkpoint = f"{root}/{file}"
            print(f"deleting file: {checkpoint}")
            os.unlink(checkpoint)
        for dir in dirs:
            checkpoint = f"{root}/{dir}"
            print(f"deleting dir: {checkpoint}")
            shutil.rmtree(checkpoint)
else:
    print(f"checkpoints_dir: {checkpoints_dir} doesn't exist. creating it ...")
    path.parent.mkdir(parents=True, exist_ok=True)

checkpoints_dir: ./checkpoints/table_6 exists
deleting file: ./checkpoints/table_6/optimizer.pt
deleting file: ./checkpoints/table_6/criterion.pt
deleting file: ./checkpoints/table_6/history.json
deleting file: ./checkpoints/table_6/params.pt


In [13]:
X, y = X_train.to_numpy().astype(np.float32), y_train.to_numpy().astype(np.int64)
X_test, y_test = X_test.to_numpy().astype(np.float32), y_test.to_numpy().astype(
    np.int64
)
X.shape, y.shape, y.mean()

((652, 7), (652,), 0.7285276073619632)

In [14]:
from skorch.callbacks import Checkpoint

checkpoint = Checkpoint(dirname=f"{checkpoints_dir}", load_best=True)

optimal = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    lr=0.001,
    batch_size=20,
    module__depth=7,
    module__num_units=66,
    module__dropout=0.4,
    device=device,
    callbacks=[checkpoint],
    
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

In [15]:
%%time

_ = optimal.fit(X[:640], y[:640])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.7712[0m       [32m0.7188[0m        [35m0.7862[0m     +  0.5853
      2        [36m0.7039[0m       0.3984        [35m0.7470[0m     +  0.1830
      3        [36m0.7000[0m       0.6172        [35m0.6783[0m     +  0.1929
      4        0.7037       0.6641        [35m0.6696[0m     +  0.1803
      5        [36m0.6947[0m       [32m0.7500[0m        0.6702        0.1741
      6        [36m0.6902[0m       0.7344        0.6734        0.1788
      7        [36m0.6886[0m       0.7422        [35m0.6694[0m     +  0.1825
      8        [36m0.6773[0m       0.7500        [35m0.6636[0m     +  0.1804
      9        [36m0.6707[0m       0.7422        0.6664        0.1762
     10        0.6822       0.7266        [35m0.6616[0m     +  0.1765
CPU times: user 2.15 s, sys: 328 ms, total: 2.48 s
Wall time: 5.46 s


In [16]:
%%time

y_pred = optimal.predict(X_test)

CPU times: user 29.7 ms, sys: 5.79 ms, total: 35.5 ms
Wall time: 43.9 ms


In [17]:
net = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    lr=0.001,
    batch_size=20,
    module__depth=7,
    module__num_units=66,
    module__dropout=0.4,
    device=device,
    callbacks=[checkpoint],

    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)
net.initialize()
net.load_params(checkpoint=checkpoint)

y_pred2 = net.predict(X_test)
(y_pred == y_pred2).all()

True

In [18]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

model_name = "Neural Network"
result["Model"].append(model_name)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f_measure = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", f_measure)

result["Accuracy"].append(accuracy)
result["Precision"].append(precision)
result["Recall"].append(recall)
result["F1 Score"].append(f_measure)

Accuracy: 0.6646341463414634
Precision: 0.678082191780822
Recall: 0.9252336448598131
F-measure: 0.782608695652174


In [19]:
from datasets import Dataset

perf_ds = Dataset.from_dict(result).to_pandas()
perf_ds

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Naïve Bayes,0.646341,0.654088,0.971963,0.781955
1,SVM,0.652439,0.652439,1.0,0.789668
2,Logistic Regression,0.652439,0.828947,0.588785,0.688525
3,Random Forest,0.768293,0.780488,0.897196,0.834783
4,CatBoost,0.762195,0.765625,0.915888,0.834043
5,Neural Network,0.664634,0.678082,0.925234,0.782609


In [20]:
res2 = perf_ds.set_index(perf_ds.columns[0]).mul(100)
res2

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,64.634146,65.408805,97.196262,78.195489
SVM,65.243902,65.243902,100.0,78.96679
Logistic Regression,65.243902,82.894737,58.878505,68.852459
Random Forest,76.829268,78.04878,89.719626,83.478261
CatBoost,76.219512,76.5625,91.588785,83.404255
Neural Network,66.463415,67.808219,92.523364,78.26087


In [21]:
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,64.6%,65.4%,97.2%,78.2%
SVM,65.2%,65.2%,100.0%,79.0%
Logistic Regression,65.2%,82.9%,58.9%,68.9%
Random Forest,76.8%,78.0%,89.7%,83.5%
CatBoost,76.2%,76.6%,91.6%,83.4%
Neural Network,66.5%,67.8%,92.5%,78.3%
