In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# Read the data into a dataframe
df = pd.read_csv("cleaned_combined_data.csv")

# Separate the features and target variable
X = df[
    [
        "total_direct_mentions",
        "total_indirect_mentions",
        "total_likes",
        "total_retweets",
        "total_project_followers",
        "total_indirect_followers",
        "total_positive_direct_mentions",
        "total_negative_direct_mentions",
        "total_positive_indirect_mentions",
        "total_negative_indirect_mentions",
        "soft_cap",
    ]
]
y = df["ico_success"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Perform Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Define the parameter grids for grid search
rf_param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5, 10],
}

lr_param_grid = {"C": [0.1, 1, 10], "penalty": ["l1", "l2"]}

nb_param_grid = {}

svm_param_grid = {
    "C": [0.1, 1, 10],
    "gamma": [1e-2, 1e-3, 1e-4, 1e-5],
    "kernel": ["rbf"],
}

catboost_param_grid = {"iterations": [1000], "learning_rate": [0.03], "depth": [10]}

# Define the models
models = {
    "Naïve Bayes": (GaussianNB(), nb_param_grid),
    "SVM": (SVC(random_state=42), svm_param_grid),
    "Logistic Regression": (LogisticRegression(random_state=42), lr_param_grid),
    "Random Forest": (RandomForestClassifier(random_state=42), rf_param_grid),
    "CatBoost": (CatBoostClassifier(), catboost_param_grid),
}

result = {
    "Model": [],
    "Best Parameters": [],
    "Average Accuracy": [],
    "Average Precision": [],
    "Average Recall": [],
    "Average F1 Score": [],
}

best_estimators = {}

# Perform grid search and cross-validation for each model
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(
        estimator=model, param_grid=param_grid, cv=5, scoring="accuracy"
    )
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()

    result["Best Parameters"].append(grid_search.best_params_)

    best_estimators[model_name] = grid_search.best_estimator_

    # Perform 5-fold cross-validation with the best model
    cv_results = cross_validate(
        grid_search.best_estimator_,
        X_resampled,
        y_resampled,
        cv=5,
        scoring=["accuracy", "precision", "recall", "f1"],
    )
    print("Cross-Validation Results:")
    print("=" * 50)
    overfitted_folds = 0  # Counter for overfitted folds
    for fold_idx, fold_result in enumerate(cv_results["test_accuracy"]):
        if fold_result == 1.0:  # Check for overfitted fold
            overfitted_folds += 1
            continue  # Skip overfitted fold

        print(f"Fold {fold_idx+1}:")
        print(f"Accuracy: {fold_result}")
        print(f"Precision: {cv_results['test_precision'][fold_idx]}")
        print(f"Recall: {cv_results['test_recall'][fold_idx]}")
        print(f"F1 Score: {cv_results['test_f1'][fold_idx]}")
        print()

    # Calculate average results across non-overfitted folds
    num_folds = len(cv_results["test_accuracy"])
    num_non_overfitted_folds = num_folds - overfitted_folds
    avg_accuracy = (
        sum(cv_results["test_accuracy"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds
    avg_precision = (
        sum(cv_results["test_precision"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds
    avg_recall = (
        sum(cv_results["test_recall"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds
    avg_f1 = (
        sum(cv_results["test_f1"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds

    # Print the average results
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1 Score: {avg_f1}")
    print()

    result["Average Accuracy"].append(avg_accuracy)
    result["Average Precision"].append(avg_precision)
    result["Average Recall"].append(avg_recall)
    result["Average F1 Score"].append(avg_f1)

result

Model: Naïve Bayes
Best Parameters:  {}

Cross-Validation Results:
Fold 1:
Accuracy: 0.45263157894736844
Precision: 0.3333333333333333
Recall: 0.09473684210526316
F1 Score: 0.14754098360655737

Fold 2:
Accuracy: 0.5
Precision: 0.5
Recall: 0.968421052631579
F1 Score: 0.6594982078853047

Fold 3:
Accuracy: 0.5105263157894737
Precision: 0.5055555555555555
Recall: 0.9578947368421052
F1 Score: 0.6618181818181819

Fold 4:
Accuracy: 0.4789473684210526
Precision: 0.4887640449438202
Recall: 0.9157894736842105
F1 Score: 0.6373626373626373

Fold 5:
Accuracy: 0.5105263157894737
Precision: 0.5061728395061729
Recall: 0.8631578947368421
F1 Score: 0.6381322957198443

Average Accuracy: 0.4905263157894737
Average Precision: 0.46676515466777635
Average Recall: 0.76
Average F1 Score: 0.548870461278505

Model: SVM
Best Parameters:  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8473684210526315
Precision: 0.7661290322580645
Recall: 1.0
F1 Score: 0.867579908675799

Fo

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters:  {'C': 0.1, 'penalty': 'l2'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.7105263157894737
Precision: 0.7127659574468085
Recall: 0.7052631578947368
F1 Score: 0.708994708994709

Fold 2:
Accuracy: 0.6894736842105263
Precision: 0.7647058823529411
Recall: 0.5473684210526316
F1 Score: 0.6380368098159509

Fold 3:
Accuracy: 0.7263157894736842
Precision: 0.7792207792207793
Recall: 0.631578947368421
F1 Score: 0.6976744186046512

Fold 4:
Accuracy: 0.6210526315789474
Precision: 0.676923076923077
Recall: 0.4631578947368421
F1 Score: 0.55

Fold 5:
Accuracy: 0.6789473684210526
Precision: 0.7575757575757576
Recall: 0.5263157894736842
F1 Score: 0.6211180124223602

Average Accuracy: 0.6852631578947369
Average Precision: 0.7382382907038728
Average Recall: 0.5747368421052631
Average F1 Score: 0.6431647899675342

Model: Random Forest


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best Parameters:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8368421052631579
Precision: 0.8076923076923077
Recall: 0.8842105263157894
F1 Score: 0.8442211055276382

Fold 2:
Accuracy: 0.8578947368421053
Precision: 0.8953488372093024
Recall: 0.8105263157894737
F1 Score: 0.850828729281768

Fold 3:
Accuracy: 0.9210526315789473
Precision: 1.0
Recall: 0.8421052631578947
F1 Score: 0.9142857142857143

Fold 4:
Accuracy: 0.9368421052631579
Precision: 0.9770114942528736
Recall: 0.8947368421052632
F1 Score: 0.9340659340659341

Fold 5:
Accuracy: 0.9368421052631579
Precision: 1.0
Recall: 0.8736842105263158
F1 Score: 0.9325842696629213

Average Accuracy: 0.8978947368421052
Average Precision: 0.9360105278308968
Average Recall: 0.8610526315789475
Average F1 Score: 0.8951971505647952

Model: CatBoost
0:	learn: 0.6800058	total: 93.7ms	remaining: 1m 33s
1:	learn: 0.6660809	total: 106ms	remaining: 52.8s
2:	learn: 0.6525236	total: 112ms	rema

{'Model': ['Naïve Bayes',
  'SVM',
  'Logistic Regression',
  'Random Forest',
  'CatBoost'],
 'Best Parameters': [{},
  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'},
  {'C': 0.1, 'penalty': 'l2'},
  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200},
  {'depth': 10, 'iterations': 1000, 'learning_rate': 0.03}],
 'Average Accuracy': [0.4905263157894737,
  0.8684210526315788,
  0.6852631578947369,
  0.8978947368421052,
  0.9105263157894736],
 'Average Precision': [0.46676515466777635,
  0.792547274749722,
  0.7382382907038728,
  0.9360105278308968,
  0.9410805300713557],
 'Average Recall': [0.76,
  1.0,
  0.5747368421052631,
  0.8610526315789475,
  0.8842105263157896],
 'Average F1 Score': [0.548870461278505,
  0.884026921162544,
  0.6431647899675342,
  0.8951971505647952,
  0.90908708709899]}

In [2]:
from datasets import Dataset

perf_ds = Dataset.from_dict(result).to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
perf_ds

Unnamed: 0,Model,Best Parameters,Average Accuracy,Average Precision,Average Recall,Average F1 Score
0,Naïve Bayes,"{'C': None, 'depth': None, 'gamma': None, 'ite...",0.490526,0.466765,0.76,0.54887
1,SVM,"{'C': 1.0, 'depth': None, 'gamma': 0.01, 'iter...",0.868421,0.792547,1.0,0.884027
2,Logistic Regression,"{'C': 0.1, 'depth': None, 'gamma': None, 'iter...",0.685263,0.738238,0.574737,0.643165
3,Random Forest,"{'C': None, 'depth': None, 'gamma': None, 'ite...",0.897895,0.936011,0.861053,0.895197
4,CatBoost,"{'C': None, 'depth': 10.0, 'gamma': None, 'ite...",0.910526,0.941081,0.884211,0.909087


In [4]:
res2 = perf_ds.drop(columns=["Best Parameters"])

In [5]:
res2 = res2.set_index(res2.columns[0]).mul(100)
res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,49.052632,46.676515,76.0,54.887046
SVM,86.842105,79.254727,100.0,88.402692
Logistic Regression,68.526316,73.823829,57.473684,64.316479
Random Forest,89.789474,93.601053,86.105263,89.519715
CatBoost,91.052632,94.108053,88.421053,90.908709


In [6]:
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,49.1%,46.7%,76.0%,54.9%
SVM,86.8%,79.3%,100.0%,88.4%
Logistic Regression,68.5%,73.8%,57.5%,64.3%
Random Forest,89.8%,93.6%,86.1%,89.5%
CatBoost,91.1%,94.1%,88.4%,90.9%


In [7]:
best_estimators

{'Naïve Bayes': GaussianNB(),
 'SVM': SVC(C=1, gamma=0.01, random_state=42),
 'Logistic Regression': LogisticRegression(C=0.1, random_state=42),
 'Random Forest': RandomForestClassifier(max_depth=10, n_estimators=200, random_state=42),
 'CatBoost': <catboost.core.CatBoostClassifier at 0x7fb9466a2290>}

In [8]:
%%time
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

result = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": [],
}

for model_name in best_estimators:
    model = best_estimators[model_name]
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)
    
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_measure = f1_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F-measure:", f_measure)

    result["Accuracy"].append(accuracy)
    result["Precision"].append(precision)
    result["Recall"].append(recall)
    result["F1 Score"].append(f_measure)

result

Model: Naïve Bayes
Accuracy: 0.6402439024390244
Precision: 0.6518987341772152
Recall: 0.9626168224299065
F-measure: 0.7773584905660378
Model: SVM
Accuracy: 0.6524390243902439
Precision: 0.6524390243902439
Recall: 1.0
F-measure: 0.7896678966789668
Model: Logistic Regression
Accuracy: 0.7195121951219512
Precision: 0.9178082191780822
Recall: 0.6261682242990654
F-measure: 0.7444444444444445
Model: Random Forest
Accuracy: 0.7804878048780488
Precision: 0.7933884297520661
Recall: 0.897196261682243
F-measure: 0.8421052631578947
Model: CatBoost
Accuracy: 0.7804878048780488
Precision: 0.7709923664122137
Recall: 0.9439252336448598
F-measure: 0.8487394957983193
CPU times: user 48.3 ms, sys: 0 ns, total: 48.3 ms
Wall time: 43.8 ms


{'Model': ['Naïve Bayes',
  'SVM',
  'Logistic Regression',
  'Random Forest',
  'CatBoost'],
 'Accuracy': [0.6402439024390244,
  0.6524390243902439,
  0.7195121951219512,
  0.7804878048780488,
  0.7804878048780488],
 'Precision': [0.6518987341772152,
  0.6524390243902439,
  0.9178082191780822,
  0.7933884297520661,
  0.7709923664122137],
 'Recall': [0.9626168224299065,
  1.0,
  0.6261682242990654,
  0.897196261682243,
  0.9439252336448598],
 'F1 Score': [0.7773584905660378,
  0.7896678966789668,
  0.7444444444444445,
  0.8421052631578947,
  0.8487394957983193]}

In [9]:
import numpy as np

X, y = X_train.to_numpy().astype(np.float32), y_train.to_numpy().astype(np.int64)
X_test, y_test = X_test.to_numpy().astype(np.float32), y_test.to_numpy().astype(
    np.int64
)
X.shape, y.shape, y.mean()

((652, 11), (652,), 0.7285276073619632)

In [10]:
import torch
from skorch import NeuralNetClassifier
from neural_net import ClassifierModule

device = (
    torch.device("cuda")
    if torch.cuda.is_available()
    else (
        torch.device("mps")
        if torch.backends.mps.is_available()
        else torch.device("cpu")
    )
)

device

device(type='cuda')

In [11]:
# change this to True to force re-training the model
force_retrain = False

In [12]:
import os
import shutil
from pathlib import Path

checkpoints_dir = "./checkpoints/04a_combined_data_training"
path = Path(checkpoints_dir)

training_needed = force_retrain

if path.exists():
    print(f"checkpoints_dir: {checkpoints_dir} exists")
    if force_retrain:
        for root, dirs, files in os.walk(checkpoints_dir):
            for file in files:
                checkpoint = f"{root}/{file}"
                print(f"deleting file: {checkpoint}")
                os.unlink(checkpoint)
            for dir in dirs:
                checkpoint = f"{root}/{dir}"
                print(f"deleting dir: {checkpoint}")
                shutil.rmtree(checkpoint)
else:
    print(f"checkpoints_dir: {checkpoints_dir} doesn't exist. creating it ...")
    path.parent.mkdir(parents=True, exist_ok=True)
    training_needed = True

training_needed

checkpoints_dir: ./checkpoints/04a_combined_data_training doesn't exist. creating it ...


True

In [13]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)

In [14]:
from skorch.callbacks import Checkpoint, TrainEndCheckpoint
from skorch import NeuralNetClassifier

checkpoint = Checkpoint(dirname=f"{checkpoints_dir}", load_best=True)

optimal = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=50,
    lr=0.001,
    batch_size=40,
    module__depth=7,
    module__num_features=X.shape[1],
    module__num_units=85,
    module__dropout=0.0,
    device=device,
    callbacks=[checkpoint],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

In [15]:
%%time

if training_needed:
    _ = optimal.fit(X[:650], y[:650])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.6961[0m       [32m0.3923[0m        [35m0.6939[0m     +  0.1269
      2        0.7097       [32m0.6538[0m        [35m0.6740[0m     +  0.0232
      3        0.7139       [32m0.7231[0m        0.6851        0.0242
      4        [36m0.6888[0m       0.7154        0.6827        0.0280
      5        [36m0.6858[0m       0.7000        0.6801        0.0288
      6        [36m0.6852[0m       [32m0.7538[0m        0.6831        0.0244
      7        [36m0.6837[0m       0.7385        0.6799        0.0298
      8        [36m0.6823[0m       0.7308        0.6771        0.0283
      9        [36m0.6814[0m       0.7538        0.6783        0.0277
     10        [36m0.6798[0m       0.7308        0.6793        0.0271
     11        [36m0.6783[0m       0.7385        0.6772        0.0286
     12        [36m0.6752[0m       0.7538   

In [16]:
%%time

if training_needed:
    y_pred_optimal = optimal.predict(X_test)

CPU times: user 7.01 ms, sys: 0 ns, total: 7.01 ms
Wall time: 5.76 ms


In [17]:
net = NeuralNetClassifier(
    ClassifierModule,
    module__depth=7,
    module__num_features=X.shape[1],
    module__num_units=85,
    module__dropout=0.0,
    device=device,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)
net.initialize()
net.load_params(checkpoint=checkpoint)

In [18]:
%%time

y_pred = net.predict(X_test)
(y_pred == y_pred_optimal).all() if training_needed else True

CPU times: user 6.12 ms, sys: 0 ns, total: 6.12 ms
Wall time: 5.48 ms


True

In [19]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

model_name = "Neural Network"
result["Model"].append(model_name)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f_measure = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", f_measure)

result["Accuracy"].append(accuracy)
result["Precision"].append(precision)
result["Recall"].append(recall)
result["F1 Score"].append(f_measure)

Accuracy: 0.6707317073170732
Precision: 0.6645962732919255
Recall: 1.0
F-measure: 0.7985074626865671


In [20]:
from datasets import Dataset

perf_ds = Dataset.from_dict(result).to_pandas()
perf_ds

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Naïve Bayes,0.640244,0.651899,0.962617,0.777358
1,SVM,0.652439,0.652439,1.0,0.789668
2,Logistic Regression,0.719512,0.917808,0.626168,0.744444
3,Random Forest,0.780488,0.793388,0.897196,0.842105
4,CatBoost,0.780488,0.770992,0.943925,0.848739
5,Neural Network,0.670732,0.664596,1.0,0.798507


In [21]:
res2 = perf_ds.set_index(perf_ds.columns[0]).mul(100)
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,64.0%,65.2%,96.3%,77.7%
SVM,65.2%,65.2%,100.0%,79.0%
Logistic Regression,72.0%,91.8%,62.6%,74.4%
Random Forest,78.0%,79.3%,89.7%,84.2%
CatBoost,78.0%,77.1%,94.4%,84.9%
Neural Network,67.1%,66.5%,100.0%,79.9%
