In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# Read the data into a dataframe
df = pd.read_csv("cleaned_sentiment_data.csv")

# Separate the features and target variable
X = df[
    [
        "total_positive_direct_mentions",
        "total_negative_direct_mentions",
        "total_positive_indirect_mentions",
        "total_negative_indirect_mentions",
        "soft_cap",
    ]
]
y = df["ico_success"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Perform Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Define the parameter grids for grid search
rf_param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5, 10],
}

lr_param_grid = {"C": [0.1, 1, 10], "penalty": ["l1", "l2"]}

nb_param_grid = {}

svm_param_grid = {
    "C": [0.1, 1, 10],
    "gamma": [1e-2, 1e-3, 1e-4, 1e-5],
    "kernel": ["rbf"],
}

catboost_param_grid = {"iterations": [1000], "learning_rate": [0.03], "depth": [10]}

# Define the models
models = {
    "Naïve Bayes": (GaussianNB(), nb_param_grid),
    "SVM": (SVC(random_state=42), svm_param_grid),
    "Logistic Regression": (LogisticRegression(random_state=42), lr_param_grid),
    "Random Forest": (RandomForestClassifier(random_state=42), rf_param_grid),
    "CatBoost": (CatBoostClassifier(), catboost_param_grid),
}

result = {
    "Model": [],
    "Best Parameters": [],
    "Average Accuracy": [],
    "Average Precision": [],
    "Average Recall": [],
    "Average F1 Score": [],
}

best_estimators = {}

# Perform grid search and cross-validation for each model
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(
        estimator=model, param_grid=param_grid, cv=5, scoring="accuracy"
    )
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()

    result["Best Parameters"].append(grid_search.best_params_)

    best_estimators[model_name] = grid_search.best_estimator_

    # Perform 5-fold cross-validation with the best model
    cv_results = cross_validate(
        grid_search.best_estimator_,
        X_resampled,
        y_resampled,
        cv=5,
        scoring=["accuracy", "precision", "recall", "f1"],
    )
    print("Cross-Validation Results:")
    print("=" * 50)
    overfitted_folds = 0  # Counter for overfitted folds
    for fold_idx, fold_result in enumerate(cv_results["test_accuracy"]):
        if fold_result == 1.0:  # Check for overfitted fold
            overfitted_folds += 1
            continue  # Skip overfitted fold

        print(f"Fold {fold_idx+1}:")
        print(f"Accuracy: {fold_result}")
        print(f"Precision: {cv_results['test_precision'][fold_idx]}")
        print(f"Recall: {cv_results['test_recall'][fold_idx]}")
        print(f"F1 Score: {cv_results['test_f1'][fold_idx]}")
        print()

    # Calculate results across non-overfitted folds
    num_folds = len(cv_results["test_accuracy"])
    num_non_overfitted_folds = num_folds - overfitted_folds
    avg_accuracy = (
        sum(cv_results["test_accuracy"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds
    avg_precision = (
        sum(cv_results["test_precision"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds
    avg_recall = (
        sum(cv_results["test_recall"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds
    avg_f1 = (
        sum(cv_results["test_f1"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds

    # Print the average results
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1 Score: {avg_f1}")
    print()

    result["Average Accuracy"].append(avg_accuracy)
    result["Average Precision"].append(avg_precision)
    result["Average Recall"].append(avg_recall)
    result["Average F1 Score"].append(avg_f1)

result

Model: Naïve Bayes
Best Parameters:  {}

Cross-Validation Results:
Fold 1:
Accuracy: 0.4842105263157895
Precision: 0.4444444444444444
Recall: 0.12631578947368421
F1 Score: 0.19672131147540983

Fold 2:
Accuracy: 0.5
Precision: 0.5
Recall: 0.9789473684210527
F1 Score: 0.6619217081850534

Fold 3:
Accuracy: 0.5105263157894737
Precision: 0.5055555555555555
Recall: 0.9578947368421052
F1 Score: 0.6618181818181819

Fold 4:
Accuracy: 0.49473684210526314
Precision: 0.4972375690607735
Recall: 0.9473684210526315
F1 Score: 0.6521739130434783

Fold 5:
Accuracy: 0.5263157894736842
Precision: 0.5159235668789809
Recall: 0.8526315789473684
F1 Score: 0.6428571428571429

Average Accuracy: 0.503157894736842
Average Precision: 0.4926322271879509
Average Recall: 0.7726315789473684
Average F1 Score: 0.5630984514758531

Model: SVM
Best Parameters:  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8473684210526315
Precision: 0.7661290322580645
Recall: 1.0
F1 Score: 0.86757

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters:  {'C': 0.1, 'penalty': 'l2'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.6789473684210526
Precision: 0.7125
Recall: 0.6
F1 Score: 0.6514285714285715

Fold 2:
Accuracy: 0.6526315789473685
Precision: 0.6705882352941176
Recall: 0.6
F1 Score: 0.6333333333333333

Fold 3:
Accuracy: 0.7473684210526316
Precision: 0.7640449438202247
Recall: 0.7157894736842105
F1 Score: 0.7391304347826086

Fold 4:
Accuracy: 0.7
Precision: 0.7878787878787878
Recall: 0.5473684210526316
F1 Score: 0.6459627329192547

Fold 5:
Accuracy: 0.7368421052631579
Precision: 0.835820895522388
Recall: 0.5894736842105263
F1 Score: 0.691358024691358

Average Accuracy: 0.7031578947368421
Average Precision: 0.7541665725031036
Average Recall: 0.6105263157894737
Average F1 Score: 0.6722426194310251

Model: Random Forest
Best Parameters:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8
Precision: 0.7878787878787878
Recall: 0.8210526315789474
F1 Score: 

{'Model': ['Naïve Bayes',
  'SVM',
  'Logistic Regression',
  'Random Forest',
  'CatBoost'],
 'Best Parameters': [{},
  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'},
  {'C': 0.1, 'penalty': 'l2'},
  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200},
  {'depth': 10, 'iterations': 1000, 'learning_rate': 0.03}],
 'Average Accuracy': [0.503157894736842,
  0.8684210526315788,
  0.7031578947368421,
  0.8652631578947367,
  0.8768421052631579],
 'Average Precision': [0.4926322271879509,
  0.792547274749722,
  0.7541665725031036,
  0.9175244542691351,
  0.934115368058545],
 'Average Recall': [0.7726315789473684,
  1.0,
  0.6105263157894737,
  0.8126315789473683,
  0.8210526315789475],
 'Average F1 Score': [0.5630984514758531,
  0.884026921162544,
  0.6722426194310251,
  0.85928445897126,
  0.8707019171304886]}

In [2]:
from datasets import Dataset

perf_ds = Dataset.from_dict(result).to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
perf_ds

Unnamed: 0,Model,Best Parameters,Average Accuracy,Average Precision,Average Recall,Average F1 Score
0,Naïve Bayes,"{'C': None, 'depth': None, 'gamma': None, 'ite...",0.503158,0.492632,0.772632,0.563098
1,SVM,"{'C': 1.0, 'depth': None, 'gamma': 0.01, 'iter...",0.868421,0.792547,1.0,0.884027
2,Logistic Regression,"{'C': 0.1, 'depth': None, 'gamma': None, 'iter...",0.703158,0.754167,0.610526,0.672243
3,Random Forest,"{'C': None, 'depth': None, 'gamma': None, 'ite...",0.865263,0.917524,0.812632,0.859284
4,CatBoost,"{'C': None, 'depth': 10.0, 'gamma': None, 'ite...",0.876842,0.934115,0.821053,0.870702


In [4]:
res2 = perf_ds.drop(columns=["Best Parameters"])

In [5]:
res2 = res2.set_index(res2.columns[0]).mul(100)
res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,50.315789,49.263223,77.263158,56.309845
SVM,86.842105,79.254727,100.0,88.402692
Logistic Regression,70.315789,75.416657,61.052632,67.224262
Random Forest,86.526316,91.752445,81.263158,85.928446
CatBoost,87.684211,93.411537,82.105263,87.070192


In [6]:
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,50.3%,49.3%,77.3%,56.3%
SVM,86.8%,79.3%,100.0%,88.4%
Logistic Regression,70.3%,75.4%,61.1%,67.2%
Random Forest,86.5%,91.8%,81.3%,85.9%
CatBoost,87.7%,93.4%,82.1%,87.1%


In [7]:
best_estimators

{'Naïve Bayes': GaussianNB(),
 'SVM': SVC(C=1, gamma=0.01, random_state=42),
 'Logistic Regression': LogisticRegression(C=0.1, random_state=42),
 'Random Forest': RandomForestClassifier(max_depth=10, n_estimators=200, random_state=42),
 'CatBoost': <catboost.core.CatBoostClassifier at 0x17ffbfa90>}

In [8]:
%%time
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

result = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": [],
}

for model_name in best_estimators:
    model = best_estimators[model_name]
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)
    
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_measure = f1_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F-measure:", f_measure)

    result["Accuracy"].append(accuracy)
    result["Precision"].append(precision)
    result["Recall"].append(recall)
    result["F1 Score"].append(f_measure)

result

Model: Naïve Bayes
Accuracy: 0.6585365853658537
Precision: 0.660377358490566
Recall: 0.9813084112149533
F-measure: 0.7894736842105263
Model: SVM
Accuracy: 0.6524390243902439
Precision: 0.6524390243902439
Recall: 1.0
F-measure: 0.7896678966789668
Model: Logistic Regression
Accuracy: 0.6646341463414634
Precision: 0.8170731707317073
Recall: 0.6261682242990654
F-measure: 0.708994708994709
Model: Random Forest
Accuracy: 0.7378048780487805
Precision: 0.753968253968254
Recall: 0.8878504672897196
F-measure: 0.8154506437768241
Model: CatBoost
Accuracy: 0.7378048780487805
Precision: 0.753968253968254
Recall: 0.8878504672897196
F-measure: 0.8154506437768241
CPU times: user 23.8 ms, sys: 1.64 ms, total: 25.4 ms
Wall time: 24.4 ms


{'Model': ['Naïve Bayes',
  'SVM',
  'Logistic Regression',
  'Random Forest',
  'CatBoost'],
 'Accuracy': [0.6585365853658537,
  0.6524390243902439,
  0.6646341463414634,
  0.7378048780487805,
  0.7378048780487805],
 'Precision': [0.660377358490566,
  0.6524390243902439,
  0.8170731707317073,
  0.753968253968254,
  0.753968253968254],
 'Recall': [0.9813084112149533,
  1.0,
  0.6261682242990654,
  0.8878504672897196,
  0.8878504672897196],
 'F1 Score': [0.7894736842105263,
  0.7896678966789668,
  0.708994708994709,
  0.8154506437768241,
  0.8154506437768241]}

In [9]:
import numpy as np

X, y = X_train.to_numpy().astype(np.float32), y_train.to_numpy().astype(np.int64)
X_test, y_test = X_test.to_numpy().astype(np.float32), y_test.to_numpy().astype(
    np.int64
)
X.shape, y.shape, y.mean()

((652, 5), (652,), 0.7285276073619632)

In [10]:
import torch
from skorch import NeuralNetClassifier
from neural_net import ClassifierModule

device = (
    torch.device("cuda")
    if torch.cuda.is_available()
    else (
        torch.device("mps")
        if torch.backends.mps.is_available()
        else torch.device("cpu")
    )
)

device

device(type='mps')

In [11]:
# change this to True to force re-training the model
force_retrain = False

In [12]:
import os
import shutil
from pathlib import Path

checkpoints_dir = "./checkpoints/02_sentiment_data_training"
path = Path(checkpoints_dir)

training_needed = force_retrain

if path.exists():
    print(f"checkpoints_dir: {checkpoints_dir} exists")
    if force_retrain:
        for root, dirs, files in os.walk(checkpoints_dir):
            for file in files:
                checkpoint = f"{root}/{file}"
                print(f"deleting file: {checkpoint}")
                os.unlink(checkpoint)
            for dir in dirs:
                checkpoint = f"{root}/{dir}"
                print(f"deleting dir: {checkpoint}")
                shutil.rmtree(checkpoint)
else:
    print(f"checkpoints_dir: {checkpoints_dir} doesn't exist. creating it ...")
    path.parent.mkdir(parents=True, exist_ok=True)
    training_needed = True

training_needed

checkpoints_dir: ./checkpoints/02_sentiment_data_training doesn't exist. creating it ...


True

In [13]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)

In [14]:
from skorch.callbacks import Checkpoint, TrainEndCheckpoint
from skorch import NeuralNetClassifier

checkpoint = Checkpoint(dirname=f"{checkpoints_dir}", load_best=True)

optimal = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    lr=0.001,
    batch_size=20,
    module__depth=6,
    module__num_features=X.shape[1],
    module__num_units=66,
    module__dropout=0.4,
    device=device,
    callbacks=[checkpoint],
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

In [15]:
%%time

if training_needed:
    _ = optimal.fit(X[:650], y[:650])

  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.7473[0m       [32m0.7231[0m        [35m0.6859[0m     +  0.7298
      2        [36m0.7243[0m       0.6462        [35m0.6686[0m     +  0.1778
      3        [36m0.6987[0m       [32m0.7462[0m        [35m0.6632[0m     +  0.1835
      4        [36m0.6910[0m       0.7462        0.6672        0.1794
      5        0.6967       0.7462        0.6728        0.1867
      6        [36m0.6902[0m       0.7154        [35m0.6629[0m     +  0.1747
      7        0.6914       0.7462        [35m0.6532[0m     +  0.1750
      8        [36m0.6696[0m       0.6308        [35m0.6509[0m     +  0.1804
      9        0.6840       0.6538        0.6571        0.1747
     10        0.6716       0.6462        0.6612        0.1842
CPU times: user 1.87 s, sys: 276 ms, total: 2.15 s
Wall time: 3.24 s


In [16]:
%%time

if training_needed:
    y_pred_optimal = optimal.predict(X_test)

CPU times: user 30 ms, sys: 6.1 ms, total: 36.1 ms
Wall time: 43.7 ms


In [17]:
net = NeuralNetClassifier(
    ClassifierModule,
    module__depth=6,
    module__num_features=X.shape[1],
    module__num_units=66,
    module__dropout=0.4,
    device=device,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)
net.initialize()
net.load_params(checkpoint=checkpoint)

In [18]:
%%time

y_pred = net.predict(X_test)
(y_pred == y_pred_optimal).all() if training_needed else True

CPU times: user 37.1 ms, sys: 7.85 ms, total: 45 ms
Wall time: 52.6 ms


True

In [19]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

model_name = "Neural Network"
result["Model"].append(model_name)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f_measure = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", f_measure)

result["Accuracy"].append(accuracy)
result["Precision"].append(precision)
result["Recall"].append(recall)
result["F1 Score"].append(f_measure)

Accuracy: 0.7439024390243902
Precision: 0.76
Recall: 0.8878504672897196
F-measure: 0.8189655172413793


In [20]:
from datasets import Dataset

perf_ds = Dataset.from_dict(result).to_pandas()
perf_ds

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Naïve Bayes,0.658537,0.660377,0.981308,0.789474
1,SVM,0.652439,0.652439,1.0,0.789668
2,Logistic Regression,0.664634,0.817073,0.626168,0.708995
3,Random Forest,0.737805,0.753968,0.88785,0.815451
4,CatBoost,0.737805,0.753968,0.88785,0.815451
5,Neural Network,0.743902,0.76,0.88785,0.818966


In [21]:
res2 = perf_ds.set_index(perf_ds.columns[0]).mul(100)
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,65.9%,66.0%,98.1%,78.9%
SVM,65.2%,65.2%,100.0%,79.0%
Logistic Regression,66.5%,81.7%,62.6%,70.9%
Random Forest,73.8%,75.4%,88.8%,81.5%
CatBoost,73.8%,75.4%,88.8%,81.5%
Neural Network,74.4%,76.0%,88.8%,81.9%
