In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler

# Read the data into a dataframe
df = pd.read_csv("cleaned_data.csv")

# Separate the features and target variable
X = df[
    [
        "total_direct_mentions",
        "total_indirect_mentions",
        "total_likes",
        "total_retweets",
        "total_project_followers",
        "total_indirect_followers",
        "total_positive_direct_mentions",
        "total_negative_direct_mentions",
        "total_positive_indirect_mentions",
        "total_negative_indirect_mentions",
        "soft_cap",
    ]
]
y = df["ico_success"]

# Perform Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Define the parameter grids for grid search
rf_param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5, 10],
}

lr_param_grid = {"C": [0.1, 1, 10], "penalty": ["l1", "l2"]}

nb_param_grid = {}

svm_param_grid = {
    "C": [0.1, 1, 10],
    "gamma": [1e-2, 1e-3, 1e-4, 1e-5],
    "kernel": ["rbf"],
}

# Define the models
models = {
    "Naïve Bayes": (GaussianNB(), nb_param_grid),
    "SVM": (SVC(random_state=42), svm_param_grid),
    "Logistic Regression": (LogisticRegression(random_state=42), lr_param_grid),
    "Random Forest": (RandomForestClassifier(random_state=42), rf_param_grid),
}

result = {
    "Model": [],
    "Best Parameters": [],
    "Average Accuracy": [],
    "Average Precision": [],
    "Average Recall": [],
    "Average F1 Score": [],
}

# Perform grid search and cross-validation for each model
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(
        estimator=model, param_grid=param_grid, cv=5, scoring="accuracy"
    )
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()

    result["Best Parameters"].append(grid_search.best_params_)

    # Perform 5-fold cross-validation with the best model
    cv_results = cross_validate(
        grid_search.best_estimator_,
        X_resampled,
        y_resampled,
        cv=5,
        scoring=["accuracy", "precision", "recall", "f1"],
    )
    print("Cross-Validation Results:")
    print("=" * 50)
    overfitted_folds = 0  # Counter for overfitted folds
    for fold_idx, fold_result in enumerate(cv_results["test_accuracy"]):
        if fold_result == 1.0:  # Check for overfitted fold
            overfitted_folds += 1
            continue  # Skip overfitted fold

        print(f"Fold {fold_idx+1}:")
        print(f"Accuracy: {fold_result}")
        print(f"Precision: {cv_results['test_precision'][fold_idx]}")
        print(f"Recall: {cv_results['test_recall'][fold_idx]}")
        print(f"F1 Score: {cv_results['test_f1'][fold_idx]}")
        print()

    # Calculate average results across non-overfitted folds
    num_folds = len(cv_results["test_accuracy"])
    num_non_overfitted_folds = num_folds - overfitted_folds
    avg_accuracy = (
        sum(cv_results["test_accuracy"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds
    avg_precision = (
        sum(cv_results["test_precision"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds
    avg_recall = (
        sum(cv_results["test_recall"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds
    avg_f1 = (
        sum(cv_results["test_f1"]) - (overfitted_folds * 1)
    ) / num_non_overfitted_folds

    # Print the average results
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1 Score: {avg_f1}")
    print()

    result["Average Accuracy"].append(avg_accuracy)
    result["Average Precision"].append(avg_precision)
    result["Average Recall"].append(avg_recall)
    result["Average F1 Score"].append(avg_f1)

result

Model: Naïve Bayes
Best Parameters:  {}

Cross-Validation Results:
Fold 1:
Accuracy: 0.5278969957081545
Precision: 0.5660377358490566
Recall: 0.2564102564102564
F1 Score: 0.35294117647058826

Fold 2:
Accuracy: 0.51931330472103
Precision: 0.5111111111111111
Recall: 0.9829059829059829
F1 Score: 0.672514619883041

Fold 3:
Accuracy: 0.5236051502145923
Precision: 0.5117370892018779
Recall: 0.9396551724137931
F1 Score: 0.662613981762918

Fold 4:
Accuracy: 0.47639484978540775
Precision: 0.485
Recall: 0.8362068965517241
F1 Score: 0.6139240506329114

Fold 5:
Accuracy: 0.5172413793103449
Precision: 0.5088495575221239
Recall: 0.9913793103448276
F1 Score: 0.672514619883041

Average Accuracy: 0.5128903359479058
Average Precision: 0.5165470987368339
Average Recall: 0.8013115237253169
Average F1 Score: 0.5949016897264999

Model: SVM
Best Parameters:  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8497854077253219
Precision: 0.7697368421052632
Recall: 1.0
F1 Sc

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters:  {'C': 0.1, 'penalty': 'l2'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.7124463519313304
Precision: 0.7551020408163265
Recall: 0.6324786324786325
F1 Score: 0.6883720930232559

Fold 2:
Accuracy: 0.7424892703862661
Precision: 0.7663551401869159
Recall: 0.7008547008547008
F1 Score: 0.7321428571428571

Fold 3:
Accuracy: 0.6995708154506438
Precision: 0.75
Recall: 0.5948275862068966
F1 Score: 0.6634615384615384

Fold 4:
Accuracy: 0.6223175965665236
Precision: 0.7592592592592593
Recall: 0.35344827586206895
F1 Score: 0.4823529411764706

Fold 5:
Accuracy: 0.6594827586206896
Precision: 0.7936507936507936
Recall: 0.43103448275862066
F1 Score: 0.5586592178770949

Average Accuracy: 0.6872613585910907
Average Precision: 0.7648734467826591
Average Recall: 0.5425287356321838
Average F1 Score: 0.6249977295362434

Model: Random Forest
Best Parameters:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8927038626609443
Preci

{'Model': ['Naïve Bayes', 'SVM', 'Logistic Regression', 'Random Forest'],
 'Best Parameters': [{},
  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'},
  {'C': 0.1, 'penalty': 'l2'},
  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}],
 'Average Accuracy': [0.5128903359479058,
  0.866952789699571,
  0.6872613585910907,
  0.8925928666568004],
 'Average Precision': [0.5165470987368339,
  0.7911184210526319,
  0.7648734467826591,
  0.9495197029509281],
 'Average Recall': [0.8013115237253169,
  1.0,
  0.5425287356321838,
  0.8365016209843796],
 'Average F1 Score': [0.5949016897264999,
  0.8832200999871809,
  0.6249977295362434,
  0.8832001634469773]}

In [2]:
from datasets import Dataset

perf_ds = Dataset.from_dict(result).to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
perf_ds

Unnamed: 0,Model,Best Parameters,Average Accuracy,Average Precision,Average Recall,Average F1 Score
0,Naïve Bayes,"{'C': None, 'gamma': None, 'kernel': None, 'ma...",0.51289,0.516547,0.801312,0.594902
1,SVM,"{'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf', 'ma...",0.866953,0.791118,1.0,0.88322
2,Logistic Regression,"{'C': 0.1, 'gamma': None, 'kernel': None, 'max...",0.687261,0.764873,0.542529,0.624998
3,Random Forest,"{'C': None, 'gamma': None, 'kernel': None, 'ma...",0.892593,0.94952,0.836502,0.8832


In [33]:
res2 = perf_ds.drop(columns=["Best Parameters"])

In [34]:
res2 = res2.set_index(res2.columns[0]).mul(100)
res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,51.289034,51.65471,80.131152,59.490169
SVM,86.695279,79.111842,100.0,88.32201
Logistic Regression,68.726136,76.487345,54.252874,62.499773
Random Forest,89.259287,94.95197,83.650162,88.320016


In [35]:
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,51.3%,51.7%,80.1%,59.5%
SVM,86.7%,79.1%,100.0%,88.3%
Logistic Regression,68.7%,76.5%,54.3%,62.5%
Random Forest,89.3%,95.0%,83.7%,88.3%
