In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler

# Read the data into a dataframe
df = pd.read_csv('cleaned_engagement_data.csv')

# Separate the features and target variable
X = df[['total_direct_mentions', 
        'total_indirect_mentions', 
        'total_likes', 
        'total_retweets', 
        'total_project_followers', 
        'total_indirect_followers', 
        'soft_cap']]
y = df['ico_success']

# Perform Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Define the parameter grids for grid search
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5, 10]
}

lr_param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

nb_param_grid = {}

svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
    'kernel': ['rbf']
}

# Define the models
models = {
    'Naïve Bayes': (GaussianNB(), nb_param_grid),
    'SVM': (SVC(random_state=42), svm_param_grid),
    'Logistic Regression': (LogisticRegression(random_state=42), lr_param_grid),
    'Random Forest': (RandomForestClassifier(random_state=42), rf_param_grid),
}

result = {
    "Model": [],
    "Best Parameters": [],
    "Average Accuracy": [],
    "Average Precision": [],
    "Average Recall": [],
    "Average F1 Score": [],
}

# Perform grid search and cross-validation for each model
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()

    result["Best Parameters"].append(grid_search.best_params_)

    # Perform 5-fold cross-validation with the best model
    cv_results = cross_validate(grid_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])
    print("Cross-Validation Results:")
    print("=" * 50)
    overfitted_folds = 0  # Counter for overfitted folds
    for fold_idx, fold_result in enumerate(cv_results['test_accuracy']):
        if fold_result == 1.0:  # Check for overfitted fold
            overfitted_folds += 1
            continue  # Skip overfitted fold

        print(f"Fold {fold_idx+1}:")
        print(f"Accuracy: {fold_result}")
        print(f"Precision: {cv_results['test_precision'][fold_idx]}")
        print(f"Recall: {cv_results['test_recall'][fold_idx]}")
        print(f"F1 Score: {cv_results['test_f1'][fold_idx]}")
        print()

    # Calculate average results across non-overfitted folds
    num_folds = len(cv_results['test_accuracy'])
    num_non_overfitted_folds = num_folds - overfitted_folds
    avg_accuracy = (sum(cv_results['test_accuracy']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_precision = (sum(cv_results['test_precision']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_recall = (sum(cv_results['test_recall']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_f1 = (sum(cv_results['test_f1']) - (overfitted_folds*1)) / num_non_overfitted_folds

    # Print the average results
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1 Score: {avg_f1}")
    print()

    result["Average Accuracy"].append(avg_accuracy)
    result["Average Precision"].append(avg_precision)
    result["Average Recall"].append(avg_recall)
    result["Average F1 Score"].append(avg_f1)

result

Model: Naïve Bayes
Best Parameters:  {}

Cross-Validation Results:
Fold 1:
Accuracy: 0.5278969957081545
Precision: 0.5636363636363636
Recall: 0.26495726495726496
F1 Score: 0.36046511627906974

Fold 2:
Accuracy: 0.51931330472103
Precision: 0.5111111111111111
Recall: 0.9829059829059829
F1 Score: 0.672514619883041

Fold 3:
Accuracy: 0.51931330472103
Precision: 0.5092592592592593
Recall: 0.9482758620689655
F1 Score: 0.6626506024096386

Fold 4:
Accuracy: 0.48497854077253216
Precision: 0.4898989898989899
Recall: 0.8362068965517241
F1 Score: 0.6178343949044586

Fold 5:
Accuracy: 0.5129310344827587
Precision: 0.5066666666666667
Recall: 0.9827586206896551
F1 Score: 0.6686217008797654

Average Accuracy: 0.512886636081101
Average Precision: 0.5161144781144781
Average Recall: 0.8030209254347185
Average F1 Score: 0.5964172868711947

Model: SVM
Best Parameters:  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8497854077253219
Precision: 0.7697368421052632
Reca

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters:  {'C': 10, 'penalty': 'l2'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.648068669527897
Precision: 0.6422764227642277
Recall: 0.6752136752136753
F1 Score: 0.6583333333333333

Fold 2:
Accuracy: 0.7553648068669528
Precision: 0.7941176470588235
Recall: 0.6923076923076923
F1 Score: 0.7397260273972602

Fold 3:
Accuracy: 0.7510729613733905
Precision: 0.8085106382978723
Recall: 0.6551724137931034
F1 Score: 0.7238095238095238

Fold 4:
Accuracy: 0.5107296137339056
Precision: 0.5142857142857142
Recall: 0.3103448275862069
F1 Score: 0.3870967741935484

Fold 5:
Accuracy: 0.5991379310344828
Precision: 0.6493506493506493
Recall: 0.43103448275862066
F1 Score: 0.5181347150259067

Average Accuracy: 0.6528747965073257
Average Precision: 0.6817082143514575
Average Recall: 0.5528146183318597
Average F1 Score: 0.6054200747519145

Model: Random Forest
Best Parameters:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}

Cross-Validation Results:
Fold 1:
Accuracy: 0.89270386266

{'Model': ['Naïve Bayes', 'SVM', 'Logistic Regression', 'Random Forest'],
 'Best Parameters': [{},
  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'},
  {'C': 10, 'penalty': 'l2'},
  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}],
 'Average Accuracy': [0.512886636081101,
  0.866952789699571,
  0.6528747965073257,
  0.869402101524345],
 'Average Precision': [0.5161144781144781,
  0.7911184210526319,
  0.6817082143514575,
  0.9143105475706246],
 'Average Recall': [0.8030209254347185,
  1.0,
  0.5528146183318597,
  0.8226495726495726],
 'Average F1 Score': [0.5964172868711947,
  0.8832200999871809,
  0.6054200747519145,
  0.8577030029428251]}

In [2]:
from datasets import Dataset

perf_ds = Dataset.from_dict(result).to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
perf_ds

Unnamed: 0,Model,Best Parameters,Average Accuracy,Average Precision,Average Recall,Average F1 Score
0,Naïve Bayes,"{'C': None, 'gamma': None, 'kernel': None, 'ma...",0.512887,0.516114,0.803021,0.596417
1,SVM,"{'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf', 'ma...",0.866953,0.791118,1.0,0.88322
2,Logistic Regression,"{'C': 10.0, 'gamma': None, 'kernel': None, 'ma...",0.652875,0.681708,0.552815,0.60542
3,Random Forest,"{'C': None, 'gamma': None, 'kernel': None, 'ma...",0.869402,0.914311,0.82265,0.857703


In [4]:
res2 = perf_ds.drop(columns=["Best Parameters"])

In [5]:
res2 = res2.set_index(res2.columns[0]).mul(100)
res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,51.288664,51.611448,80.302093,59.641729
SVM,86.695279,79.111842,100.0,88.32201
Logistic Regression,65.28748,68.170821,55.281462,60.542007
Random Forest,86.94021,91.431055,82.264957,85.7703


In [6]:
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,51.3%,51.6%,80.3%,59.6%
SVM,86.7%,79.1%,100.0%,88.3%
Logistic Regression,65.3%,68.2%,55.3%,60.5%
Random Forest,86.9%,91.4%,82.3%,85.8%
