In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler

# Read the data into a dataframe
df = pd.read_csv('cleaned_sentiment_data.csv')

# Separate the features and target variable
X = df[['total_positive_direct_mentions',
        'total_negative_direct_mentions', 
        'total_positive_indirect_mentions',
        'total_negative_indirect_mentions', 
        'soft_cap']]
y = df['ico_success']

# Perform Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Define the parameter grids for grid search
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5, 10]
}

lr_param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

nb_param_grid = {}

svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
    'kernel': ['rbf']
}

# Define the models
models = {
    "Naïve Bayes": (GaussianNB(), nb_param_grid),
    "SVM": (SVC(random_state=42), svm_param_grid),
    "Logistic Regression": (LogisticRegression(random_state=42), lr_param_grid),
    "Random Forest": (RandomForestClassifier(random_state=42), rf_param_grid),
}

result = {
    "Model": [],
    "Best Parameters": [],
    "Average Accuracy": [],
    "Average Precision": [],
    "Average Recall": [],
    "Average F1 Score": [],
}

# Perform grid search and cross-validation for each model
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()

    result["Best Parameters"].append(grid_search.best_params_)

    # Perform 5-fold cross-validation with the best model
    cv_results = cross_validate(grid_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])
    print("Cross-Validation Results:")
    print("=" * 50)
    overfitted_folds = 0  # Counter for overfitted folds
    for fold_idx, fold_result in enumerate(cv_results['test_accuracy']):
        if fold_result == 1.0:  # Check for overfitted fold
            overfitted_folds += 1
            continue  # Skip overfitted fold

        print(f"Fold {fold_idx+1}:")
        print(f"Accuracy: {fold_result}")
        print(f"Precision: {cv_results['test_precision'][fold_idx]}")
        print(f"Recall: {cv_results['test_recall'][fold_idx]}")
        print(f"F1 Score: {cv_results['test_f1'][fold_idx]}")
        print()

    # Calculate average results across non-overfitted folds
    num_folds = len(cv_results['test_accuracy'])
    num_non_overfitted_folds = num_folds - overfitted_folds
    avg_accuracy = (sum(cv_results['test_accuracy']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_precision = (sum(cv_results['test_precision']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_recall = (sum(cv_results['test_recall']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_f1 = (sum(cv_results['test_f1']) - (overfitted_folds*1)) / num_non_overfitted_folds

    # Print the average results
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1 Score: {avg_f1}")
    print()

    result["Average Accuracy"].append(avg_accuracy)
    result["Average Precision"].append(avg_precision)
    result["Average Recall"].append(avg_recall)
    result["Average F1 Score"].append(avg_f1)

result

Model: Naïve Bayes
Best Parameters:  {}

Cross-Validation Results:
Fold 1:
Accuracy: 0.51931330472103
Precision: 0.5109170305676856
Recall: 1.0
F1 Score: 0.6763005780346821

Fold 2:
Accuracy: 0.5278969957081545
Precision: 0.515695067264574
Recall: 0.9829059829059829
F1 Score: 0.6764705882352942

Fold 3:
Accuracy: 0.5236051502145923
Precision: 0.511520737327189
Recall: 0.9568965517241379
F1 Score: 0.6666666666666666

Fold 4:
Accuracy: 0.4978540772532189
Precision: 0.4975369458128079
Recall: 0.8706896551724138
F1 Score: 0.6332288401253918

Fold 5:
Accuracy: 0.5258620689655172
Precision: 0.5135135135135135
Recall: 0.9827586206896551
F1 Score: 0.6745562130177515

Average Accuracy: 0.5189063193725025
Average Precision: 0.509836658897154
Average Recall: 0.9586501620984379
Average F1 Score: 0.6654445772159572

Model: SVM
Best Parameters:  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8497854077253219
Precision: 0.7697368421052632
Recall: 1.0
F1 Score:

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters:  {'C': 0.1, 'penalty': 'l2'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.6695278969957081
Precision: 0.6818181818181818
Recall: 0.6410256410256411
F1 Score: 0.6607929515418502

Fold 2:
Accuracy: 0.7296137339055794
Precision: 0.7934782608695652
Recall: 0.6239316239316239
F1 Score: 0.6985645933014354

Fold 3:
Accuracy: 0.7553648068669528
Precision: 0.7920792079207921
Recall: 0.6896551724137931
F1 Score: 0.7373271889400922

Fold 4:
Accuracy: 0.5708154506437768
Precision: 0.6428571428571429
Recall: 0.3103448275862069
F1 Score: 0.4186046511627907

Fold 5:
Accuracy: 0.6724137931034483
Precision: 0.7631578947368421
Recall: 0.5
F1 Score: 0.6041666666666666

Average Accuracy: 0.679547136303093
Average Precision: 0.7346781376405047
Average Recall: 0.552991452991453
Average F1 Score: 0.623891210322567

Model: Random Forest
Best Parameters:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8369098712446352
Precision: 

{'Model': ['Naïve Bayes', 'SVM', 'Logistic Regression', 'Random Forest'],
 'Best Parameters': [{},
  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'},
  {'C': 0.1, 'penalty': 'l2'},
  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}],
 'Average Accuracy': [0.5189063193725025,
  0.866952789699571,
  0.679547136303093,
  0.8470882048246263],
 'Average Precision': [0.509836658897154,
  0.7911184210526319,
  0.7346781376405047,
  0.908331372991498],
 'Average Recall': [0.9586501620984379,
  1.0,
  0.552991452991453,
  0.7849248452696729],
 'Average F1 Score': [0.6654445772159572,
  0.8832200999871809,
  0.623891210322567,
  0.8327648258446091]}

In [2]:
from datasets import Dataset

perf_ds = Dataset.from_dict(result).to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
perf_ds

Unnamed: 0,Model,Best Parameters,Average Accuracy,Average Precision,Average Recall,Average F1 Score
0,Naïve Bayes,"{'C': None, 'gamma': None, 'kernel': None, 'ma...",0.518906,0.509837,0.95865,0.665445
1,SVM,"{'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf', 'ma...",0.866953,0.791118,1.0,0.88322
2,Logistic Regression,"{'C': 0.1, 'gamma': None, 'kernel': None, 'max...",0.679547,0.734678,0.552991,0.623891
3,Random Forest,"{'C': None, 'gamma': None, 'kernel': None, 'ma...",0.847088,0.908331,0.784925,0.832765


In [4]:
res2 = perf_ds.drop(columns=["Best Parameters"])

In [5]:
res2 = res2.set_index(res2.columns[0]).mul(100)
res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,51.890632,50.983666,95.865016,66.544458
SVM,86.695279,79.111842,100.0,88.32201
Logistic Regression,67.954714,73.467814,55.299145,62.389121
Random Forest,84.70882,90.833137,78.492485,83.276483


In [6]:
for key in res2.select_dtypes(include=["number"]).columns:
    res2[key] = res2[key].apply("{:.1f}%".format)

res2

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1 Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Naïve Bayes,51.9%,51.0%,95.9%,66.5%
SVM,86.7%,79.1%,100.0%,88.3%
Logistic Regression,68.0%,73.5%,55.3%,62.4%
Random Forest,84.7%,90.8%,78.5%,83.3%
