In [3]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler

# Read the data into a dataframe
df = pd.read_csv('cleaned_engagement_data.csv')

# Separate the features and target variable
X = df[['total_direct_mentions', 
        'total_indirect_mentions', 
        'total_likes', 
        'total_retweets', 
        'total_project_followers', 
        'total_indirect_followers', 
        'soft_cap']]
y = df['ico_success']

# Perform Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Define the parameter grids for grid search
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5, 10]
}

lr_param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

nb_param_grid = {}

svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
    'kernel': ['rbf']
}

# Define the models
models = {
    'Random Forest': (RandomForestClassifier(random_state=42), rf_param_grid),
    'Logistic Regression': (LogisticRegression(random_state=42), lr_param_grid),
    'Naïve Bayes': (GaussianNB(), nb_param_grid),
    'SVM': (SVC(random_state=42), svm_param_grid)
}

# Perform grid search and cross-validation for each model
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()

    # Perform 5-fold cross-validation with the best model
    cv_results = cross_validate(grid_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])
    print("Cross-Validation Results:")
    print("=" * 50)
    overfitted_folds = 0  # Counter for overfitted folds
    for fold_idx, fold_result in enumerate(cv_results['test_accuracy']):
        if fold_result == 1.0:  # Check for overfitted fold
            overfitted_folds += 1
            continue  # Skip overfitted fold

        print(f"Fold {fold_idx+1}:")
        print(f"Accuracy: {fold_result}")
        print(f"Precision: {cv_results['test_precision'][fold_idx]}")
        print(f"Recall: {cv_results['test_recall'][fold_idx]}")
        print(f"F1 Score: {cv_results['test_f1'][fold_idx]}")
        print()

    # Calculate average results across non-overfitted folds
    num_folds = len(cv_results['test_accuracy'])
    num_non_overfitted_folds = num_folds - overfitted_folds
    avg_accuracy = (sum(cv_results['test_accuracy']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_precision = (sum(cv_results['test_precision']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_recall = (sum(cv_results['test_recall']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_f1 = (sum(cv_results['test_f1']) - (overfitted_folds*1)) / num_non_overfitted_folds

    # Print the average results
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1 Score: {avg_f1}")
    print()

Model: Random Forest
Best Parameters:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8927038626609443
Precision: 0.8432835820895522
Recall: 0.9658119658119658
F1 Score: 0.9003984063745021

Fold 2:
Accuracy: 0.8884120171673819
Precision: 0.8823529411764706
Recall: 0.8974358974358975
F1 Score: 0.8898305084745762

Fold 3:
Accuracy: 0.9313304721030042
Precision: 0.9464285714285714
Recall: 0.9137931034482759
F1 Score: 0.9298245614035087

Fold 4:
Accuracy: 0.7811158798283262
Precision: 0.9113924050632911
Recall: 0.6206896551724138
F1 Score: 0.7384615384615385

Fold 5:
Accuracy: 0.853448275862069
Precision: 0.9880952380952381
Recall: 0.7155172413793104
F1 Score: 0.83

Average Accuracy: 0.869402101524345
Average Precision: 0.9143105475706246
Average Recall: 0.8226495726495726
Average F1 Score: 0.8577030029428251

Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters:  {'C': 10, 'penalty': 'l2'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.6437768240343348
Precision: 0.6370967741935484
Recall: 0.6752136752136753
F1 Score: 0.6556016597510375

Fold 2:
Accuracy: 0.7553648068669528
Precision: 0.7941176470588235
Recall: 0.6923076923076923
F1 Score: 0.7397260273972601

Fold 3:
Accuracy: 0.7553648068669528
Precision: 0.8105263157894737
Recall: 0.6637931034482759
F1 Score: 0.7298578199052133

Fold 4:
Accuracy: 0.5364806866952789
Precision: 0.5606060606060606
Recall: 0.31896551724137934
F1 Score: 0.4065934065934066

Fold 5:
Accuracy: 0.5775862068965517
Precision: 0.618421052631579
Recall: 0.4051724137931034
F1 Score: 0.48958333333333337

Average Accuracy: 0.6537146662720141
Average Precision: 0.6841535700558969
Average Recall: 0.5510904804008252
Average F1 Score: 0.6042724493960502

Model: Naïve Bayes
Best Parameters:  {}

Cross-Validation Results:
Fold 1:
Accuracy: 0.5278969957081545
Precision: 0.5636363636363636
Recall: 0.264957264957264

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier

models = {
    'XGBboost': XGBClassifier(random_state=42, learning_rate=0.03, max_depth=3, n_estimators=300, reg_lambda=2),
    'CatBoost': CatBoostClassifier(random_state=42, iterations=200, depth=4, loss_function='Logloss',
                                   l2_leaf_reg=1e-20, leaf_estimation_iterations=10, logging_level='Silent',
                                   learning_rate=0.03),
    'Random Forest': RandomForestClassifier(random_state=42),
}

# Define the parameter grids for grid search
param_grids = {
    'XGBboost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.03, 0.1],
        'max_depth': [3, 5, 10],
        'reg_lambda': [1, 2, 5]
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.03, 0.1],
        'depth': [3, 5, 10]
    },
    'Random Forest': {},
}
    
# Perform grid search for each model
for model_name, model in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)

    # Get the parameter grid for the current model
    param_grid = param_grids[model_name]

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()
    
    # Perform 5-fold cross-validation with the best model
    cv_results = cross_validate(grid_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])
    print("Cross-Validation Results:")
    print("=" * 50)
    overfitted_folds = 0  # Counter for overfitted folds
    for fold_idx, fold_result in enumerate(cv_results['test_accuracy']):
        if fold_result == 1.0:  # Check for overfitted fold
            overfitted_folds += 1
            continue  # Skip overfitted fold

        print(f"Fold {fold_idx+1}:")
        print(f"Accuracy: {fold_result}")
        print(f"Precision: {cv_results['test_precision'][fold_idx]}")
        print(f"Recall: {cv_results['test_recall'][fold_idx]}")
        print(f"F1 Score: {cv_results['test_f1'][fold_idx]}")
        print()

    # Calculate average results across non-overfitted folds
    num_folds = len(cv_results['test_accuracy'])
    num_non_overfitted_folds = num_folds - overfitted_folds
    avg_accuracy = (sum(cv_results['test_accuracy']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_precision = (sum(cv_results['test_precision']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_recall = (sum(cv_results['test_recall']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_f1 = (sum(cv_results['test_f1']) - (overfitted_folds*1)) / num_non_overfitted_folds

    # Print the average results
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1 Score: {avg_f1}")
    print()


Model: XGBboost
Best Parameters:  {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'reg_lambda': 5}

Cross-Validation Results:
Fold 1:
Accuracy: 0.9141630901287554
Precision: 0.875968992248062
Recall: 0.9658119658119658
F1 Score: 0.9186991869918699

Fold 2:
Accuracy: 0.9012875536480687
Precision: 0.9051724137931034
Recall: 0.8974358974358975
F1 Score: 0.9012875536480686

Fold 3:
Accuracy: 0.944206008583691
Precision: 1.0
Recall: 0.8879310344827587
F1 Score: 0.9406392694063928

Fold 4:
Accuracy: 0.8283261802575107
Precision: 1.0
Recall: 0.6551724137931034
F1 Score: 0.7916666666666666

Fold 5:
Accuracy: 0.853448275862069
Precision: 1.0
Recall: 0.7068965517241379
F1 Score: 0.8282828282828283

Average Accuracy: 0.888286221696019
Average Precision: 0.956228281208233
Average Recall: 0.8226495726495726
Average F1 Score: 0.8761151009991652

Model: CatBoost
Best Parameters:  {'depth': 10, 'iterations': 300, 'learning_rate': 0.03}

Cross-Validation Results:
Fold 1:
Accuracy: 0.918454