In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler

# Read the data into a dataframe
df = pd.read_csv('cleaned_sentiment_data.csv')

# Separate the features and target variable
X = df[['total_positive_direct_mentions',
        'total_negative_direct_mentions', 
        'total_positive_indirect_mentions',
        'total_negative_indirect_mentions', 
        'soft_cap']]
y = df['ico_success']

# Perform Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Define the parameter grids for grid search
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5, 10]
}

lr_param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

nb_param_grid = {}

svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
    'kernel': ['rbf']
}

# Define the models
models = {
    'Random Forest': (RandomForestClassifier(random_state=42), rf_param_grid),
    'Logistic Regression': (LogisticRegression(random_state=42), lr_param_grid),
    'Naïve Bayes': (GaussianNB(), nb_param_grid),
    'SVM': (SVC(random_state=42), svm_param_grid)
}

# Perform grid search and cross-validation for each model
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()

    # Perform 5-fold cross-validation with the best model
    cv_results = cross_validate(grid_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])
    print("Cross-Validation Results:")
    print("=" * 50)
    overfitted_folds = 0  # Counter for overfitted folds
    for fold_idx, fold_result in enumerate(cv_results['test_accuracy']):
        if fold_result == 1.0:  # Check for overfitted fold
            overfitted_folds += 1
            continue  # Skip overfitted fold

        print(f"Fold {fold_idx+1}:")
        print(f"Accuracy: {fold_result}")
        print(f"Precision: {cv_results['test_precision'][fold_idx]}")
        print(f"Recall: {cv_results['test_recall'][fold_idx]}")
        print(f"F1 Score: {cv_results['test_f1'][fold_idx]}")
        print()

    # Calculate average results across non-overfitted folds
    num_folds = len(cv_results['test_accuracy'])
    num_non_overfitted_folds = num_folds - overfitted_folds
    avg_accuracy = (sum(cv_results['test_accuracy']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_precision = (sum(cv_results['test_precision']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_recall = (sum(cv_results['test_recall']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_f1 = (sum(cv_results['test_f1']) - (overfitted_folds*1)) / num_non_overfitted_folds

    # Print the average results
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1 Score: {avg_f1}")
    print()

Model: Random Forest
Best Parameters:  {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8369098712446352
Precision: 0.8015267175572519
Recall: 0.8974358974358975
F1 Score: 0.8467741935483871

Fold 2:
Accuracy: 0.8369098712446352
Precision: 0.8319327731092437
Recall: 0.8461538461538461
F1 Score: 0.8389830508474577

Fold 3:
Accuracy: 0.9399141630901288
Precision: 0.9903846153846154
Recall: 0.8879310344827587
F1 Score: 0.9363636363636364

Fold 4:
Accuracy: 0.7639484978540773
Precision: 0.9295774647887324
Recall: 0.5689655172413793
F1 Score: 0.7058823529411765

Fold 5:
Accuracy: 0.8577586206896551
Precision: 0.9882352941176471
Recall: 0.7241379310344828
F1 Score: 0.8358208955223881

Average Accuracy: 0.8470882048246263
Average Precision: 0.908331372991498
Average Recall: 0.7849248452696729
Average F1 Score: 0.8327648258446091

Model: Logistic Regression


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/home/inflaton/miniconda3/envs/cs701/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/inflaton/miniconda3/envs/cs701/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/inflaton/miniconda3/envs/cs701/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalt

Best Parameters:  {'C': 0.1, 'penalty': 'l2'}

Cross-Validation Results:
Fold 1:
Accuracy: 0.6394849785407726
Precision: 0.6633663366336634
Recall: 0.5726495726495726
F1 Score: 0.6146788990825689

Fold 2:
Accuracy: 0.7467811158798283
Precision: 0.7843137254901961
Recall: 0.6837606837606838
F1 Score: 0.7305936073059361

Fold 3:
Accuracy: 0.703862660944206
Precision: 0.7582417582417582
Recall: 0.5948275862068966
F1 Score: 0.6666666666666666

Fold 4:
Accuracy: 0.5708154506437768
Precision: 0.6739130434782609
Recall: 0.2672413793103448
F1 Score: 0.38271604938271603

Fold 5:
Accuracy: 0.646551724137931
Precision: 0.7428571428571429
Recall: 0.4482758620689655
F1 Score: 0.5591397849462366

Average Accuracy: 0.661499186029303
Average Precision: 0.7245384013402043
Average Recall: 0.5133510167992927
Average F1 Score: 0.5907590014768248

Model: Naïve Bayes
Best Parameters:  {}

Cross-Validation Results:
Fold 1:
Accuracy: 0.51931330472103
Precision: 0.5109170305676856
Recall: 1.0
F1 Score: 0.67630

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier


models = {
    'XGBboost': XGBClassifier(random_state=42, learning_rate=0.03, max_depth=3, n_estimators=300, reg_lambda=2),
    'CatBoost': CatBoostClassifier(random_state=42, iterations=200, depth=4, loss_function='Logloss',
                                   l2_leaf_reg=1e-20, leaf_estimation_iterations=10, logging_level='Silent',
                                   learning_rate=0.03),
    'Random Forest': RandomForestClassifier(random_state=42),
}

# Define the parameter grids for grid search
param_grids = {
    'XGBboost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.03, 0.1],
        'max_depth': [3, 5, 10],
        'reg_lambda': [1, 2, 5]
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.03, 0.1],
        'depth': [3, 5, 10]
    },
    'Random Forest': {},
}
    
# Perform grid search for each model
for model_name, model in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)

    # Get the parameter grid for the current model
    param_grid = param_grids[model_name]

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()
    
    # Perform 5-fold cross-validation with the best model
    cv_results = cross_validate(grid_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])
    print("Cross-Validation Results:")
    print("=" * 50)
    overfitted_folds = 0  # Counter for overfitted folds
    for fold_idx, fold_result in enumerate(cv_results['test_accuracy']):
        if fold_result == 1.0:  # Check for overfitted fold
            overfitted_folds += 1
            continue  # Skip overfitted fold

        print(f"Fold {fold_idx+1}:")
        print(f"Accuracy: {fold_result}")
        print(f"Precision: {cv_results['test_precision'][fold_idx]}")
        print(f"Recall: {cv_results['test_recall'][fold_idx]}")
        print(f"F1 Score: {cv_results['test_f1'][fold_idx]}")
        print()

    # Calculate average results across non-overfitted folds
    num_folds = len(cv_results['test_accuracy'])
    num_non_overfitted_folds = num_folds - overfitted_folds
    avg_accuracy = (sum(cv_results['test_accuracy']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_precision = (sum(cv_results['test_precision']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_recall = (sum(cv_results['test_recall']) - (overfitted_folds*1)) / num_non_overfitted_folds
    avg_f1 = (sum(cv_results['test_f1']) - (overfitted_folds*1)) / num_non_overfitted_folds

    # Print the average results
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1 Score: {avg_f1}")
    print()


Model: XGBboost
Best Parameters:  {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'reg_lambda': 5}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8283261802575107
Precision: 0.808
Recall: 0.8632478632478633
F1 Score: 0.8347107438016529

Fold 2:
Accuracy: 0.8583690987124464
Precision: 0.868421052631579
Recall: 0.8461538461538461
F1 Score: 0.8571428571428572

Fold 3:
Accuracy: 0.927038626609442
Precision: 1.0
Recall: 0.853448275862069
F1 Score: 0.9209302325581395

Fold 4:
Accuracy: 0.8240343347639485
Precision: 1.0
Recall: 0.646551724137931
F1 Score: 0.7853403141361257

Fold 5:
Accuracy: 0.8577586206896551
Precision: 1.0
Recall: 0.7155172413793104
F1 Score: 0.8341708542713567

Average Accuracy: 0.8591053722066005
Average Precision: 0.9352842105263157
Average Recall: 0.784983790156204
Average F1 Score: 0.8464590003820265

Model: CatBoost
Best Parameters:  {'depth': 10, 'iterations': 200, 'learning_rate': 0.1}

Cross-Validation Results:
Fold 1:
Accuracy: 0.8454935622317596
Pre