In [19]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
# Read the data into a dataframe
df = pd.read_csv("cleaned_sentiment_data.csv")

# Separate the features and target variable
X = df[
    [
        "total_positive_direct_mentions",
        "total_negative_direct_mentions",
        "total_positive_indirect_mentions",
        "total_negative_indirect_mentions",
        "soft_cap",
    ]
]
y = df["ico_success"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Perform Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Define the parameter grids for grid search
catboost_param_grid = {
        'iterations': [ 1000],
        'learning_rate': [0.03 ],
        'depth': [ 10]
    },


# Define the models
models = {
    'CatBoost': (CatBoostClassifier(), catboost_param_grid),
 
}
result = {
    "Model": [],
    "Best Parameters": [],
    "Average Accuracy": [],
    "Average Precision": [],
    "Average Recall": [],
    "Average F1 Score": [],
}

best_estimators = {}

# Perform grid search and cross-validation for each model
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(
        estimator=model, param_grid=param_grid, cv=5, scoring="accuracy"
    )
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()

    result["Best Parameters"].append(grid_search.best_params_)

    best_estimators[model_name] = grid_search.best_estimator_

Model: CatBoost
0:	learn: 0.6828392	total: 37.7ms	remaining: 37.7s
1:	learn: 0.6689594	total: 74ms	remaining: 36.9s
2:	learn: 0.6556197	total: 122ms	remaining: 40.7s
3:	learn: 0.6457522	total: 157ms	remaining: 39.2s
4:	learn: 0.6353965	total: 198ms	remaining: 39.3s
5:	learn: 0.6266612	total: 234ms	remaining: 38.8s
6:	learn: 0.6181884	total: 270ms	remaining: 38.3s
7:	learn: 0.6084908	total: 305ms	remaining: 37.9s
8:	learn: 0.6001028	total: 346ms	remaining: 38.2s
9:	learn: 0.5914674	total: 382ms	remaining: 37.9s
10:	learn: 0.5836414	total: 414ms	remaining: 37.3s
11:	learn: 0.5753740	total: 451ms	remaining: 37.1s
12:	learn: 0.5684700	total: 486ms	remaining: 36.9s
13:	learn: 0.5612990	total: 523ms	remaining: 36.8s
14:	learn: 0.5545249	total: 558ms	remaining: 36.6s
15:	learn: 0.5499557	total: 592ms	remaining: 36.4s
16:	learn: 0.5427085	total: 657ms	remaining: 38s
17:	learn: 0.5339612	total: 702ms	remaining: 38.3s
18:	learn: 0.5300092	total: 737ms	remaining: 38s
19:	learn: 0.5231141	total: 7

In [17]:
best_estimators

{'CatBoost': <catboost.core.CatBoostClassifier at 0x2b482c2bd60>}

In [20]:
%%time
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

result = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": [],
}

for model_name in best_estimators:
    model = best_estimators[model_name]
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)
    
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_measure = f1_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F-measure:", f_measure)

    result["Accuracy"].append(accuracy)
    result["Precision"].append(precision)
    result["Recall"].append(recall)
    result["F1 Score"].append(f_measure)

result

Model: CatBoost
Accuracy: 0.7378048780487805
Precision: 0.753968253968254
Recall: 0.8878504672897196
F-measure: 0.8154506437768241
Wall time: 7.25 ms


{'Model': ['CatBoost'],
 'Accuracy': [0.7378048780487805],
 'Precision': [0.753968253968254],
 'Recall': [0.8878504672897196],
 'F1 Score': [0.8154506437768241]}