In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# Read the data into a dataframe
df = pd.read_csv('cleaned_engagement_data.csv')

# Separate the features and target variable
X = df[['total_direct_mentions', 
        'total_indirect_mentions', 
        'total_likes', 
        'total_retweets', 
        'total_project_followers', 
        'total_indirect_followers', 
        'soft_cap']]
y = df['ico_success']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Perform Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Define the parameter grids for grid search
catboost_param_grid = {
        'iterations': [ 1000],
        'learning_rate': [0.03 ],
        'depth': [ 10]
    },


# Define the models
models = {
    'CatBoost': (CatBoostClassifier(), catboost_param_grid),
 
}
result = {
    "Model": [],
    "Best Parameters": [],
    "Average Accuracy": [],
    "Average Precision": [],
    "Average Recall": [],
    "Average F1 Score": [],
}

best_estimators = {}

# Perform grid search and cross-validation for each model
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)

    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(
        estimator=model, param_grid=param_grid, cv=5, scoring="accuracy"
    )
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters and the corresponding score
    print("Best Parameters: ", grid_search.best_params_)
    print()

    result["Best Parameters"].append(grid_search.best_params_)

    best_estimators[model_name] = grid_search.best_estimator_
best_estimators


result

Model: CatBoost
0:	learn: 0.6795038	total: 194ms	remaining: 3m 13s
1:	learn: 0.6668505	total: 236ms	remaining: 1m 58s
2:	learn: 0.6566102	total: 279ms	remaining: 1m 32s
3:	learn: 0.6460204	total: 318ms	remaining: 1m 19s
4:	learn: 0.6366464	total: 365ms	remaining: 1m 12s
5:	learn: 0.6278677	total: 410ms	remaining: 1m 7s
6:	learn: 0.6191595	total: 454ms	remaining: 1m 4s
7:	learn: 0.6088914	total: 497ms	remaining: 1m 1s
8:	learn: 0.5984181	total: 538ms	remaining: 59.2s
9:	learn: 0.5901397	total: 581ms	remaining: 57.5s
10:	learn: 0.5803248	total: 626ms	remaining: 56.3s
11:	learn: 0.5724414	total: 676ms	remaining: 55.7s
12:	learn: 0.5648522	total: 721ms	remaining: 54.7s
13:	learn: 0.5568583	total: 762ms	remaining: 53.7s
14:	learn: 0.5510131	total: 806ms	remaining: 52.9s
15:	learn: 0.5435881	total: 849ms	remaining: 52.2s
16:	learn: 0.5366608	total: 887ms	remaining: 51.3s
17:	learn: 0.5306891	total: 929ms	remaining: 50.7s
18:	learn: 0.5250129	total: 967ms	remaining: 49.9s
19:	learn: 0.5210853

UsageError: Line magic function `%%time` not found.


In [2]:
%%time
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

result = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": [],
}

for model_name in best_estimators:
    model = best_estimators[model_name]
    print(f"Model: {model_name}")
    print("=" * 50)
    result["Model"].append(model_name)
    
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_measure = f1_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F-measure:", f_measure)

    result["Accuracy"].append(accuracy)
    result["Precision"].append(precision)
    result["Recall"].append(recall)
    result["F1 Score"].append(f_measure)

Model: CatBoost
Accuracy: 0.7621951219512195
Precision: 0.765625
Recall: 0.9158878504672897
F-measure: 0.8340425531914893
Wall time: 12.7 ms
