In [11]:
# categorical boosting method
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Load the data
train_df = pd.read_csv('train_tfidf_features.csv')
test_df = pd.read_csv('test_tfidf_features.csv')
submission_df = pd.read_csv('sample_submission.csv')

# Extract features and labels
X = train_df.drop('label', axis=1)
y = train_df['label']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_df_scaled = scaler.transform(test_df)

# Initialize the CatBoostClassifier
catboost_model = CatBoostClassifier(verbose=0)

# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [3, 5, 7],
    'l2_leaf_reg': [1, 3, 5, 7],
    'border_count': [32, 64, 128]
}

# Perform randomized search with cross-validation
random_search = RandomizedSearchCV(catboost_model, param_distributions=param_dist, n_iter=50, scoring='roc_auc', cv=3, verbose=2, n_jobs=-1, random_state=42)
random_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")

# Train the model with the best parameters
best_catboost_model = CatBoostClassifier(**best_params, verbose=0)
best_catboost_model.fit(X_train_scaled, y_train, eval_set=(X_val_scaled, y_val), early_stopping_rounds=10, verbose=100)

# Predict and evaluate on the validation set
y_pred_catboost = best_catboost_model.predict(X_val_scaled)
y_proba_catboost = best_catboost_model.predict_proba(X_val_scaled)[:, 1]

print("CatBoost Performance:")
print(classification_report(y_val, y_pred_catboost))
print(f"ROC AUC: {roc_auc_score(y_val, y_proba_catboost)}")

# Train on the full training data
best_catboost_model.fit(scaler.transform(X), y)

# Predict on the test set
test_predictions = best_catboost_model.predict(test_df_scaled)

# Prepare submission file
submission_df['label'] = test_predictions
submission_df.to_csv('submission5.csv', index=False)



# submission_df.to_csv('submission5.csv', index=False)

# train_df = pd.read_csv('train_tfidf_features.csv')
# test_df = pd.read_csv('test_tfidf_features.csv')
# submission_df = pd.read_csv('sample_submission.csv')


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters: {'learning_rate': 0.1, 'l2_leaf_reg': 1, 'iterations': 300, 'depth': 7, 'border_count': 128}
0:	learn: 0.6847517	test: 0.6845571	best: 0.6845571 (0)	total: 153ms	remaining: 45.9s
100:	learn: 0.5541464	test: 0.5753206	best: 0.5753206 (100)	total: 15.3s	remaining: 30.2s
200:	learn: 0.4985279	test: 0.5557573	best: 0.5557573 (200)	total: 29.8s	remaining: 14.7s
299:	learn: 0.4603647	test: 0.5483089	best: 0.5483089 (299)	total: 43.4s	remaining: 0us

bestTest = 0.5483089353
bestIteration = 299

CatBoost Performance:
              precision    recall  f1-score   support

           0       0.73      0.87      0.79      2153
           1       0.68      0.45      0.54      1284

    accuracy                           0.72      3437
   macro avg       0.70      0.66      0.67      3437
weighted avg       0.71      0.72      0.70      3437

ROC AUC: 0.7739186645309812
