In [1]:
# LightGBM and Bayesian Optimization Method 2
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score
from scipy.stats import uniform, randint

# Load the data
train_df = pd.read_csv('train_tfidf_features.csv')
test_df = pd.read_csv('test_tfidf_features.csv')
submission_df = pd.read_csv('sample_submission.csv')

# Extract features and labels
X = train_df.drop('label', axis=1)
y = train_df['label']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_df_scaled = scaler.transform(test_df)

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', early_stopping_rounds=10)

# Define the parameter distribution
param_dist = {
    'n_estimators': randint(50, 200),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 6),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 0.5),
    'reg_alpha': uniform(0, 0.1),
    'reg_lambda': uniform(0.8, 1.2)
}

# Perform randomized search with cross-validation
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=50, scoring='roc_auc', cv=3, verbose=2, n_jobs=-1, random_state=42)
random_search.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=False)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")

# Train the model with the best parameters
best_xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', **best_params)
best_xgb_model.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=False, early_stopping_rounds=10)

# Predict and evaluate on the validation set
y_pred_xgb = best_xgb_model.predict(X_val_scaled)
y_proba_xgb = best_xgb_model.predict_proba(X_val_scaled)[:, 1]

print("XGBoost Performance:")
print(classification_report(y_val, y_pred_xgb))
print(f"ROC AUC: {roc_auc_score(y_val, y_proba_xgb)}")

# Train on the full training data
best_xgb_model.fit(scaler.transform(X), y)

# Predict on the test set
test_predictions = best_xgb_model.predict(test_df_scaled)

# Prepare submission file
submission_df['label'] = test_predictions
submission_df.to_csv('submission4.csv', index=False)


Fitting 3 folds for each of 50 candidates, totalling 150 fits


OSError: [Errno 9] Bad file descriptor