In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE  # Importing SMOTE
from scipy.stats import zscore

# Load your dataset (update file paths as needed)
train_data = pd.read_csv('C:\\Users\\hamad\\Downloads\\fda_trainingset.csv')
test_data = pd.read_csv('C:\\Users\\hamad\\Downloads\\fda_testset.csv')
# Separate features and target
X_train = train_data.drop(columns=['ID', 'Y'], errors='ignore')  # Update 'Y' to match your target column name
y_train = train_data['Y']
X_test = test_data.drop(columns=['ID', 'Y'], errors='ignore')
y_test = test_data['Y'] if 'Y' in test_data.columns else None

# Impute missing values using SimpleImputer with mean strategy
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Standardize features
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Check class balance
print("Class distribution in training data:")
print(y_train.value_counts())

# Apply SMOTE for handling class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check class balance after applying SMOTE
print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_resampled, y_train_resampled, test_size=0.2, random_state=42, stratify=y_train_resampled)

# Train Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.9, random_state=42)
gb_clf.fit(X_train, y_train)

# Predictions and probabilities
y_pred = gb_clf.predict(X_val)
y_pred_proba = gb_clf.predict_proba(X_val)[:, 1]

# Evaluation Metrics
roc_auc = roc_auc_score(y_val, y_pred_proba)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)



Class distribution in training data:
Y
0    199487
1       513
Name: count, dtype: int64


[WinError 2] The system cannot find the file specified
  File "C:\Users\hamad\Downloads\anaconda\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\hamad\Downloads\anaconda\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hamad\Downloads\anaconda\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\hamad\Downloads\anaconda\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



Class distribution after SMOTE:
Y
0    199487
1    199487
Name: count, dtype: int64


In [2]:
# Print Metrics
print("\nGradient Boosting Evaluation Metrics:")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")




Gradient Boosting Evaluation Metrics:
ROC AUC: 0.9995
Precision: 0.9954
Recall: 0.9981
F1 Score: 0.9968
Accuracy: 0.9968


In [3]:
# Create predictions for the test set
y_test_proba = gb_clf.predict_proba(X_test)[:, 1]  # Predicted probabilities for test set

# Create DataFrame for submission
submission_df = pd.DataFrame({
    'ID': test_data['ID'],  # Assuming 'ID' exists in the test set
    'Prediction': y_test_proba
})

# Save predictions to CSV
submission_df.to_csv('GradientBoosting_predictions_smote.csv', index=False)
print("Submission file 'GradientBoosting_predictions_smote.csv' created successfully.")


Submission file 'GradientBoosting_predictions_smote.csv' created successfully.
