In [21]:
####################################################################################################
### Binary Classification with a Tabular Credit Card Fraud Dataset
### link: https://www.kaggle.com/competitions/playground-series-s3e4/overview
####################################################################################################

# Importing the libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Load the competition CSVs; the first column of each file is used as the index
train_data = pd.read_csv('train.csv', index_col=0)
test_data = pd.read_csv('test.csv', index_col=0)

# Report how many rows each split contains
print(f"train_data = {len(train_data)}")
print(f"test_data = {len(test_data)}")

# Show the class balance of the training target (count of 0s vs. 1s in 'Class')
print(train_data['Class'].value_counts())

train_data = 219129
test_data = 146087
Class
0    218660
1       469
Name: count, dtype: int64


In [22]:
# Separate features (X) and target variable (y) from the train dataset
X_train = train_data.drop(columns=['Class'])  # Features
y_train = train_data['Class']  # Target variable

# Fixed seed so resampling and training give the same result on every run
RANDOM_STATE = 42
# Desired number of rows per class after resampling
TARGET_PER_CLASS = 5000

# Count each class once and reuse the result for both samplers
class_counts = y_train.value_counts()

# Over-sampling can only ADD rows, so the target must be at least the current
# minority count. Using min() here would cap the target at the existing count
# (469 in this dataset), making SMOTE generate no synthetic samples at all.
over_sampling = SMOTE(
    sampling_strategy={1: max(TARGET_PER_CLASS, int(class_counts[1]))},
    random_state=RANDOM_STATE,
)
# Under-sampling can only REMOVE rows, so the target must be at most the
# current majority count.
under_sampling = RandomUnderSampler(
    sampling_strategy={0: min(TARGET_PER_CLASS, int(class_counts[0]))},
    random_state=RANDOM_STATE,
)

# Create a pipeline for resampling: synthesize minority rows first, then trim
# the majority class
sampling_pipeline = Pipeline([
    ('over_sampling', over_sampling),
    ('under_sampling', under_sampling)
])

# Apply resampling to the training data only (the test data is never resampled)
X_train_resampled, y_train_resampled = sampling_pipeline.fit_resample(X_train, y_train)

# Initialize the XGBoost classifier with a fixed seed for reproducibility
model = XGBClassifier(random_state=RANDOM_STATE)

# Train the model on the resampled data
model.fit(X_train_resampled, y_train_resampled)

# Align the test columns with the training feature order before predicting
test_features = test_data[X_train.columns]

# Predicted probabilities of class 1 for the test data
test_predictions = model.predict_proba(test_features)[:, 1]

# Save the predicted probabilities to a DataFrame for submission
submission_df = pd.DataFrame({
    'id': test_data.index,  # Assuming the index is the 'id' column in the test data
    'Class': test_predictions  # Predicted probabilities for the 'Class'
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_file.csv', index=False)