In [21]:
####################################################################################################
### Binary Classification with a Tabular Credit Card Fraud Dataset
### link: https://www.kaggle.com/competitions/playground-series-s3e4/overview
####################################################################################################

# Importing the libraries
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Importing the dataset
train_data = pd.read_csv(r'sf-crime/train.csv',index_col=0)
test_data  = pd.read_csv(r'sf-crime/test.csv',index_col=0)

# Printing the number of rows of the train and test data
print("train_data = " + str(len(train_data)))
print("test_data = " + str(len(test_data)))

train_data = 878049
test_data = 884262


In [23]:
# Identify non-numerical columns in train_data
non_numerical_columns_train = train_data.select_dtypes(exclude=['number']).columns.tolist()

# Identify non-numerical columns in test_data
non_numerical_columns_test = test_data.select_dtypes(exclude=['number']).columns.tolist()

print("Non-Numerical Columns in Train Data:", non_numerical_columns_train)
print("Non-Numerical Columns in Test Data:", non_numerical_columns_test)

Non-Numerical Columns in Train Data: ['Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address']
Non-Numerical Columns in Test Data: ['Dates', 'DayOfWeek', 'PdDistrict', 'Address']


In [25]:
# Encode categorical variables in training data
label_encoder = LabelEncoder()
train_data['DayOfWeekLabel'] = label_encoder.fit_transform(train_data['DayOfWeek'])
test_data['DayOfWeekLabel'] = label_encoder.transform(test_data['DayOfWeek'])

train_data['PdDistrictLabel'] = label_encoder.fit_transform(train_data['PdDistrict'])
test_data['PdDistrictLabel'] = label_encoder.transform(test_data['PdDistrict'])

train_data['CategoryLabel'] = label_encoder.fit_transform(train_data['Category'])
train_data['DescriptLabel'] = label_encoder.fit_transform(train_data['Descript'])
train_data['ResolutionLabel'] = label_encoder.fit_transform(train_data['Resolution'])

# Prepare features and target variable for training
X_train = train_data[['DayOfWeekLabel', 'PdDistrictLabel', 'X', 'Y']]  # Choose relevant features

y_train = train_data['CategoryLabel']  # Target variable

# Initialize XGBoost Classifier
xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=len(label_encoder.classes_), #num_class=len(train_data['Category'].unique()),
    max_depth=6,  # Adjust max_depth
    learning_rate=0.1,  # Adjust learning_rate
    n_estimators=100,  # Adjust number of estimators
    random_state=42
)

# Train the model
xgb.fit(X_train, y_train)

# Preprocess test data
X_test = test_data[['DayOfWeekLabel', 'PdDistrictLabel', 'X', 'Y']]  # Choose relevant features

# Make predictions on test data
predictions_proba = xgb.predict_proba(X_test)

# Create DataFrame for submission
submission = pd.DataFrame(predictions_proba, columns=xgb.classes_)

# Replace extreme values in predicted probabilities
epsilon = 1e-15
submission.iloc[:, 1:] = submission.iloc[:, 1:].clip(epsilon, 1 - epsilon)

# Normalize predicted probabilities
submission.iloc[:, 1:] = submission.iloc[:, 1:].div(submission.iloc[:, 1:].sum(axis=1), axis=0)

# Save predictions to a CSV file
submission.to_csv('submission.csv', index=False)
