In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, classification_report

# Step 1: Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Step 2: Feature Engineering
train_copy = train.copy()
test_copy = test.copy()

# Extract day, month, and hour from date/time columns
for df in [train_copy, test_copy]:
    df['trans_date'] = pd.to_datetime(df['trans_date'], errors='coerce')
    df['trans_time'] = pd.to_datetime(df['trans_time'], errors='coerce', format='%H:%M:%S').dt.hour
    df['day'] = df['trans_date'].dt.day
    df['month'] = df['trans_date'].dt.month
    df['year'] = df['trans_date'].dt.year

# Calculate age
for df in [train_copy, test_copy]:
    df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
    df['age'] = df['year'] - df['dob'].dt.year

# Calculate distance between cardholder and merchant
for df in [train_copy, test_copy]:
    df['distance'] = df.apply(
        lambda row: geodesic((row['lat'], row['long']), (row['merch_lat'], row['merch_long'])).km, axis=1
    )

# Encode categorical variables
label_encoders = {}
for col in ['category', 'state', 'gender', 'job']:
    le = LabelEncoder()
    le.fit(train_copy[col].fillna('Unknown'))
    train_copy[col] = le.transform(train_copy[col].fillna('Unknown'))
    test_copy[col] = le.transform(test_copy[col].fillna('Unknown'))
    label_encoders[col] = le

# Drop unnecessary columns
columns_to_drop = ['trans_num', 'trans_date', 'cc_num', 'first', 'last', 'street', 'city', 'zip', 'dob', 'merchant']
train_copy = train_copy.drop(columns=columns_to_drop)
test_copy = test_copy.drop(columns=columns_to_drop)

# Step 3: Train-Test Split
X = train_copy.drop(columns=['is_fraud'])
y = train_copy['is_fraud']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Feature Scaling
scaler = StandardScaler()
numerical_cols = ['amt', 'distance', 'age']
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_valid[numerical_cols] = scaler.transform(X_valid[numerical_cols])
test_copy[numerical_cols] = scaler.transform(test_copy[numerical_cols])

# Step 5: Model Training with LightGBM
model = LGBMClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 6: Validate the Model
y_pred = model.predict(X_valid)
print("F1 Score:", f1_score(y_valid, y_pred))
print("Classification Report:")
print(classification_report(y_valid, y_pred))

# Step 7: Make Predictions on Test Data
test_predictions = model.predict(test_copy)

# Step 8: Create Submission File
submission = sample_submission.copy()
submission['is_fraud'] = test_predictions
submission.to_csv("submission.csv", index=False)
print("Submission file 'submission.csv' created successfully!")


[LightGBM] [Info] Number of positive: 33750, number of negative: 262812
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.085349 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2738
[LightGBM] [Info] Number of data points in the train set: 296562, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.113804 -> initscore=-2.052459
[LightGBM] [Info] Start training from score -2.052459
F1 Score: 0.973172327642615
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     65592
           1       0.99      0.96      0.97      8549

    accuracy                           0.99     74141
   macro avg       0.99      0.98      0.98     74141
weighted avg       0.99      0.99      0.99     74141

Submission file 'submission.csv' created successfully!
