## Imports

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier

## Loading the datasets

In [3]:
training_set = pd.read_csv("./train.csv")
testing_set = pd.read_csv("./test.csv")
sample_submission = pd.read_csv("./sample_submission.csv")
sample_submission_copy = sample_submission.copy()

## Feature Extraction

In [4]:
def add_features_to(df):
    df['trans_date'] = pd.to_datetime(df['trans_date'], errors='coerce')
    df['day'] = df['trans_date'].dt.dayofweek
    df['is_weekend'] = df['day'].apply(lambda x: 1 if x in [5, 6] else 0)

    df['trans_time'] = pd.to_datetime(df['trans_time'], errors='coerce', format='%H:%M:%S')
    df['hour'] = df['trans_time'].dt.hour
    
    df['distance_diff'] = np.sqrt((df['lat'] - df['merch_lat'])**2 + (df['long'] - df['merch_long'])**2)
    df['log_amt'] = np.log1p(df['amt'])
    return df

training_set = add_features_to(training_set)
testing_set = add_features_to(testing_set)

## Feature Selection

In [5]:
cat_features = ['category', 'gender', 'state', 'job']
for c in cat_features:
    label_encoder = LabelEncoder()
    training_set[c] = label_encoder.fit_transform(training_set[c].astype(str))
    testing_set[c] = label_encoder.transform(testing_set[c].astype(str))

dropped_columns = ['street', 'city', 'zip', 'trans_date', 'trans_time', 'trans_num', 'first', 'last', 'dob', 'merchant']
X = training_set.drop(['is_fraud'] + dropped_columns, axis=1)
y = training_set['is_fraud']
X_test = testing_set.drop(dropped_columns, axis=1)

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)



## Sample + Split into training and testing set

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

## Model Creation

In [7]:
rf = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, class_weight='balanced', n_jobs=-1)
xgb = XGBClassifier(n_estimators=100, max_depth=10, random_state=42, use_label_encoder=False, eval_metric='logloss')
gbm = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)

In [None]:
meta_model = LogisticRegression(max_iter=1000)

model = StackingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('gbm', gbm)],
    final_estimator=meta_model,
    cv=3,
    n_jobs=-1
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



## Model Evaluation

In [None]:
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"F1-Score: {f1}")
print(F"Accuracy: {accuracy}")

F1-Score: 0.995444642503466
Accuracy: 0.995447694158128


## Creating submission file

In [None]:
predictions = model.predict(X_test)

sample_submission_copy['is_fraud'] = predictions
sample_submission_copy.to_csv("./submission.csv", index=False)
print("Submission file created: submission.csv")

NameError: name 'model' is not defined