In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, classification_report
import xgboost as xgb

In [66]:
import numpy as np
import pandas as pd

# Load datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Confirm the target
target_col = 'is_fraud'

# Convert trans_date to datetime
train['trans_date'] = pd.to_datetime(train['trans_date'])
test['trans_date'] = pd.to_datetime(test['trans_date'])

# Extract temporal features
train['hour'] = pd.to_datetime(train['trans_time'], format='%H:%M:%S').dt.hour
test['hour'] = pd.to_datetime(test['trans_time'], format='%H:%M:%S').dt.hour

train['day'] = train['trans_date'].dt.day
test['day'] = test['trans_date'].dt.day

train['day_of_week'] = train['trans_date'].dt.dayofweek
test['day_of_week'] = test['trans_date'].dt.dayofweek

# Calculate haversine distance
def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r

train['distance'] = haversine(train['long'], train['lat'], train['merch_long'], train['merch_lat'])
test['distance'] = haversine(test['long'], test['lat'], test['merch_long'], test['merch_lat'])

# Calculate age
train['dob'] = pd.to_datetime(train['dob'])
test['dob'] = pd.to_datetime(test['dob'])

train['age'] = (train['trans_date'] - train['dob']).dt.days / 365.25
test['age'] = (test['trans_date'] - test['dob']).dt.days / 365.25

# Encode categorical variables as integers
categorical_features = ['category', 'state', 'job', 'gender']
for col in categorical_features:
    # Create a combined series for consistent encoding
    combined = pd.concat([train[col], test[col]], axis=0)
    encoding = {value: idx + 1 for idx, value in enumerate(combined.dropna().unique())}  # Map categories to integers
    train[col] = train[col].map(encoding).fillna(0).astype(int)  # Fill missing with 0
    test[col] = test[col].map(encoding).fillna(0).astype(int)    # Fill missing with 0

# Drop unnecessary columns
drop_cols = ['trans_num', 'first', 'last', 'street', 'city', 'unix_time', 'trans_date', 'trans_time', 'merchant', 'dob']
train.drop(columns=drop_cols, inplace=True, errors='ignore')
test.drop(columns=drop_cols, inplace=True, errors='ignore')

# Handle missing values
for col in train.select_dtypes(include=[np.number]).columns:
    train[col].fillna(train[col].median(), inplace=True)
    if col in test.columns:
        test[col].fillna(train[col].median(), inplace=True)

# Explicitly list all features
all_features = [
    'hour', 'day', 'day_of_week', 'distance', 'age',
    'category', 'state', 'job', 'gender'  # Include the encoded categorical features
]

# Finalize the datasets
X_train = train[all_features]
y_train = train[target_col]
X_test = test[all_features]

# Display the shapes of the finalized datasets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (370703, 9)
y_train shape: (370703,)
X_test shape: (92676, 9)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are set

In [67]:
X_train = train.drop('is_fraud', axis=1)
y_train = train['is_fraud']

X_test = test.copy()  # test does not have is_fraud

from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

xgb_model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    
    verbose=True
)
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Predict on the validation set
y_val_pred = xgb_model.predict(X_val)
y_val_proba = xgb_model.predict_proba(X_val)[:, 1]

# Evaluate AUC
val_auc = roc_auc_score(y_val, y_val_proba)
print("Validation AUC:", val_auc)

# Additional metrics
print(classification_report(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

y_pred = xgb_model.predict(X_val)
f1 = f1_score(y_val, y_pred)
print(f1)

preds = xgb_model.predict(X_test)
# Create a submission DataFrame
submission = pd.DataFrame({
    'id': test['id'],  # Use the renamed 'id' column
    'is_fraud': preds
})

# Save the submission file
submission.to_csv('submission.csv', index=False)

[0]	validation_0-logloss:0.23621
[1]	validation_0-logloss:0.18456
[2]	validation_0-logloss:0.15083
[3]	validation_0-logloss:0.12815
[4]	validation_0-logloss:0.11248
[5]	validation_0-logloss:0.09835
[6]	validation_0-logloss:0.08982
[7]	validation_0-logloss:0.08276
[8]	validation_0-logloss:0.07739
[9]	validation_0-logloss:0.07294
[10]	validation_0-logloss:0.06769
[11]	validation_0-logloss:0.06420
[12]	validation_0-logloss:0.06296
[13]	validation_0-logloss:0.06039
[14]	validation_0-logloss:0.05814
[15]	validation_0-logloss:0.05043
[16]	validation_0-logloss:0.04828
[17]	validation_0-logloss:0.04674
[18]	validation_0-logloss:0.04190
[19]	validation_0-logloss:0.04015
[20]	validation_0-logloss:0.03940


Parameters: { "use_label_encoder" } are not used.



[21]	validation_0-logloss:0.03594
[22]	validation_0-logloss:0.03385
[23]	validation_0-logloss:0.03358
[24]	validation_0-logloss:0.03281
[25]	validation_0-logloss:0.03184
[26]	validation_0-logloss:0.03154
[27]	validation_0-logloss:0.03050
[28]	validation_0-logloss:0.02916
[29]	validation_0-logloss:0.02758
[30]	validation_0-logloss:0.02622
[31]	validation_0-logloss:0.02558
[32]	validation_0-logloss:0.02545
[33]	validation_0-logloss:0.02480
[34]	validation_0-logloss:0.02381
[35]	validation_0-logloss:0.02313
[36]	validation_0-logloss:0.02238
[37]	validation_0-logloss:0.02221
[38]	validation_0-logloss:0.02196
[39]	validation_0-logloss:0.02192
[40]	validation_0-logloss:0.02138
[41]	validation_0-logloss:0.02058
[42]	validation_0-logloss:0.02012
[43]	validation_0-logloss:0.02013
[44]	validation_0-logloss:0.02011
[45]	validation_0-logloss:0.01984
[46]	validation_0-logloss:0.01923
[47]	validation_0-logloss:0.01875
[48]	validation_0-logloss:0.01876
[49]	validation_0-logloss:0.01854
[50]	validatio

In [42]:
submission.head()

Unnamed: 0,trans_num,is_fraud
0,2e6b34f2047158280fd5b50cb5249fcc,0.000297
1,5e4c36e1e6f1838f0afe1ed83d42d48e,0.001277
2,de58b3413be0b956c261b8e756006b5d,0.00386
3,63e5e8954b6954121fb9395b8fb87ec3,0.999211
4,f0acdc291ca35b61a873060e419b20a5,0.002934
