In [None]:
pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, precision_recall_curve, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
import gc
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load the Data
train_identity = pd.read_csv(r'/content/train_identity.csv')
train_transaction = pd.read_csv(r"/content/train_transaction.csv")
test_identity = pd.read_csv(r"/content/test_identity.csv")
test_transaction = pd.read_csv(r"/content/test_transaction.csv")

# Fix column names in test_identity
test_identity.columns = [col.replace('id-', 'id_') for col in test_identity.columns]

# Combine datasets
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

# Store TransactionIDs for test set submission
test_ids = test['TransactionID'].copy()

# Clean up memory
del train_identity, train_transaction, test_identity, test_transaction
gc.collect()

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# Step 2: Preprocessing
# Remove features with only one unique value (based on train only)
one_unique_col = [col for col in train.columns if train[col].nunique() == 1]
train.drop(columns=one_unique_col, inplace=True)
test.drop(columns=one_unique_col, inplace=True)

# Define categorical columns
base_categorical_cols = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain']
m_cols = [f'M{i}' for i in range(1, 10) if f'M{i}' in train.columns]
categorical_cols = base_categorical_cols + m_cols

# Verify categorical columns exist
categorical_cols = [col for col in categorical_cols if col in train.columns]

# Handle missing values
numerical_cols = train.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_cols = [col for col in numerical_cols if col not in ['isFraud', 'TransactionID']]
for col in numerical_cols:
    train[col].fillna(train[col].median(), inplace=True)
    test[col].fillna(train[col].median(), inplace=True)  # Use train median for test

for col in categorical_cols:
    train[col] = train[col].fillna('missing').astype(str)
    test[col] = test[col].fillna('missing').astype(str)

# Remove outliers using IQR (train only)
for col in ['TransactionAmt', 'dist1', 'dist2']:
    Q1 = train[col].quantile(0.25)
    Q3 = train[col].quantile(0.75)
    IQR = Q3 - Q1
    train = train[~((train[col] < (Q1 - 1.5 * IQR)) | (train[col] > (Q3 + 1.5 * IQR)))]

# Step 3: Feature Engineering
# Time-based features
train['day'] = ((train['TransactionDT'] // (3600 * 24) - 1) % 7) + 1
test['day'] = ((test['TransactionDT'] // (3600 * 24) - 1) % 7) + 1
train['hour'] = ((train['TransactionDT'] // 3600) % 24) + 1
test['hour'] = ((test['TransactionDT'] // 3600) % 24) + 1

def new_hr_feature(hr):
    if hr >= 7 and hr < 10:
        return "highwarningsign"
    elif hr >= 14 and hr < 16:
        return "lowestwarningsign"
    elif (hr >= 4 and hr < 7) or (hr >= 10 and hr < 14):
        return "mediumwarningsign"
    else:
        return "lowwarningsign"

train['hour_warning'] = train['hour'].apply(new_hr_feature)
test['hour_warning'] = test['hour'].apply(new_hr_feature)

# Log transform TransactionAmt
train['LogTransactionAmt'] = np.log(train['TransactionAmt'] + 1e-9)
test['LogTransactionAmt'] = np.log(test['TransactionAmt'] + 1e-9)

# New feature for card3
def new_card3(row):
    try:
        val = float(row)
        return 'Positive' if val > 160 else 'Negative'
    except:
        return 'missing'

train['new_card3'] = train['card3'].apply(new_card3)
test['new_card3'] = test['card3'].apply(new_card3)

# Simplify card6
def replacetodebit(row):
    if row in ['debit or credit', 'charge card']:
        return 'debit'
    return row

train['card6'] = train['card6'].apply(replacetodebit)
test['card6'] = test['card6'].apply(replacetodebit)

# Group email domains
for dataset in [train, test]:
    dataset.loc[dataset['P_emaildomain'].isin(['gmail.com', 'gmail']), 'P_emaildomain'] = 'Google'
    dataset.loc[dataset['P_emaildomain'].isin(['yahoo.com', 'yahoo.com.mx', 'yahoo.co.uk',
                                              'yahoo.co.jp', 'yahoo.de', 'yahoo.fr', 'yahoo.es']),
                'P_emaildomain'] = 'Yahoo'
    dataset.loc[dataset['P_emaildomain'].isin(['hotmail.com', 'outlook.com', 'msn.com',
                                              'live.com.mx', 'hotmail.es', 'hotmail.co.uk',
                                              'hotmail.de', 'outlook.es', 'live.com',
                                              'live.fr', 'hotmail.fr']),
                'P_emaildomain'] = 'Microsoft'
    dataset.loc[dataset['P_emaildomain'].isin(dataset['P_emaildomain'].value_counts()[
                dataset['P_emaildomain'].value_counts() <= 500].index),
                'P_emaildomain'] = 'Others'
    dataset['P_emaildomain'].fillna('NoInf', inplace=True)

    dataset.loc[dataset['R_emaildomain'].isin(['gmail.com', 'gmail']), 'R_emaildomain'] = 'Google'
    dataset.loc[dataset['R_emaildomain'].isin(['yahoo.com', 'yahoo.com.mx', 'yahoo.co.uk',
                                              'yahoo.co.jp', 'yahoo.de', 'yahoo.fr', 'yahoo.es']),
                'R_emaildomain'] = 'Yahoo'
    dataset.loc[dataset['R_emaildomain'].isin(['hotmail.com', 'outlook.com', 'msn.com',
                                              'live.com.mx', 'hotmail.es', 'hotmail.co.uk',
                                              'hotmail.de', 'outlook.es', 'live.com',
                                              'live.fr', 'hotmail.fr']),
                'R_emaildomain'] = 'Microsoft'
    dataset.loc[dataset['R_emaildomain'].isin(dataset['R_emaildomain'].value_counts()[
                dataset['R_emaildomain'].value_counts() <= 300].index),
                'R_emaildomain'] = 'Others'
    dataset['R_emaildomain'].fillna('NoInf', inplace=True)

# Transaction frequency per card and additional frequency features
train['card1_freq'] = train.groupby('card1')['card1'].transform('count')
test['card1_freq'] = test.groupby('card1')['card1'].transform('count')
train['addr1_freq'] = train.groupby('addr1')['addr1'].transform('count')
test['addr1_freq'] = test.groupby('addr1')['addr1'].transform('count')

# Interaction feature: TransactionAmt per card1 frequency
train['amt_per_card1'] = train['TransactionAmt'] / (train['card1_freq'] + 1e-9)
test['amt_per_card1'] = test['TransactionAmt'] / (test['card1_freq'] + 1e-9)

# Step 4: Feature Preparation
# Select top features based on importance and new features
numerical_cols = ['card1_freq', 'C5', 'D2', 'C2', 'LogTransactionAmt', 'D15', 'C1', 'addr1_freq', 'amt_per_card1']
categorical_cols = ['ProductCD', 'card6', 'new_card3', 'P_emaildomain', 'R_emaildomain', 'hour_warning']

# Handle categorical encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train[col] = train[col].astype(str)
    train_unique_values = train[col].unique()
    le.fit(train_unique_values)
    train[col] = le.transform(train[col])
    test[col] = test[col].astype(str)
    test[col] = test[col].apply(lambda x: x if x in train_unique_values else 'unknown')
    if 'unknown' not in le.classes_:
        le_classes = list(le.classes_)
        le_classes.append('unknown')
        le.classes_ = np.array(le_classes)
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# Combine features
features = numerical_cols + categorical_cols
X = train[features]
y = train['isFraud']
X_test = test[features]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Step 5: Handle Class Imbalance with SMOTE
smote = SMOTE(random_state=42, sampling_strategy=0.5)
X_train, y_train = smote.fit_resample(X_scaled, y)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Step 6: Hyperparameter Tuning with GridSearchCV
param_grid = {
    'max_depth': [5, 7, 9],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [2, 5],
    'class_weight': ['balanced'],
    'ccp_alpha': [0.01, 0.05, 0.1]
}
dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_dt = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV F1 Score: {grid_search.best_score_:.4f}")

# Step 7: Adaptive Thresholding
val_probs = best_dt.predict_proba(X_val)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_val, val_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-9)
optimal_threshold = thresholds[np.argmax(f1_scores)]
print(f"Optimal Threshold: {optimal_threshold:.4f}")

# Step 8: Evaluate Model
train_probs = best_dt.predict_proba(X_train)[:, 1]
val_probs = best_dt.predict_proba(X_val)[:, 1]

train_pred = (train_probs >= optimal_threshold).astype(int)
val_pred = (val_probs >= optimal_threshold).astype(int)

# Calculate metrics
train_precision = precision_score(y_train, train_pred)
train_recall = recall_score(y_train, train_pred)
train_f1 = f1_score(y_train, train_pred)
train_auc = roc_auc_score(y_train, train_probs)
train_accuracy = accuracy_score(y_train, train_pred)

val_precision = precision_score(y_val, val_pred)
val_recall = recall_score(y_val, val_pred)
val_f1 = f1_score(y_val, val_pred)
val_auc = roc_auc_score(y_val, val_probs)
val_accuracy = accuracy_score(y_val, val_pred)

# Print train and validation metrics
print("\nTrain Metrics:")
print(f"Train Precision: {train_precision:.4f}")
print(f"Train Recall: {train_recall:.4f}")
print(f"Train F1: {train_f1:.4f}")
print(f"Train AUC: {train_auc:.4f}")
print(f"Train Accuracy: {train_accuracy:.4f}")

print("\nTest (Validation) Metrics:")
print(f"Test Precision: {val_precision:.4f}")
print(f"Test Recall: {val_recall:.4f}")
print(f"Test F1: {val_f1:.4f}")
print(f"Test AUC: {val_auc:.4f}")
print(f"Test Accuracy: {val_accuracy:.4f}")

# Confusion Matrix
print("\nValidation Confusion Matrix:")
print(confusion_matrix(y_val, val_pred))

# Feature Importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': best_dt.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

# Step 9: Generate Test Predictions for Submission
sample_submission = pd.read_csv(r"/content/sample_submission.csv")
test_probs = best_dt.predict_proba(X_test_scaled)[:, 1]

# Ensure test_probs aligns with sample_submission
if len(test_probs) != len(sample_submission):
    print(f"Warning: Test predictions length ({len(test_probs)}) does not match sample_submission length ({len(sample_submission)})")
    # Create a DataFrame with test IDs and predictions
    test_pred_df = pd.DataFrame({'TransactionID': test_ids, 'isFraud': test_probs})
    # Merge with sample_submission to ensure all IDs are included
    sample_submission = sample_submission.merge(test_pred_df, on='TransactionID', how='left', suffixes=('', '_new'))
    # Fill missing predictions with a default value (e.g., 0 for non-fraud)
    sample_submission['isFraud'] = sample_submission['isFraud_new'].fillna(0)
    sample_submission = sample_submission.drop(columns=['isFraud_new'])
else:
    sample_submission['isFraud'] = test_probs

# Save submission
sample_submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created.")

Train shape: (14752, 434), Test shape: (12862, 433)
Best Parameters: {'ccp_alpha': 0.01, 'class_weight': 'balanced', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best CV F1 Score: 0.7830
Optimal Threshold: 0.8829

Train Metrics:
Train Precision: 0.7903
Train Recall: 0.7817
Train F1: 0.7860
Train AUC: 0.8630
Train Accuracy: 0.8581

Test (Validation) Metrics:
Test Precision: 0.7950
Test Recall: 0.7653
Test F1: 0.7799
Test AUC: 0.8574
Test Accuracy: 0.8559

Validation Confusion Matrix:
[[1497  164]
 [ 195  636]]

Feature Importance:
              Feature  Importance
3                  C2    0.454968
1                  C5    0.436282
8       amt_per_card1    0.063250
2                  D2    0.045500
0          card1_freq    0.000000
5                 D15    0.000000
4   LogTransactionAmt    0.000000
6                  C1    0.000000
7          addr1_freq    0.000000
9           ProductCD    0.000000
10              card6    0.000000
11          new_card3    0.000000
12  