In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
import gc

# Step 1: Load the Data
data = pd.read_csv(r"/content/creditcard.csv")

# Step 1.1: Pick the label column -- 'isFraud' when present, otherwise fall back to 'Class'
if 'isFraud' in data.columns:
    target_col = 'isFraud'
else:
    target_col = 'Class'
# The fallback name is only a guess, so confirm the column really exists.
if target_col not in data.columns:
    raise ValueError("Target column ('isFraud' or 'Class') not found in the dataset!")

# Step 1.2: Rows whose label is NaN are reported and then dropped
target_is_missing = data[target_col].isnull()
if target_is_missing.any():
    print(f"Found {data[target_col].isnull().sum()} NaN values in {target_col}. Dropping rows with NaN in target.")
    data = data.dropna(subset=[target_col])

# Step 1.3: Optimize memory by downcasting numerical columns
def downcast_df(df):
    """Shrink 64-bit numeric columns of *df* to the smallest dtype that fits.

    ``int64`` columns are passed through ``pd.to_numeric(..., downcast='integer')``
    and ``float64`` columns through ``pd.to_numeric(..., downcast='float')``.
    Columns of any other dtype are left untouched.

    The frame is modified in place and the same object is returned, so the
    call can be used either as a statement or in an assignment.
    """
    # (dtype to look for, downcast mode handed to pd.to_numeric)
    conversions = (
        ('int64', 'integer'),
        ('float64', 'float'),
    )
    for wide_dtype, downcast_mode in conversions:
        wide_columns = df.select_dtypes(include=[wide_dtype]).columns
        for name in wide_columns:
            df[name] = pd.to_numeric(df[name], downcast=downcast_mode)
    return df

data = downcast_df(data)

# Step 1.4: Reduce training data size to 30% of original
# (fixed random_state so the subsample is reproducible)
data = data.sample(frac=0.3, random_state=42)

# Split into train and test (80/20), stratified on the target column
train, test = train_test_split(data, test_size=0.2, random_state=42, stratify=data[target_col])

# train_test_split returns row selections of `data`. Later steps drop and
# assign columns on both frames, so take explicit copies here: the frames
# become independent objects and pandas no longer emits
# SettingWithCopyWarning when they are modified.
train = train.copy()
test = test.copy()

# Store TransactionID or index for test set alignment
test_ids = test['TransactionID'].copy() if 'TransactionID' in test.columns else test.index

# Clean up memory
gc.collect()

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# Step 2: Minimal Preprocessing
# 2.1: Remove features with high missing values (>80%).
# The missing ratio is measured on train only and the same columns are then
# removed from test so both frames keep an identical schema.
# 'TransactionID' is never dropped because it identifies rows.
missing_percent = train.isnull().mean()
high_missing_cols = [
    col for col in missing_percent[missing_percent > 0.8].index
    if col != 'TransactionID'
]
# Reassign instead of dropping in place: `train`/`test` originate from a
# split of `data`, and in-place mutation of such selections triggers
# pandas' SettingWithCopyWarning.
train = train.drop(columns=high_missing_cols)
test = test.drop(columns=high_missing_cols)

# 2.2: Define minimal categorical columns
potential_categorical_cols = ['ProductCD']
categorical_cols = [col for col in potential_categorical_cols if col in train.columns]
missing_cols = [col for col in potential_categorical_cols if col not in train.columns]
if missing_cols:
    print(f"Note: The following columns are missing in the dataset: {missing_cols}")

# 2.3: Handle missing values minimally
# Numeric columns to impute; the target and the row identifier are excluded.
numerical_cols = train.select_dtypes(include=['float32', 'float64', 'int8', 'int16', 'int32']).columns.tolist()
numerical_cols = [col for col in numerical_cols if col not in (target_col, 'TransactionID')]

for col in numerical_cols:
    # The median is learned from train only and reused for test, so both
    # sets receive the same fill value and no test-set statistic leaks
    # into preprocessing.
    train_median = train[col].median()
    train[col] = train[col].fillna(train_median)
    test[col] = test[col].fillna(train_median)

for col in categorical_cols:
    # Missing categories become their own explicit 'missing' level.
    train[col] = train[col].fillna('missing').astype(str)
    test[col] = test[col].fillna('missing').astype(str)

# Step 3: Minimal Feature Selection
# Use only a few raw features (whichever of them exist in this dataset)
numerical_cols = [col for col in ['Amount', 'V1', 'V2', 'V3'] if col in train.columns]
feature_selection_categorical_cols = [col for col in ['ProductCD'] if col in train.columns]

# Step 4: Optimized Feature Preparation for SVM
# 4.1: Encode categorical features
label_encoders = {}
for col in feature_selection_categorical_cols:
    le = LabelEncoder()
    # Fit on the union of train and test values so that transforming test
    # cannot fail on a category that never appears in train.
    combined = pd.concat([train[col], test[col]], axis=0)
    le.fit(combined)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# 4.2: Combine features
features = numerical_cols + feature_selection_categorical_cols
features = [f for f in features if f in train.columns]  # Final safety check
if not features:
    # Fail early with a readable message instead of a scaler error on an
    # empty feature matrix.
    raise ValueError("None of the expected feature columns are present in the dataset!")

X = train[features]
y = train[target_col]
X_test_full = test[features]  # Prepare test set features

# 4.3: Create train/val/test split (70/15/15) without SMOTE
# Both splits are stratified on the label, consistent with the earlier
# train/test split, so every partition keeps both classes even when
# positives are rare (roc_auc_score needs both classes to be present).
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp_raw, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# 4.4: Scale features
# The scaler is fitted on the training partition only; validation, test and
# the held-out frame are transformed with those statistics so nothing from
# the evaluation data influences preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)  # Full training frame, kept for later use
X_test_scaled = scaler.transform(X_test_full)  # Scale test set

# Step 5: Train Simpler SVM Model
# Hyper-parameters are gathered in one mapping so the configuration reads
# as a unit and is passed to the estimator in a single call.
svm_params = {
    'kernel': 'linear',         # Simpler kernel
    'class_weight': 'balanced',
    'probability': True,        # predict_proba is called on this model below
    'random_state': 42,
    'C': 0.01,                  # Stronger regularization
    'max_iter': 500,            # Very low iterations for underfitting
}
svm = SVC(**svm_params)

svm.fit(X_train, y_train)

# Step 6: Use Default Threshold (0.5)
optimal_threshold = 0.5
print(f"Using Default Threshold: {optimal_threshold:.4f}")


def _report_split_metrics(split_name, y_true, probs, threshold):
    """Score one data split, print its metrics and return them.

    Args:
        split_name: Label used in the printed lines (e.g. ``'Test'``).
        y_true: True binary labels of the split.
        probs: Predicted positive-class probabilities, aligned with ``y_true``.
        threshold: Probabilities ``>= threshold`` are predicted as class 1.

    Returns:
        A ``(preds, metrics)`` tuple: the thresholded 0/1 predictions and a
        dict with keys ``'Precision'``, ``'Recall'``, ``'F1'``, ``'AUC'``
        and ``'Accuracy'``.
    """
    preds = (probs >= threshold).astype(int)
    metrics = {
        'Precision': precision_score(y_true, preds),
        'Recall': recall_score(y_true, preds),
        'F1': f1_score(y_true, preds),
        # AUC is computed from the raw probabilities, not the thresholded labels.
        'AUC': roc_auc_score(y_true, probs),
        'Accuracy': accuracy_score(y_true, preds),
    }
    print(f"\n{split_name} Set Performance:")
    for metric_name, value in metrics.items():
        print(f"{split_name} {metric_name}: {value:.4f}")
    return preds, metrics


# Step 7: Evaluate on Test Set
test_probs = svm.predict_proba(X_test)[:, 1]
test_pred, _test_metrics = _report_split_metrics('Test', y_test, test_probs, optimal_threshold)
test_precision = _test_metrics['Precision']
test_recall = _test_metrics['Recall']
test_f1 = _test_metrics['F1']
test_auc = _test_metrics['AUC']
test_accuracy = _test_metrics['Accuracy']

# Step 8: Evaluate on Train Set
# Reported next to the test numbers so the two sets can be compared.
train_probs = svm.predict_proba(X_train)[:, 1]
train_pred, _train_metrics = _report_split_metrics('Train', y_train, train_probs, optimal_threshold)
train_precision = _train_metrics['Precision']
train_recall = _train_metrics['Recall']
train_f1 = _train_metrics['F1']
train_auc = _train_metrics['AUC']
train_accuracy = _train_metrics['Accuracy']

# Step 9: Generate Predictions for Submission
test_full_probs = svm.predict_proba(X_test_scaled)[:, 1]

# Create a DataFrame with TransactionID (or index) and predictions.
# np.asarray strips any pandas index from test_ids, so identifiers and
# probabilities are paired purely by position (both follow the row order
# of the held-out test frame).
test_pred_df = pd.DataFrame({
    'TransactionID': np.asarray(test_ids),
    'isFraud': test_full_probs
})

# The prediction frame already holds exactly one row per test identifier,
# so it is used as the submission directly. Merging it with itself on
# 'TransactionID' added nothing and would duplicate rows if an identifier
# ever repeated.
sample_submission = test_pred_df[['TransactionID', 'isFraud']].copy()  # Keep only required columns

# Save submission
sample_submission.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' created successfully!")

Found 1 NaN values in Class. Dropping rows with NaN in target.
Train shape: (19085, 31), Test shape: (4772, 31)
Note: The following columns are missing in the dataset: ['ProductCD', 'card4', 'card5', 'card6', 'P_emaildomain', 'R_emaildomain']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['hour'] = (train['Time'] // 3600) % 24
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['day_of_week'] = (train['Time'] // (3600 * 24)) % 7
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['LogTransactionAmt'] = np.log1p(train[col])


Optimal Threshold: 0.6375

Test Set Performance:
Test Precision: 0.9601
Test Recall: 0.9788
Test F1: 0.9694
Test AUC: 0.9978
Test Accuracy: 0.9897

Train Set Performance:
Train Precision: 0.9529
Train Recall: 0.9853
Train F1: 0.9688
Train AUC: 0.9965
Train Accuracy: 0.9894

Submission file 'submission.csv' created successfully!
