# Importing required libraries


In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib



# Load the feature-engineered dataset


In [22]:
df = pd.read_csv('../data/feature_engineered_data.csv')

In [None]:
# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Use Label Encoding for high-cardinality categorical columns
label_encoder = LabelEncoder()

In [32]:
# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Check if there are any categorical columns
if len(categorical_columns) > 0:
    # Use Label Encoding for high-cardinality categorical columns
    label_encoder = LabelEncoder()

    # Loop through the categorical columns and apply LabelEncoder where appropriate
    for col in categorical_columns:
        unique_vals = df[col].nunique()
        if unique_vals > 100:  # Threshold for high cardinality (adjust if necessary)
            print(f"Encoding high-cardinality column: {col} with {unique_vals} unique values.")
            df[col] = label_encoder.fit_transform(df[col])
        else:
            # For low-cardinality columns, use One-Hot Encoding
            df = pd.get_dummies(df, columns=[col], drop_first=True)
else:
    print("No categorical columns found.")

Encoding high-cardinality column: TransactionId with 95662 unique values.
Encoding high-cardinality column: BatchId with 94809 unique values.
Encoding high-cardinality column: AccountId with 3633 unique values.
Encoding high-cardinality column: SubscriptionId with 3627 unique values.
Encoding high-cardinality column: CustomerId with 3742 unique values.
Encoding high-cardinality column: TransactionStartTime with 94556 unique values.


In [33]:
# Define target and features
X = df.drop(columns='FraudResult')  # Assuming 'FraudResult' is the target variable
y = df['FraudResult']

In [34]:
# Standardize the numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [37]:
# Ensure target is categorical
y = df['FraudResult']

# Check the unique values of the target to ensure it's not continuous
print(y.unique())

[-0.04496219 22.24090895]


In [38]:
# Assuming 'FraudResult' has a threshold (for example, above 0.5 is fraud, below is not):
y_binarized = (y > 0.5).astype(int)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_binarized, test_size=0.3, random_state=42)


# Train and Evaluate Models


In [44]:
# Fit models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
        "Decision Tree": DecisionTreeClassifier(),

}



In [45]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    

Model: Logistic Regression
Accuracy: 0.9981532457576919
ROC AUC Score: 0.6515413264771748
Confusion Matrix:
 [[28629    14]
 [   39    17]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     28643
           1       0.55      0.30      0.39        56

    accuracy                           1.00     28699
   macro avg       0.77      0.65      0.69     28699
weighted avg       1.00      1.00      1.00     28699

Model: Random Forest
Accuracy: 0.9995470225443395
ROC AUC Score: 0.9552174926808344
Confusion Matrix:
 [[28635     8]
 [    5    51]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     28643
           1       0.86      0.91      0.89        56

    accuracy                           1.00     28699
   macro avg       0.93      0.96      0.94     28699
weighted avg       1.00      1.00      1.00     28699

Model: Decision Tree
Acc

In [48]:
# Save the best model (for API serving)
best_model = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
best_model.fit(X_train, y_train)
joblib.dump(best_model, 'best_model.pkl')

print("Modelling Completed.")

Modelling Completed.
