<a href="https://colab.research.google.com/github/lavanya9739/credit-card-fraud-detection/blob/main/lgbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from lightgbm import LGBMClassifier
import joblib


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
pip install dask[dataframe]

Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.20-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.18-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.16-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.16-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.16


In [4]:
# Load the dataset
file_path = "/content/output_file.csv"  # Replace with your file path
data = pd.read_csv(file_path)



In [5]:
data_cleaned = data.drop(columns=["Account Number", "Card Number", "Transaction Time", "Transaction Date",
                                  "Merchant Number", "Approval Code"])


In [6]:
label_encoder = LabelEncoder()
categorical_cols = ["Transaction Type", "Currency Code", "Transaction Country", "Transaction City", "Fraud Label"]
for col in categorical_cols:
    data_cleaned[col] = label_encoder.fit_transform(data_cleaned[col])


In [7]:
X = data_cleaned.drop(columns=["Fraud Label"])
y = data_cleaned["Fraud Label"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [11]:
lgbm_model = LGBMClassifier(boosting_type='gbdt',
                            max_depth=7,
                            n_estimators=100,
                            learning_rate=0.1,
                            random_state=42)
lgbm_model.fit(X_train_scaled, y_train)


[LightGBM] [Info] Number of positive: 757, number of negative: 43
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1033
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.946250 -> initscore=2.868163
[LightGBM] [Info] Start training from score 2.868163


In [12]:
y_pred = lgbm_model.predict(X_test_scaled)
y_pred_proba = lgbm_model.predict_proba(X_test_scaled)[:, 1]

In [13]:
metrics = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "F1 Score": f1_score(y_test, y_pred),
    "AUC": roc_auc_score(y_test, y_pred_proba)
}

In [14]:
print("LightGBM Evaluation Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

LightGBM Evaluation Metrics:
Accuracy: 0.9400
Precision: 0.9495
Recall: 0.9895
F1 Score: 0.9691
AUC: 0.3984


In [15]:
joblib.dump(scaler, "scaler.pkl")
print("Scaler saved as scaler.pkl")


Scaler saved as scaler.pkl


In [16]:
joblib.dump(lgbm_model, "fraud_detection_lgbm_model.pkl")
print("Model saved as fraud_detection_lgbm_model.pkl")


Model saved as fraud_detection_lgbm_model.pkl


In [17]:
# Example new data with all required features
new_data = pd.DataFrame([{
    "Transaction Type": "Refund",
    "Currency Code": "INR",
    "Transaction Country": "IN",
    "Transaction City": "Hyderabad",
    "Credit Limit": 143194.285,  # Replace with an actual value
    "Merchant Category Code": 4044,  # Replace with an actual value
    "Open to Buy": 33994.67546,  # Replace with an actual value
    "Transaction Amount": 4487.46083  # Replace with an actual value
}])


In [18]:
for col in categorical_cols[:-1]:  # Skip "Fraud Label" as it's the target
    if col in new_data.columns:
        new_data[col] = label_encoder.fit_transform(new_data[col])

In [19]:
new_data = new_data[X.columns]

In [20]:
new_data_scaled = scaler.transform(new_data)


In [22]:
fraud_prediction = lgbm_model.predict(new_data_scaled)
print("Fraud Prediction:", "Fraud" if fraud_prediction[0] == 1 else "Not Fraud")

Fraud Prediction: Fraud
