In [0]:
import pandas as pd
df = spark.table("workspace.default.credit_score")

In [0]:
# ==========================================================
# 🔹 Step 1: Import Libraries
# ==========================================================
import pandas as pd
from pyspark.sql import functions as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemb le import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ==========================================================
# 🔹 Step 2: Load Spark Table
# ==========================================================
df = spark.table("workspace.default.credit_score")

# ==========================================================
# 🔹 Step 3: Compute Credit Score
# ==========================================================
df_score = df.withColumn(
    "credit_score",
    (
        850
        - (F.col("amount") * 10)
        - (F.when(F.col("is_international") == True, 50).otherwise(0))
        - (F.when(F.col("label_fraud") == True, 150).otherwise(0))
        + (F.when(F.col("is_chip") == True, 20).otherwise(0))
        + (F.when(F.col("is_contactless") == True, 10).otherwise(0))
    ).cast("int")
)

# Cap between 400 and 850
df_score = df_score.withColumn(
    "credit_score",
    F.when(F.col("credit_score") > 850, 850)
     .when(F.col("credit_score") < 400, 400)
     .otherwise(F.col("credit_score"))
)

# Convert to Pandas
pdf = df_score.toPandas()

# ==========================================================
# 🔹 Step 4: Preprocessing
# ==========================================================
# 1️⃣ Select features and target
features = ["amount", "is_international", "is_chip", "is_contactless", "credit_score"]
target = "label_fraud"

# 2️⃣ Handle missing values (if any)
pdf[features] = pdf[features].fillna(0)  # fill numeric/boolean features with 0
pdf[target] = pdf[target].fillna(False)  # fill target boolean with False

# 3️⃣ Convert boolean columns to integer (0/1)
pdf[features] = pdf[features].astype(int)

# 4️⃣ Encode target label
le = LabelEncoder()
pdf[target + "_encoded"] = le.fit_transform(pdf[target])

# 5️⃣ Optional: Feature scaling (for some models, not strictly needed for Random Forest)
scaler = StandardScaler()
pdf[features] = scaler.fit_transform(pdf[features])

# Check preprocessed data
print(pdf[features + [target, target+"_encoded"]].head())

# ==========================================================
# 🔹 Step 5: Split Dataset
# ==========================================================
X = pdf[features]
y = pdf[target + "_encoded"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")


     amount  is_international  ...  label_fraud  label_fraud_encoded
0 -0.080614         -0.294371  ...        False                    0
1  5.410233         -0.294371  ...        False                    0
2 -0.538184          3.397076  ...        False                    0
3 -0.538184          3.397076  ...        False                    0
4  0.376957         -0.294371  ...        False                    0

[5 rows x 7 columns]
Training samples: 400000, Test samples: 100000


In [0]:
# ==========================================================
# 🔹 Step 1: Import Libraries
# ==========================================================
import pandas as pd
from pyspark.sql import functions as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ==========================================================
# 🔹 Step 2: Load Spark Table
# ==========================================================
df = spark.table("workspace.default.credit_score")

# ==========================================================
# 🔹 Step 3: Compute credit_score dynamically
# ==========================================================
df_score = df.withColumn(
    "credit_score",
    (
        850
        - (F.col("amount") * 10)
        - (F.when(F.col("is_international") == True, 50).otherwise(0))
        - (F.when(F.col("label_fraud") == True, 150).otherwise(0))
        + (F.when(F.col("is_chip") == True, 20).otherwise(0))
        + (F.when(F.col("is_contactless") == True, 10).otherwise(0))
    ).cast("int")
)

# Cap between 400 and 850
df_score = df_score.withColumn(
    "credit_score",
    F.when(F.col("credit_score") > 850, 850)
     .when(F.col("credit_score") < 400, 400)
     .otherwise(F.col("credit_score"))
)

# Convert to Pandas
pdf = df_score.toPandas()

# ==========================================================
# 🔹 Step 4: Preprocessing
# ==========================================================
# Features for fraud prediction
features = ["amount", "is_international", "is_chip", "is_contactless", "credit_score"]

# Ensure boolean columns are integers
pdf[features] = pdf[features].astype(int)

# Target variable
target = "label_fraud"
le = LabelEncoder()
pdf[target + "_encoded"] = le.fit_transform(pdf[target])

# ==========================================================
# 🔹 Step 5: Split Dataset
# ==========================================================
X = pdf[features]
y = pdf[target + "_encoded"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ==========================================================
# 🔹 Step 6: Model Training
# ==========================================================
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# ==========================================================
# 🔹 Step 7: Model Evaluation
# ==========================================================
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=["Non-Fraud","Fraud"])

print(f"✅ Accuracy: {accuracy*100:.2f}%")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(report)

# ==========================================================
# 🔹 Step 8: Prediction Function
# ==========================================================
def predict_fraud(amount, is_international, is_chip, is_contactless):
    credit_score = 850 \
        - (amount * 10) \
        - (50 if is_international else 0) \
        + (20 if is_chip else 0) \
        + (10 if is_contactless else 0)
    credit_score = max(400, min(850, credit_score))
    
    input_df = pd.DataFrame({
        "amount": [amount],
        "is_international": [int(is_international)],
        "is_chip": [int(is_chip)],
        "is_contactless": [int(is_contactless)],
        "credit_score": [credit_score]
    })
    
    pred_encoded = rf.predict(input_df)[0]
    pred_label = le.inverse_transform([pred_encoded])[0]
    pred_prob = rf.predict_proba(input_df)[0][1]
    
    return pred_label, pred_prob, credit_score

# ==========================================================
# 🔹 Step 9: Test Prediction
# ==========================================================
amount = 25
is_international = True
is_chip = True
is_contactless = False

pred_label, pred_prob, credit_score = predict_fraud(amount, is_international, is_chip, is_contactless)
print(f"Credit Score: {credit_score}")
print(f"Predicted Fraudulent Transaction: {pred_label}")
print(f"Fraud Probability: {pred_prob*100:.2f}%")


✅ Accuracy: 100.00%
Confusion Matrix:
[[99028     0]
 [    0   972]]
Classification Report:
              precision    recall  f1-score   support

   Non-Fraud       1.00      1.00      1.00     99028
       Fraud       1.00      1.00      1.00       972

    accuracy                           1.00    100000
   macro avg       1.00      1.00      1.00    100000
weighted avg       1.00      1.00      1.00    100000

Credit Score: 570
Predicted Fraudulent Transaction: False
Fraud Probability: 1.00%
