In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from yellowbrick.regressor import CooksDistance
from yellowbrick.features import ParallelCoordinates

# Step 1: Data Preprocessing
# Load the dataset
fraud_data = pd.read_csv("/content/Fraud_check.csv")

In [14]:
# Explore the dataset
print(fraud_data.head())
print(fraud_data.info())

  Undergrad Marital.Status  Taxable.Income  City.Population  Work.Experience  \
0        NO         Single           68833            50047               10   
1       YES       Divorced           33700           134075               18   
2        NO        Married           36925           160205               30   
3       YES         Single           50190           193264               15   
4        NO        Married           81002            27533               28   

  Urban  
0   YES  
1   YES  
2   YES  
3   YES  
4    NO  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    ob

In [3]:
# Preprocess the data
# Assuming 'taxable_income' <= 30000 as risky (1) and others as good (0)
fraud_data['risk'] = fraud_data['Taxable.Income'].apply(lambda x: 1 if x <= 30000 else 0)


In [4]:
# Drop the original 'Taxable.Income' column
fraud_data.drop(columns=['Taxable.Income'], inplace=True)

In [5]:
# Encode categorical variables
fraud_data_encoded = pd.get_dummies(fraud_data, columns=['Undergrad', 'Marital.Status', 'Urban'], drop_first=True)


In [6]:
# Split the data into features (X) and target (y)
X = fraud_data_encoded.drop(columns=['risk'])
y = fraud_data_encoded['risk']


In [7]:
# Step 2: Model Training
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Initialize and train a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [9]:
# Step 3: Model Evaluation
# Make predictions on the test data
y_pred = rf_model.predict(X_test)

In [10]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7333333333333333


In [11]:
# Generate confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[88  6]
 [26  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.94      0.85        94
           1       0.00      0.00      0.00        26

    accuracy                           0.73       120
   macro avg       0.39      0.47      0.42       120
weighted avg       0.60      0.73      0.66       120

