In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load Dataset

In [None]:
# Read the raw finance dataset (path is relative to the working directory) and preview it.
dataset_path = "Finance Dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

# Data Preprocessing and Cleaning

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Count the missing entries in every column before any imputation is applied.
missing_per_column = df.isnull().sum()
print("Missing values on each column : \n", missing_per_column)

In [None]:
# Number of rows that exactly repeat an earlier row.
duplicate_row_count = df.duplicated().sum()
print("Duplicates values : \n", duplicate_row_count)

In [None]:
# Numeric features; this list is reused by the outlier, statistics and plotting cells.
numeric_columns = [
    "Customer_Age",
    "Transaction_Amount",
    "Balance_After_Transaction",
    "Credit_Score",
    "Income",
]

# Fill each numeric column's gaps with that column's own median.
column_medians = df[numeric_columns].median()
df[numeric_columns] = df[numeric_columns].fillna(column_medians)

In [None]:
# Categorical features (and the target) whose gaps are filled with the most frequent value.
categorical_columns = ["Gender", "Account_Type", "Transaction_Type", "Location", "Merchant_Category", "Card_Type", "Loan_Approved"]
for col in categorical_columns:
    # Assign the filled column back rather than calling fillna(inplace=True) on df[col].
    # The in-place form acts on an intermediate object: pandas flags it as chained
    # assignment, and with Copy-on-Write enabled it leaves df unchanged.
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

In [None]:
# Per-column missing counts after imputation; every entry is expected to be zero.
remaining_missing = df.isnull().sum()
print("Re-checking Missing values after handling :\n", remaining_missing)

In [None]:
# Remove repeated rows from df in place, then confirm that none are left.
df.drop_duplicates(inplace=True)
remaining_duplicates = df.duplicated().sum()
print("Check, there is no duplicates remians :\n", remaining_duplicates)

In [None]:
# One box plot per numeric feature, side by side, to show where the outliers sit.
fig, axes = plt.subplots(1, len(numeric_columns), figsize=(15, 6), squeeze=False)
for ax, col in zip(axes[0], numeric_columns):
    sns.boxplot(y=df[col], color='yellow', ax=ax)
    ax.set_title(f"{col} - Outliers")

# Extra padding keeps the narrow panels' titles from colliding.
fig.tight_layout(pad=3.0)
plt.show()

In [None]:
# Cap every numeric feature at the 1.5 * IQR fences: values below the lower fence
# are raised to it and values above the upper fence are lowered to it.
for col in numeric_columns:
    first_quartile, third_quartile = (df[col].quantile(q) for q in (0.25, 0.75))
    interquartile_range = third_quartile - first_quartile

    floor_value = first_quartile - 1.5 * interquartile_range
    ceiling_value = third_quartile + 1.5 * interquartile_range

    capped = np.where(df[col] < floor_value, floor_value, df[col])
    df[col] = np.where(capped > ceiling_value, ceiling_value, capped)

In [None]:
# Same per-feature box plots as before, drawn again so the effect of capping is visible.
panel_count = len(numeric_columns)
plt.figure(figsize=(15, 6))
for position, col in enumerate(numeric_columns, start=1):
    ax = plt.subplot(1, panel_count, position)
    sns.boxplot(y=df[col], color='yellow', ax=ax)
    ax.set_title(f"{col} - Outliers")

plt.tight_layout(pad=3.0)
plt.show()

In [None]:
# Shape statistics of the numeric features: skewness and kurtosis, one value per column.
skewness = df[numeric_columns].skew()
kurtosis = df[numeric_columns].kurt()

for heading, statistics in (
    ("Skewness for each column:", skewness),
    ("\nKurtosis for each column:", kurtosis),
):
    print(heading)
    print(statistics)

# Feature Engineering and EDA

In [None]:
# Parse the Date column, split it into calendar parts, then drop the original column.
df['Date'] = pd.to_datetime(df['Date'])

date_parts = df['Date'].dt
for part_name, part_values in (
    ('Year', date_parts.year),
    ('Month', date_parts.month),
    ('Day', date_parts.day),
    ('Weekday', date_parts.weekday),  # Monday is 0, Sunday is 6
):
    df[part_name] = part_values

df.drop(columns=['Date'], inplace=True)

In [None]:
# Map the target labels to integer codes; `le` keeps the fitted label <-> code mapping.
le = LabelEncoder()
le.fit(df['Loan_Approved'])
df['Loan_Approved'] = le.transform(df['Loan_Approved'])

In [None]:
# Bar chart of how many rows fall into each encoded Loan_Approved class.
approval_counts = df['Loan_Approved'].value_counts()

plt.figure(figsize=(6, 4))
approval_counts.plot(kind='bar', color=['yellow', 'red'])
plt.title('Loan Approval Distribution')
plt.xlabel('Loan_Approved')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Pairwise correlation between the numeric features, annotated to two decimals.
correlation_matrix = df[numeric_columns].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap of Numerical Features')
plt.show()

In [None]:
# 20-bin histogram of every numeric feature, laid out on a 2 x 3 grid.
plt.figure(figsize=(12, 10))
for panel_index, col in enumerate(numeric_columns, start=1):
    ax = plt.subplot(2, 3, panel_index)
    ax.hist(df[col], bins=20, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Box plot of each numeric feature, split by the encoded Loan_Approved class.
plt.figure(figsize=(12, 8))
for panel_index, col in enumerate(numeric_columns, start=1):
    ax = plt.subplot(2, 3, panel_index)
    sns.boxplot(data=df, x='Loan_Approved', y=col, palette='Set2', ax=ax)
    ax.set_title(f'{col} vs Loan_Approved')
    ax.set_xlabel('Loan Approved')
    ax.set_ylabel(col)
plt.tight_layout()
plt.show()

In [None]:
# Grouped bar charts: row counts per category level, split by Loan_Approved class.
# Note: this rebinds categorical_columns to the subset that is plotted here.
categorical_columns = ['Gender', 'Account_Type', 'Merchant_Category']

plt.figure(figsize=(12, 10))
for panel_index, col in enumerate(categorical_columns, start=1):
    ax = plt.subplot(2, 3, panel_index)

    # A missing column is reported and its panel is left empty.
    if col not in df.columns:
        print(f"Column '{col}' not found in the dataset.")
        continue

    # Rows: category level; columns: Loan_Approved class; absent combinations become 0.
    loan_approved_count = (
        df.groupby([col, 'Loan_Approved'])
        .size()
        .unstack()
        .fillna(0)
    )
    loan_approved_count.plot(kind='bar', stacked=False, color=['skyblue', 'salmon'], ax=ax)

    plt.title(f'{col} vs Loan_Approved')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Row counts for every Fraud_Flag / Loan_Approved combination (absent combinations -> 0).
fraud_loan_approved_count = (
    df.groupby(['Fraud_Flag', 'Loan_Approved'])
    .size()
    .unstack()
    .fillna(0)
)

plt.figure(figsize=(6, 4))
fraud_loan_approved_count.plot(kind='bar', stacked=False, color=['magenta', 'cyan'], ax=plt.gca())

plt.title('Fraud Flag vs Loan Approval')
plt.xlabel('Fraud Flag')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()


# Machine Learning Algorithms

In [None]:
# Expand the nominal features into indicator columns; drop_first omits one level
# per feature so the indicators of a feature are not perfectly collinear.
one_hot_columns = [
    'Gender',
    'Account_Type',
    'Transaction_Type',
    'Location',
    'Merchant_Category',
    'Card_Type',
]
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

In [None]:
# Separate the target from the features: y is Loan_Approved, X is every other column.
target_column = 'Loan_Approved'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out the test set BEFORE oversampling. SMOTE creates synthetic minority-class
# rows from existing rows, so resampling the full dataset first would place synthetic
# rows in the test set and let test rows influence the training data (leakage),
# inflating every score reported by the model cells.
# stratify=y keeps the original class ratio in both splits.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Balance the training portion only; the test set keeps its real class distribution.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X_train_raw, y_train_raw)

# Check the class distribution after SMOTE
print("Class distribution after SMOTE:")
print(pd.Series(y_res).value_counts())

# The model cells train on X_train / y_train, so expose the resampled data under those names.
X_train, y_train = X_res, y_res

In [None]:
# Standardise the features. The scaler learns its statistics from the training
# split only and then applies the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic regression, trained and evaluated on the standardised features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report:\n", report)
print("Confusion Matrix:\n", matrix)
print(f"Accuracy (Logistic Regression): {accuracy:.2f}")

In [None]:
# Random forest with 100 trees, trained and evaluated on the unscaled features.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report (Random Forest):\n", report)
print("Confusion Matrix (Random Forest):\n", matrix)
print(f"Accuracy (Random Forest): {accuracy:.2f}")

In [None]:
# 5-nearest-neighbours classifier, trained and evaluated on the standardised features.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report (KNN) : \n", report)
print("Confusion Matrix (KNN) : \n", matrix)
print(f"Accuracy (KNN) : {accuracy : .2f}")

In [None]:
# Depth-limited decision tree (max depth 5), trained and evaluated on the unscaled features.
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report (Decision Tree) : \n", report)
print("Confusion Matrix (Decision Tree) : \n", matrix)
print(f"Accuracy (Decision Tree) : {accuracy : .2f}")

In [None]:
# Gradient-boosted trees (XGBoost), trained and evaluated on the unscaled features.
# NOTE(review): use_label_encoder appears to be deprecated in recent XGBoost
# releases - confirm against the installed version before relying on it.
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report (XGBoost) : \n", report)
print("Confusion Matrix (XGBoost) : \n", matrix)
print(f"Accuracy (XGBoost) : {accuracy : .2f}")

In [None]:
# Model accuracy comparison chart.
#
# The accuracies are measured on the held-out split here instead of being typed in
# by hand, so the chart always matches the current data and preprocessing.
# The estimators are refitted because the individual model cells all reuse the
# name `model`, leaving only the last fitted one available.
#
# Each entry maps display name -> (estimator, whether it uses the scaled features);
# the settings mirror the individual model cells.
comparison_models = {
    "KNN": (KNeighborsClassifier(n_neighbors=5), True),
    "Decision Tree": (DecisionTreeClassifier(max_depth=5, random_state=42), False),
    "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), False),
    "Random Forest": (RandomForestClassifier(n_estimators=100, random_state=42), False),
    "Logistic Regression": (LogisticRegression(max_iter=1000), True),
}

model_names = list(comparison_models)
accuracies = []
for estimator, uses_scaled_features in comparison_models.values():
    train_features = X_train_scaled if uses_scaled_features else X_train
    test_features = X_test_scaled if uses_scaled_features else X_test
    estimator.fit(train_features, y_train)
    accuracies.append(accuracy_score(y_test, estimator.predict(test_features)))

plt.figure(figsize=(8, 5))
plt.bar(model_names, accuracies, color=['blue', 'green', 'red', 'yellow', 'magenta'])
plt.xlabel("Models")
plt.ylabel("Accuracy")
plt.title("Model Accuracy Comparison")
plt.ylim(0, 1)  # accuracy is a fraction, so show the full 0-1 range
plt.show()
