In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold, train_test_split

## 1- Loading Data

In [2]:
# Location of the raw CSV, relative to the notebook's working directory
path = "../data/fraud_detection_dataset.csv"

# Read the whole file into the working frame used by every later cell
data = pd.read_csv(filepath_or_buffer=path)

## 2- Inspecting Data

In [None]:
# Print column names, non-null counts and dtypes of the freshly loaded frame
data.info()

In [None]:
# Show the first rows of the freshly loaded frame
data.head()

In [None]:
# Summary statistics of the columns pandas selects by default
data.describe()

## 4- Converting Data Types

In [3]:
# Parse the date-like columns into pandas datetimes, one column at a time
for date_column in ('account_open_date', 'earliest_credit_account', 'recent_trade_activity'):
    data[date_column] = pd.to_datetime(data[date_column])

In [4]:
# Store the low-level descriptive columns with the pandas 'category' dtype
for category_column in ('location', 'occupation'):
    data[category_column] = data[category_column].astype('category')

In [5]:
# Re-encode the boolean flag columns with the 0/1 integer convention
flag_columns = (
    'multiple_applications_short_time_period',
    'watchlist_blacklist_flag',
    'public_records_flag',
    'applications_submitted_during_odd_hours',
    'payment_methods_high_risk',
    'charge_off_status',
)
for flag_column in flag_columns:
    data[flag_column] = data[flag_column].astype(int)

## 3- Handling Missing Values

In [6]:
# Per-column count of missing values, keeping only columns that have any,
# ordered from most to fewest missing entries
missing_summary = (
    data.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
print(missing_summary)

unusual_submission_pattern       910
number_of_delinquent_accounts    700
avg_balance_last_12months        350
fico_score                       210
dtype: int64


### 3.1 - `unusual_submission_pattern`
- `unusual_submission_pattern` has 910 missing values, which is about 13% of the data. Dropping those rows would discard a substantial part of the dataset, so we impute the missing values instead, after analyzing the column's relationship with the other variables.

In [None]:
# 1. Count plots of each binary flag, split by `unusual_submission_pattern`

# String copy of the target used as the hue; astype(str) turns missing entries
# into the literal 'nan', so they show up as their own hue level in the plots
data['unusual_submission_pattern_str'] = data['unusual_submission_pattern'].astype(str)

features = [
    'multiple_applications_short_time_period',
    'watchlist_blacklist_flag',
    'public_records_flag',
    'applications_submitted_during_odd_hours',
    'payment_methods_high_risk',
]

# One small figure per flag
for flag in features:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.countplot(data=data, x=flag, hue='unusual_submission_pattern_str', ax=ax)
    ax.set_title(f'Relationship between {flag} and unusual_submission_pattern')
    plt.show()

In [None]:
# 2. Chi-square test of independence between every flag in `features`
#    and the observed values of `unusual_submission_pattern`.
#
# Only rows with a known target are used. Testing against the stringified helper
# column would turn missing entries into a third 'nan' category and mix
# "is the value missing?" into the association being tested.
observed = data.dropna(subset=['unusual_submission_pattern'])

for feature in features:
    # Contingency table of flag value x target value
    contingency_table = pd.crosstab(observed[feature], observed['unusual_submission_pattern'])
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi-square test for {feature}:\n p-value: {p_value}")

- In the analysis above, the chi-square tests for `multiple_applications_short_time_period`, `watchlist_blacklist_flag`, and `public_records_flag` indicated significant results. KNN imputation was therefore chosen to handle the missing values in `unusual_submission_pattern`, because it can capture complex relationships between these features and the target.

### 3.1.1 Using KNN to impute missing values for `unusual_submission_pattern`

In [8]:
# Class balance of the imputation target and of every predictor flag
features = [
    'multiple_applications_short_time_period',
    'watchlist_blacklist_flag',
    'public_records_flag',
    'applications_submitted_during_odd_hours',
    'payment_methods_high_risk',
]
target = 'unusual_submission_pattern'

# Target: absolute counts and percentage shares
target_counts = data[target].value_counts()
target_percentages = data[target].value_counts(normalize=True) * 100

# Same two summaries for each feature, keyed by feature name
feature_counts = {}
feature_percentages = {}
for name in features:
    feature_counts[name] = data[name].value_counts()
    feature_percentages[name] = data[name].value_counts(normalize=True) * 100

# Report the target first, then the features
print("Target distribution (counts):")
print(target_counts)
print("\nTarget distribution (percentages):")
print(target_percentages)

print("\nFeature distributions (counts):")
for name, counts in feature_counts.items():
    print(f"\n{name} distribution (counts):")
    print(counts)

print("\nFeature distributions (percentages):")
for name, shares in feature_percentages.items():
    print(f"\n{name} distribution (percentages):")
    print(shares)

Target distribution (counts):
unusual_submission_pattern
False    4822
True     1268
Name: count, dtype: int64

Target distribution (percentages):
unusual_submission_pattern
False    79.178982
True     20.821018
Name: proportion, dtype: float64

Feature distributions (counts):

multiple_applications_short_time_period distribution (counts):
multiple_applications_short_time_period
0    5187
1    1813
Name: count, dtype: int64

watchlist_blacklist_flag distribution (counts):
watchlist_blacklist_flag
0    6120
1     880
Name: count, dtype: int64

public_records_flag distribution (counts):
public_records_flag
0    5587
1    1413
Name: count, dtype: int64

applications_submitted_during_odd_hours distribution (counts):
applications_submitted_during_odd_hours
0    4800
1    2200
Name: count, dtype: int64

payment_methods_high_risk distribution (counts):
payment_methods_high_risk
0    5343
1    1657
Name: count, dtype: int64

Feature distributions (percentages):

multiple_applications_short_tim

In [None]:
# Choose the number of neighbours (K) for KNN imputation of `unusual_submission_pattern`.
# Known target values of a hold-out set are hidden, imputed from the training rows,
# and compared with the true values using the mean absolute error (MAE).
features = ['multiple_applications_short_time_period',
            'watchlist_blacklist_flag',
            'public_records_flag',
            'applications_submitted_during_odd_hours',
            'payment_methods_high_risk']
target = 'unusual_submission_pattern'
columns = features + [target]

# Rows with a known target, with the target encoded as 1.0 / 0.0.
# The encoding is applied to a copy: `data` is left untouched so that re-running this
# cell, or encoding the column again in a later cell, cannot corrupt the column.
# astype(float) turns True/False into 1.0/0.0, keeps NaN, and is a no-op on numeric input.
complete_data = (
    data.assign(**{target: data[target].astype(float)})
    .dropna(subset=[target])
)

# Hold out 30% of the known rows for scoring
train_data, valid_data = train_test_split(complete_data, test_size=0.3, random_state=42)

# Validation copy with the target hidden. Without this step the imputer has nothing
# to fill in, the "imputed" target equals the original and the MAE is 0 for every K.
valid_masked = valid_data[columns].copy()
valid_masked[target] = np.nan

# MAE obtained for each candidate K
k_values = range(1, 21)
mae_scores = []

for k in k_values:
    # Fit on the training rows only, then impute the hidden validation targets
    knn_imputer = KNNImputer(n_neighbors=k)
    knn_imputer.fit(train_data[columns])
    valid_imputed = knn_imputer.transform(valid_masked)

    # Back to a DataFrame so the target column can be selected by name
    valid_imputed_df = pd.DataFrame(valid_imputed, columns=columns, index=valid_masked.index)

    # MAE between the true and the imputed target values of the validation rows
    mae = mean_absolute_error(valid_data[target], valid_imputed_df[target])
    mae_scores.append(mae)

# Best K is the first one reaching the lowest MAE
best_k = k_values[mae_scores.index(min(mae_scores))]
print(f"Best K value: {best_k}")


In [None]:
# Cross-validated check of K over a narrower range (2 to 10), scored by accuracy.
# Relies on `features`, `target` and `complete_data` from the previous cell.

# NOTE(review): `kf` was never defined anywhere in the notebook. Five shuffled folds
# with a fixed seed is an assumed configuration - adjust if another split was intended.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

k_values = range(2, 11)  # You can experiment with other ranges
accuracy_scores = {}
columns = features + [target]

for k in k_values:
    knn_imputer = KNNImputer(n_neighbors=k)
    fold_accuracies = []

    for train_index, valid_index in kf.split(complete_data):
        train_data, valid_data = complete_data.iloc[train_index], complete_data.iloc[valid_index]

        # Hide the known validation targets; otherwise nothing is imputed and the
        # accuracy is trivially 1.0 for every K
        valid_masked = valid_data[columns].copy()
        valid_masked[target] = np.nan

        # Fit on the training fold only, then impute the hidden validation targets
        knn_imputer.fit(train_data[columns])
        valid_imputed = knn_imputer.transform(valid_masked)

        # Back to a DataFrame so the target column can be selected by name
        valid_imputed_df = pd.DataFrame(valid_imputed, columns=columns, index=valid_masked.index)

        # The imputed value is an average over neighbours; round it to a 0/1 label
        # before comparing it with the true label
        fold_accuracy = accuracy_score(
            valid_data[target].astype(int),
            valid_imputed_df[target].round().astype(int),
        )
        fold_accuracies.append(fold_accuracy)

    # Average accuracy across folds for this k
    accuracy_scores[k] = np.mean(fold_accuracies)

# Identify the best k value with highest accuracy
best_k = max(accuracy_scores, key=accuracy_scores.get)
print(f"Best K value by accuracy: {best_k}, Accuracy: {accuracy_scores[best_k]:.4f}")

In [None]:
# Impute the missing values of `unusual_submission_pattern` with KNN, using the
# binary flag columns as predictors.
features = ['multiple_applications_short_time_period',
            'watchlist_blacklist_flag',
            'public_records_flag',
            'applications_submitted_during_odd_hours',
            'payment_methods_high_risk']
target = 'unusual_submission_pattern'

# Encode the target as 1.0 / 0.0 (NaN remains NaN).
# astype(float) is used instead of .map({True: 1, False: 0}) because it is idempotent:
# if an earlier cell already converted the column to numbers, mapping it a second time
# would match nothing and turn the whole column into NaN.
data[target] = data[target].astype(float)

# Keep only the predictor columns and the target (all rows are kept)
data_for_imputation = data[features + [target]]

# Initialize the KNN Imputer with k=5
knn_imputer = KNNImputer(n_neighbors=5)

# Apply the KNN imputation
data_imputed = knn_imputer.fit_transform(data_for_imputation)

# Convert the imputed numpy array back to a DataFrame that shares the index of `data`,
# so the assignment below aligns row by row
data_imputed_df = pd.DataFrame(data_imputed, columns=features + [target], index=data.index)

# Imputed entries are averages over neighbours and can be fractional; round them so
# the column stays a 0/1 flag like the other flag columns. Observed values are unchanged.
data[target] = data_imputed_df[target].round().astype(int)

# Summary of missing values in each column
missing_summary = data.isnull().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)
print(missing_summary)

In [None]:
# Distribution of the imputation target after imputation.
# seaborn / matplotlib are already imported in the first cell, so they are not re-imported here.
fig, ax = plt.subplots()
sns.countplot(x='unusual_submission_pattern', data=data, ax=ax)
ax.set_title('Distribution of Target Variable')
plt.show()

# Check for imbalance in the features as well: one figure per feature
for feature in features:
    fig, ax = plt.subplots()
    sns.countplot(x=feature, data=data, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()

In [None]:
# Show the first rows again, now that the imputation cell has run
data.head()

In [None]:
# Print dtypes and non-null counts again to review the frame after the steps above
data.info()