In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from deap import base, creator, tools, algorithms
application_df = pd.read_csv('data/application_record.csv')
credit_record_df = pd.read_csv('data/credit_record.csv')

In [2]:
print(application_df.isnull().sum())
print(credit_record_df.isnull().sum())

# Drop Duplicates
application_df = application_df.drop_duplicates()
credit_record_df = credit_record_df.drop_duplicates()

# Drop Unnecessary Columns due to lack of impact
if 'FLAG_MOBIL' in application_df.columns:
    # Drop Unnecessary Columns due to lack of impact
    application_df = application_df.drop(columns=['FLAG_MOBIL'])
else:
    print("FLAG_MOBIL has been already dropped")

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134203
CNT_FAM_MEMBERS             0
dtype: int64
ID                0
MONTHS_BALANCE    0
STATUS            0
dtype: int64


In [3]:
merged_df = pd.merge(application_df, credit_record_df, on='ID', how='inner')


In [4]:
# Label Encoding
encoder = LabelEncoder()
for col in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE']:
    merged_df[col] = encoder.fit_transform(merged_df[col])

In [5]:
scaler = StandardScaler()
numerical_cols = ['AMT_INCOME_TOTAL', 'AGE', 'YEARS_EMPLOYED']


In [6]:
# Convert DAYS_BIRTH to AGE (in years)
merged_df['AGE'] = -merged_df['DAYS_BIRTH'] // 365

# Convert DAYS_EMPLOYED to YEARS_EMPLOYED (in years)
# Handling anomalies in DAYS_EMPLOYED (e.g., 365243 used as a placeholder for missing values)
merged_df['DAYS_EMPLOYED'] = merged_df['DAYS_EMPLOYED'].replace(365243, np.nan)
merged_df['YEARS_EMPLOYED'] = -merged_df['DAYS_EMPLOYED'] // 365

# Drop the original DAYS_BIRTH and DAYS_EMPLOYED columns if not needed
merged_df.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

In [24]:
print(merged_df['STATUS'].value_counts())
categorical_columns = ['CODE_GENDER', 'FLAG_OWN_CAR',
                     'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE',
                     'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
                     'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']

label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    merged_df[col] = le.fit_transform(merged_df[col])
    label_encoders[col] = le
merged_df.to_csv('cleaned.csv', index=False)

STATUS
C    329536
0    290654
X    145950
1      8747
5      1527
2       801
3       286
4       214
Name: count, dtype: int64


In [17]:
# # Step 3: Data Splitting
# X = merged_df.drop(columns=['STATUS'])
# y = merged_df['STATUS']
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
#


In [25]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import random

data = pd.read_csv("cleaned.csv")


#STANDARD GENETIC ALGORITHM
# Load the dataset

# Assuming the last column is the target variable
X = data.drop('STATUS', axis=1)  # Features
y = data['STATUS']               # Target

# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Parameters for Genetic Algorithm
population_size = 20
num_generations = 50
mutation_rate = 0.1
crossover_rate = 0.8

# Fitness function: Validation accuracy of Decision Tree
def fitness_function(chromosome):
    selected_features = [bool(bit) for bit in chromosome]
    X_train_selected = X_train.iloc[:, selected_features]
    X_val_selected = X_val.iloc[:, selected_features]

    if X_train_selected.shape[1] == 0:  # Avoid empty feature sets
        return 0

    model = DecisionTreeClassifier()
    model.fit(X_train_selected, y_train)
    predictions = model.predict(X_val_selected)
    return accuracy_score(y_val, predictions)

# Initialize population
population = [np.random.randint(2, size=X.shape[1]).tolist() for _ in range(population_size)]

# Genetic Algorithm main loop
for generation in range(num_generations):
    # Calculate fitness for each individual
    fitness_scores = [fitness_function(chromosome) for chromosome in population]

    # Select parents based on fitness (roulette wheel selection)
    fitness_sum = sum(fitness_scores)
    if fitness_sum == 0:
        continue
    probabilities = [score / fitness_sum for score in fitness_scores]
    parents = random.choices(population, probabilities, k=population_size)

    # Create next generation
    next_generation = []
    for i in range(0, population_size, 2):
        parent1, parent2 = parents[i], parents[i + 1]

        # Crossover
        if random.random() < crossover_rate:
            crossover_point = random.randint(1, len(parent1) - 1)
            child1 = parent1[:crossover_point] + parent2[crossover_point:]
            child2 = parent2[:crossover_point] + parent1[crossover_point:]
        else:
            child1, child2 = parent1, parent2

        # Mutation
        child1 = [bit if random.random() > mutation_rate else 1 - bit for bit in child1]
        child2 = [bit if random.random() > mutation_rate else 1 - bit for bit in child2]

        next_generation.extend([child1, child2])

    # Elitism: Carry forward the best individual
    best_individual = population[np.argmax(fitness_scores)]
    next_generation[random.randint(0, population_size - 1)] = best_individual

    population = next_generation

# Get the best chromosome from the final population
fitness_scores = [fitness_function(chromosome) for chromosome in population]
best_chromosome = population[np.argmax(fitness_scores)]
selected_features = [i for i, bit in enumerate(best_chromosome) if bit == 1]

print("Selected Features (indices):", selected_features)
print("Selected Features (names):", X.columns[selected_features].tolist())

# Evaluate the performance on the test set
X_train_selected = X_train.iloc[:, selected_features]
X_test_selected = X_test.iloc[:, selected_features]

final_model = DecisionTreeClassifier()
final_model.fit(X_train_selected, y_train)
test_predictions = final_model.predict(X_test_selected)

test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy with Selected Features:", test_accuracy)

Selected Features (indices): [0, 1, 2, 3, 4, 5, 6, 13, 14, 15, 16, 17]
Selected Features (names): ['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'MONTHS_BALANCE', 'AGE', 'YEARS_EMPLOYED']
Test Accuracy with Selected Features: 0.8606182173532891
