In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

# Seed NumPy's global random number generator
np.random.seed(42)

# Read the training and test tables from CSV files in the working directory
train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Count missing values per column in each table, then report both counts
train_null_counts = train.isnull().sum()
test_null_counts = test_data.isnull().sum()
print("Train data null values:\n", train_null_counts)
print("Test data null values:\n", test_null_counts)

# Discard fully duplicated training rows, keeping the first occurrence
train = train.drop_duplicates(keep='first')

class OutlierRemoval:
    """Clamp values of a numeric column to its 1.5 * IQR whiskers.

    Despite the name, nothing is dropped: a value outside the whiskers is
    replaced by the nearest whisker.

    Attributes:
        upper_whisker: Q3 + 1.5 * IQR of the column given to the constructor.
        lower_whisker: Q1 - 1.5 * IQR of the column given to the constructor.
    """

    def __init__(self, col):
        """Compute both whiskers from ``col``.

        Args:
            col: Object exposing ``quantile(q)``; called with pandas Series
                in this notebook.
        """
        q1 = col.quantile(0.25)
        q3 = col.quantile(0.75)
        inter_quartile_range = q3 - q1
        self.upper_whisker = q3 + inter_quartile_range * 1.5
        self.lower_whisker = q1 - inter_quartile_range * 1.5

    def remove(self, row):
        """Return ``row`` clamped to ``[lower_whisker, upper_whisker]``.

        Args:
            row: A single scalar value (the method is used with
                ``Series.apply``).

        Returns:
            ``row`` itself when it lies within the whiskers or is missing,
            otherwise the whisker it exceeds.
        """
        # A missing value fails every ordered comparison; without this guard
        # it would fall through to the last branch and be silently replaced by
        # upper_whisker instead of staying missing.
        if pd.isna(row):
            return row
        if row < self.lower_whisker:
            return self.lower_whisker
        if row > self.upper_whisker:
            return self.upper_whisker
        return row

# Numerical feature columns: clipped to IQR whiskers, then standardized below
numerical_columns = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
                     'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']

# Clip outliers in train and test data.
# The whiskers are computed from the training column only and the same bounds
# are applied to the test column, consistent with the scaler and imputer below,
# which are also fitted on training data and only applied to test data.
for column in numerical_columns:
    remover = OutlierRemoval(train[column])
    train[column] = train[column].apply(remover.remove)
    test_data[column] = test_data[column].apply(remover.remove)

# Define mappings for categorical columns
mappings = {
    'MaritalStatus': {'Married': 0, 'Single': 1, 'Divorced': 2},
    'Education': {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2},
    'EmploymentType': {'Full-time': 0, 'Part-time': 1, 'Self-employed': 2, 'Unemployed': 3},
    'HasMortgage': {'Yes': 1, 'No': 0},
    'HasDependents': {'Yes': 1, 'No': 0},
    'LoanPurpose': {'Home': 0, 'Auto': 1, 'Education': 2, 'Business': 3, 'Other': 4},
    'HasCoSigner': {'Yes': 1, 'No': 0}
}

# Apply mappings to train and test data.
# Series.map yields NaN for any category that is not a key of its mapping;
# such gaps are filled by the KNN imputation step further down.
for col, mapping in mappings.items():
    train[col] = train[col].map(mapping)
    test_data[col] = test_data[col].map(mapping)

# Combine MaritalStatus and HasDependents into a new feature.
# NOTE(review): the sum is ambiguous (Married with dependents and Single
# without dependents both give 1) - confirm this encoding is intended.
train['Marital_Dependents'] = train['MaritalStatus'] + train['HasDependents']
test_data['Marital_Dependents'] = test_data['MaritalStatus'] + test_data['HasDependents']

# Keep the test LoanIDs for the submission file before the column is dropped;
# the row index is not a substitute for the real identifiers.
test_loan_ids = test_data['LoanID']

# Drop the LoanID column
train = train.drop(columns=['LoanID'])
test_data = test_data.drop(columns=['LoanID'])

# Separate target variable
X = train.drop(columns=['Default'])
y = train['Default']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical columns (statistics come from the training split only)
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_val[numerical_columns] = scaler.transform(X_val[numerical_columns])
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

# Apply KNN imputation (fitted on the training split only).
# Rebuilding the frames from arrays resets their row index to 0..n-1.
knn_imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(knn_imputer.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(knn_imputer.transform(X_val), columns=X_val.columns)
test_data = pd.DataFrame(knn_imputer.transform(test_data), columns=test_data.columns)

# Define the SVM model
svm = SVC()

# Reduce the hyperparameter grid size
param_dist = {
    'C': [0.1, 1],  # Fewer values for C
    'kernel': ['linear', 'rbf'],  # Fewer kernels
    'gamma': ['scale'],  # Use only 'scale' for simplicity
}

# Randomized search with cross-validation
random_search = RandomizedSearchCV(estimator=svm, param_distributions=param_dist, n_iter=5, cv=2,
                                   scoring='accuracy', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation accuracy:", random_search.best_score_)

# The search refits the best configuration on all of X_train (refit=True is the
# default), so best_estimator_ is ready to use without another fit call.
best_model = random_search.best_estimator_

# Evaluate on validation data
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# Make predictions on the test set
test_predictions = best_model.predict(test_data)

# Prepare the sample submission file, keyed by the real LoanIDs
submission_path = 'sample_submission_svm.csv'
sample_submission = pd.DataFrame({'LoanID': test_loan_ids, 'Default': test_predictions})
sample_submission.to_csv(submission_path, index=False)
# Report the name of the file that was actually written
print(f"Submission file created: {submission_path}")


Train data null values:
 LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64
Test data null values:
 LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
dtype: int64




Best parameters: {'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}
Best cross-validation accuracy: 0.8839194473127057
Validation Accuracy: 0.88
Submission file created: sample_submission.csv


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV

# Seed NumPy's global random number generator
np.random.seed(42)

# Read the training and test tables from CSV files in the working directory
train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Count missing values per column in each table, then report both counts
train_null_counts = train.isnull().sum()
test_null_counts = test_data.isnull().sum()
print("Train data null values:\n", train_null_counts)
print("Test data null values:\n", test_null_counts)

# Discard fully duplicated training rows, keeping the first occurrence
train = train.drop_duplicates(keep='first')

class OutlierRemoval:
    """Clamp values of a numeric column to its 1.5 * IQR whiskers.

    Despite the name, nothing is dropped: a value outside the whiskers is
    replaced by the nearest whisker.

    Attributes:
        upper_whisker: Q3 + 1.5 * IQR of the column given to the constructor.
        lower_whisker: Q1 - 1.5 * IQR of the column given to the constructor.
    """

    def __init__(self, col):
        """Compute both whiskers from ``col``.

        Args:
            col: Object exposing ``quantile(q)``; called with pandas Series
                in this notebook.
        """
        q1 = col.quantile(0.25)
        q3 = col.quantile(0.75)
        inter_quartile_range = q3 - q1
        self.upper_whisker = q3 + inter_quartile_range * 1.5
        self.lower_whisker = q1 - inter_quartile_range * 1.5

    def remove(self, row):
        """Return ``row`` clamped to ``[lower_whisker, upper_whisker]``.

        Args:
            row: A single scalar value (the method is used with
                ``Series.apply``).

        Returns:
            ``row`` itself when it lies within the whiskers or is missing,
            otherwise the whisker it exceeds.
        """
        # A missing value fails every ordered comparison; without this guard
        # it would fall through to the last branch and be silently replaced by
        # upper_whisker instead of staying missing.
        if pd.isna(row):
            return row
        if row < self.lower_whisker:
            return self.lower_whisker
        if row > self.upper_whisker:
            return self.upper_whisker
        return row

# Numerical feature columns: clipped to IQR whiskers, then standardized below
numerical_columns = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
                     'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']

# Clip outliers in train and test data.
# The whiskers are computed from the training column only and the same bounds
# are applied to the test column, consistent with the scaler and imputer below,
# which are also fitted on training data and only applied to test data.
for column in numerical_columns:
    remover = OutlierRemoval(train[column])
    train[column] = train[column].apply(remover.remove)
    test_data[column] = test_data[column].apply(remover.remove)

# Define mappings for categorical columns
mappings = {
    'MaritalStatus': {'Married': 0, 'Single': 1, 'Divorced': 2},
    'Education': {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2},
    'EmploymentType': {'Full-time': 0, 'Part-time': 1, 'Self-employed': 2, 'Unemployed': 3},
    'HasMortgage': {'Yes': 1, 'No': 0},
    'HasDependents': {'Yes': 1, 'No': 0},
    'LoanPurpose': {'Home': 0, 'Auto': 1, 'Education': 2, 'Business': 3, 'Other': 4},
    'HasCoSigner': {'Yes': 1, 'No': 0}
}

# Apply mappings to train and test data.
# Series.map yields NaN for any category that is not a key of its mapping;
# such gaps are filled by the KNN imputation step further down.
for col, mapping in mappings.items():
    train[col] = train[col].map(mapping)
    test_data[col] = test_data[col].map(mapping)

# Combine MaritalStatus and HasDependents into a new feature.
# NOTE(review): the sum is ambiguous (Married with dependents and Single
# without dependents both give 1) - confirm this encoding is intended.
train['Marital_Dependents'] = train['MaritalStatus'] + train['HasDependents']
test_data['Marital_Dependents'] = test_data['MaritalStatus'] + test_data['HasDependents']

# Preserve the LoanID column from the original test data
test_loan_ids = test_data['LoanID']

# Drop the LoanID column
train = train.drop(columns=['LoanID'])
test_data = test_data.drop(columns=['LoanID'])

# Separate target variable
X = train.drop(columns=['Default'])
y = train['Default']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical columns (statistics come from the training split only)
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_val[numerical_columns] = scaler.transform(X_val[numerical_columns])
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

# Apply KNN imputation (fitted on the training split only).
# Rebuilding the frames from arrays resets their row index to 0..n-1.
knn_imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(knn_imputer.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(knn_imputer.transform(X_val), columns=X_val.columns)
test_data = pd.DataFrame(knn_imputer.transform(test_data), columns=test_data.columns)

# Define the SVM model
svm = SVC()

# Define reduced hyperparameters for tuning
param_grid = {
    'C': [0.1, 1],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Grid search with cross-validation
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# The search refits the best configuration on all of X_train (refit=True is the
# default), so best_estimator_ is ready to use without another fit call.
best_model = grid_search.best_estimator_

# Evaluate on validation data
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# Make predictions on the test set
test_predictions = best_model.predict(test_data)

# Prepare the sample submission file
sample_submission = pd.DataFrame({'LoanID': test_loan_ids, 'Default': test_predictions})
sample_submission.to_csv('sample_submission.csv', index=False)
print("Submission file created: sample_submission.csv")

Train data null values:
 LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64
Test data null values:
 LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
dtype: int64
Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation accuracy: 0.8839194473458035
Validation Accuracy: 0.88
Submission file created: sample_submission.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV

# Seed NumPy's global random number generator
np.random.seed(42)

# Read the training and test tables from CSV files in the working directory
train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Count missing values per column in each table, then report both counts
train_null_counts = train.isnull().sum()
test_null_counts = test_data.isnull().sum()
print("Train data null values:\n", train_null_counts)
print("Test data null values:\n", test_null_counts)

# Discard fully duplicated training rows, keeping the first occurrence
train = train.drop_duplicates(keep='first')

class OutlierRemoval:
    """Clamp values of a numeric column to its 1.5 * IQR whiskers.

    Despite the name, nothing is dropped: a value outside the whiskers is
    replaced by the nearest whisker.

    Attributes:
        upper_whisker: Q3 + 1.5 * IQR of the column given to the constructor.
        lower_whisker: Q1 - 1.5 * IQR of the column given to the constructor.
    """

    def __init__(self, col):
        """Compute both whiskers from ``col``.

        Args:
            col: Object exposing ``quantile(q)``; called with pandas Series
                in this notebook.
        """
        q1 = col.quantile(0.25)
        q3 = col.quantile(0.75)
        inter_quartile_range = q3 - q1
        self.upper_whisker = q3 + inter_quartile_range * 1.5
        self.lower_whisker = q1 - inter_quartile_range * 1.5

    def remove(self, row):
        """Return ``row`` clamped to ``[lower_whisker, upper_whisker]``.

        Args:
            row: A single scalar value (the method is used with
                ``Series.apply``).

        Returns:
            ``row`` itself when it lies within the whiskers or is missing,
            otherwise the whisker it exceeds.
        """
        # A missing value fails every ordered comparison; without this guard
        # it would fall through to the last branch and be silently replaced by
        # upper_whisker instead of staying missing.
        if pd.isna(row):
            return row
        if row < self.lower_whisker:
            return self.lower_whisker
        if row > self.upper_whisker:
            return self.upper_whisker
        return row

# Numerical feature columns: clipped to IQR whiskers, then standardized below
numerical_columns = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
                     'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']

# Clip outliers in train and test data.
# The whiskers are computed from the training column only and the same bounds
# are applied to the test column, consistent with the scaler and imputer below,
# which are also fitted on training data and only applied to test data.
for column in numerical_columns:
    remover = OutlierRemoval(train[column])
    train[column] = train[column].apply(remover.remove)
    test_data[column] = test_data[column].apply(remover.remove)

# Define mappings for categorical columns
mappings = {
    'MaritalStatus': {'Married': 0, 'Single': 1, 'Divorced': 2},
    'Education': {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2},
    'EmploymentType': {'Full-time': 0, 'Part-time': 1, 'Self-employed': 2, 'Unemployed': 3},
    'HasMortgage': {'Yes': 1, 'No': 0},
    'HasDependents': {'Yes': 1, 'No': 0},
    'LoanPurpose': {'Home': 0, 'Auto': 1, 'Education': 2, 'Business': 3, 'Other': 4},
    'HasCoSigner': {'Yes': 1, 'No': 0}
}

# Apply mappings to train and test data.
# Series.map yields NaN for any category that is not a key of its mapping;
# such gaps are filled by the KNN imputation step further down.
for col, mapping in mappings.items():
    train[col] = train[col].map(mapping)
    test_data[col] = test_data[col].map(mapping)

# Combine MaritalStatus and HasDependents into a new feature.
# NOTE(review): the sum is ambiguous (Married with dependents and Single
# without dependents both give 1) - confirm this encoding is intended.
train['Marital_Dependents'] = train['MaritalStatus'] + train['HasDependents']
test_data['Marital_Dependents'] = test_data['MaritalStatus'] + test_data['HasDependents']

# Preserve the LoanID column from the original test data
test_loan_ids = test_data['LoanID']

# Drop the LoanID column
train = train.drop(columns=['LoanID'])
test_data = test_data.drop(columns=['LoanID'])

# Separate target variable
X = train.drop(columns=['Default'])
y = train['Default']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical columns (statistics come from the training split only)
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_val[numerical_columns] = scaler.transform(X_val[numerical_columns])
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

# Apply KNN imputation (fitted on the training split only).
# Rebuilding the frames from arrays resets their row index to 0..n-1.
knn_imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(knn_imputer.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(knn_imputer.transform(X_val), columns=X_val.columns)
test_data = pd.DataFrame(knn_imputer.transform(test_data), columns=test_data.columns)

# Define the SVM model with class weights.
# NOTE(review): the classes are re-weighted here while the search below still
# selects on plain accuracy - confirm accuracy is the intended metric.
svm = SVC(class_weight='balanced')

# Define hyperparameters for tuning
param_grid = {
    'C': [0.1, 1],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Grid search with cross-validation
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# The search refits the best configuration on all of X_train (refit=True is the
# default), so best_estimator_ is ready to use without another fit call.
best_model = grid_search.best_estimator_

# Evaluate on validation data
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# Make predictions on the test set
test_predictions = best_model.predict(test_data)

# Prepare the sample submission file
sample_submission = pd.DataFrame({'LoanID': test_loan_ids, 'Default': test_predictions})
sample_submission.to_csv('sample_submission.csv', index=False)
print("Submission file created: sample_submission.csv")


Train data null values:
 LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64
Test data null values:
 LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
dtype: int64


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV

# Seed NumPy's global random number generator
np.random.seed(42)

# Read the training and test tables from CSV files in the working directory
train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Count missing values per column in each table, then report both counts
train_null_counts = train.isnull().sum()
test_null_counts = test_data.isnull().sum()
print("Train data null values:\n", train_null_counts)
print("Test data null values:\n", test_null_counts)

# Discard fully duplicated training rows, keeping the first occurrence
train = train.drop_duplicates(keep='first')

class OutlierRemoval:
    """Clamp values of a numeric column to its 1.5 * IQR whiskers.

    Despite the name, nothing is dropped: a value outside the whiskers is
    replaced by the nearest whisker.

    Attributes:
        upper_whisker: Q3 + 1.5 * IQR of the column given to the constructor.
        lower_whisker: Q1 - 1.5 * IQR of the column given to the constructor.
    """

    def __init__(self, col):
        """Compute both whiskers from ``col``.

        Args:
            col: Object exposing ``quantile(q)``; called with pandas Series
                in this notebook.
        """
        q1 = col.quantile(0.25)
        q3 = col.quantile(0.75)
        inter_quartile_range = q3 - q1
        self.upper_whisker = q3 + inter_quartile_range * 1.5
        self.lower_whisker = q1 - inter_quartile_range * 1.5

    def remove(self, row):
        """Return ``row`` clamped to ``[lower_whisker, upper_whisker]``.

        Args:
            row: A single scalar value (the method is used with
                ``Series.apply``).

        Returns:
            ``row`` itself when it lies within the whiskers or is missing,
            otherwise the whisker it exceeds.
        """
        # A missing value fails every ordered comparison; without this guard
        # it would fall through to the last branch and be silently replaced by
        # upper_whisker instead of staying missing.
        if pd.isna(row):
            return row
        if row < self.lower_whisker:
            return self.lower_whisker
        if row > self.upper_whisker:
            return self.upper_whisker
        return row

# Numerical feature columns: clipped to IQR whiskers, then standardized below
numerical_columns = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
                     'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']

# Clip outliers in train and test data.
# The whiskers are computed from the training column only and the same bounds
# are applied to the test column, consistent with the scaler and imputer below,
# which are also fitted on training data and only applied to test data.
for column in numerical_columns:
    remover = OutlierRemoval(train[column])
    train[column] = train[column].apply(remover.remove)
    test_data[column] = test_data[column].apply(remover.remove)

# Define mappings for categorical columns
mappings = {
    'MaritalStatus': {'Married': 0, 'Single': 1, 'Divorced': 2},
    'Education': {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2},
    'EmploymentType': {'Full-time': 0, 'Part-time': 1, 'Self-employed': 2, 'Unemployed': 3},
    'HasMortgage': {'Yes': 1, 'No': 0},
    'HasDependents': {'Yes': 1, 'No': 0},
    'LoanPurpose': {'Home': 0, 'Auto': 1, 'Education': 2, 'Business': 3, 'Other': 4},
    'HasCoSigner': {'Yes': 1, 'No': 0}
}

# Apply mappings to train and test data.
# Series.map yields NaN for any category that is not a key of its mapping;
# such gaps are filled by the KNN imputation step further down.
for col, mapping in mappings.items():
    train[col] = train[col].map(mapping)
    test_data[col] = test_data[col].map(mapping)

# Combine MaritalStatus and HasDependents into a new feature.
# NOTE(review): the sum is ambiguous (Married with dependents and Single
# without dependents both give 1) - confirm this encoding is intended.
train['Marital_Dependents'] = train['MaritalStatus'] + train['HasDependents']
test_data['Marital_Dependents'] = test_data['MaritalStatus'] + test_data['HasDependents']

# Preserve the LoanID column from the original test data
test_loan_ids = test_data['LoanID']

# Drop the LoanID column
train = train.drop(columns=['LoanID'])
test_data = test_data.drop(columns=['LoanID'])

# Separate target variable
X = train.drop(columns=['Default'])
y = train['Default']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical columns (statistics come from the training split only)
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_val[numerical_columns] = scaler.transform(X_val[numerical_columns])
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

# Apply KNN imputation (fitted on the training split only).
# Rebuilding the frames from arrays resets their row index to 0..n-1.
knn_imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(knn_imputer.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(knn_imputer.transform(X_val), columns=X_val.columns)
test_data = pd.DataFrame(knn_imputer.transform(test_data), columns=test_data.columns)

# Define the SVM model with class weights.
# NOTE(review): the classes are re-weighted here while the search below still
# selects on plain accuracy - confirm accuracy is the intended metric.
svm = SVC(class_weight='balanced')

# Define hyperparameters for tuning
param_grid = {
    'C': [0.1, 1],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Grid search with cross-validation
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# The search refits the best configuration on all of X_train (refit=True is the
# default), so best_estimator_ is ready to use without another fit call.
best_model = grid_search.best_estimator_

# Evaluate on validation data
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# Make predictions on the test set
test_predictions = best_model.predict(test_data)

# Prepare the sample submission file
sample_submission = pd.DataFrame({'LoanID': test_loan_ids, 'Default': test_predictions})
sample_submission.to_csv('sample_submission.csv', index=False)
print("Submission file created: sample_submission.csv")


Train data null values:
 LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64
Test data null values:
 LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
dtype: int64
Best parameters: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Best cross-validation accuracy: 0.692193781269982
Validation Accuracy: 0.69
Submission file created: sample_submission.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score
from scipy.stats import uniform, randint

# Seed NumPy's global random number generator
np.random.seed(42)

# Read the training and test tables from CSV files in the working directory
train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Count missing values per column in each table, then report both counts
train_null_counts = train.isnull().sum()
test_null_counts = test_data.isnull().sum()
print("Train data null values:\n", train_null_counts)
print("Test data null values:\n", test_null_counts)

# Discard fully duplicated training rows, keeping the first occurrence
train = train.drop_duplicates(keep='first')

# Hold on to the test LoanID column; it is needed when writing the submission
test_loan_ids = test_data['LoanID']

# Define the OutlierRemoval class
class OutlierRemoval:
    """Clamp values of a numeric column to its 1.5 * IQR whiskers.

    Despite the name, nothing is dropped: a value outside the whiskers is
    replaced by the nearest whisker.

    Attributes:
        upper_whisker: Q3 + 1.5 * IQR of the column given to the constructor.
        lower_whisker: Q1 - 1.5 * IQR of the column given to the constructor.
    """

    # The constructor must be spelled with double underscores; as ``_init_``
    # it is an ordinary method, ``OutlierRemoval(col)`` raises TypeError and
    # the whiskers are never set.
    def __init__(self, col):
        """Compute both whiskers from ``col``.

        Args:
            col: Object exposing ``quantile(q)``; called with pandas Series
                in this notebook.
        """
        q1 = col.quantile(0.25)
        q3 = col.quantile(0.75)
        inter_quartile_range = q3 - q1
        self.upper_whisker = q3 + inter_quartile_range * 1.5
        self.lower_whisker = q1 - inter_quartile_range * 1.5

    def remove(self, row):
        """Return ``row`` clamped to ``[lower_whisker, upper_whisker]``.

        Args:
            row: A single scalar value (the method is used with
                ``Series.apply``).

        Returns:
            ``row`` itself when it lies within the whiskers or is missing,
            otherwise the whisker it exceeds.
        """
        # A missing value fails every ordered comparison; without this guard
        # it would fall through to the last branch and be silently replaced by
        # upper_whisker instead of staying missing.
        if pd.isna(row):
            return row
        if row < self.lower_whisker:
            return self.lower_whisker
        if row > self.upper_whisker:
            return self.upper_whisker
        return row

# Identify numerical columns (clipped to IQR whiskers, then standardized below)
numerical_columns = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
                     'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']

# Apply outlier clipping on numerical columns in train and test data.
# The whiskers are computed from the training column only and the same bounds
# are applied to the test column, consistent with the scaler and imputer below,
# which are also fitted on training data and only applied to test data.
for column in numerical_columns:
    remover = OutlierRemoval(train[column])
    train[column] = train[column].apply(remover.remove)
    test_data[column] = test_data[column].apply(remover.remove)

# Define mappings for categorical columns
mappings = {
    'MaritalStatus': {'Married': 0, 'Single': 1, 'Divorced': 2},
    'Education': {'High School': 0, "Bachelor's": 1, 'Master\'s': 2},
    'EmploymentType': {'Full-time': 0, 'Part-time': 1, 'Self-employed': 2, 'Unemployed': 3},
    'HasMortgage': {'Yes': 1, 'No': 0},
    'HasDependents': {'Yes': 1, 'No': 0},
    'LoanPurpose': {'Home': 0, 'Auto': 1, 'Education': 2, 'Business': 3, 'Other': 4},
    'HasCoSigner': {'Yes': 1, 'No': 0}
}

# Apply mappings to train and test data.
# Series.map yields NaN for any category that is not a key of its mapping;
# such gaps are filled by the KNN imputation step further down.
for col, mapping in mappings.items():
    train[col] = train[col].map(mapping)
    test_data[col] = test_data[col].map(mapping)

# Feature engineering: Combine MaritalStatus and HasDependents into a new feature.
# NOTE(review): the sum is ambiguous (Married with dependents and Single
# without dependents both give 1) - confirm this encoding is intended.
train['Marital_Dependents'] = train['MaritalStatus'] + train['HasDependents']
test_data['Marital_Dependents'] = test_data['MaritalStatus'] + test_data['HasDependents']

# Drop the LoanID column from both train and test data (it is not a feature)
train = train.drop(columns=['LoanID'])
test_data = test_data.drop(columns=['LoanID'])

# Separate target variable
X = train.drop(columns=['Default'])
y = train['Default']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical columns (statistics come from the training split only)
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_val[numerical_columns] = scaler.transform(X_val[numerical_columns])
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

# Apply KNN imputation (fitted on the training split only).
# Rebuilding the frames from arrays resets their row index to 0..n-1.
knn_imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(knn_imputer.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(knn_imputer.transform(X_val), columns=X_val.columns)
test_data = pd.DataFrame(knn_imputer.transform(test_data), columns=test_data.columns)

# Define parameter distributions for RandomizedSearchCV.
# scipy's uniform(loc, scale) samples from [loc, loc + scale] and
# randint(low, high) samples integers from low up to high - 1.
param_dist_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': uniform(0.0001, 0.01),
    'learning_rate_init': uniform(0.001, 0.1),
    'max_iter': randint(200, 2000)
}

# Train MLPClassifier with RandomizedSearchCV
mlp = MLPClassifier(random_state=42)
random_search_mlp = RandomizedSearchCV(
    estimator=mlp, param_distributions=param_dist_mlp, n_iter=50, cv=3, n_jobs=-1, scoring='accuracy', random_state=42
)
random_search_mlp.fit(X_train, y_train)

# Print the best parameters for MLPClassifier
print("Best hyperparameters for MLPClassifier: ", random_search_mlp.best_params_)

# Make predictions on the validation set
y_pred_mlp = random_search_mlp.best_estimator_.predict(X_val)

# Evaluate the model
accuracy_mlp = accuracy_score(y_val, y_pred_mlp)
print(f"Validation accuracy for MLPClassifier: {accuracy_mlp:.4f}")

# Make predictions on the test set
test_predictions_mlp = random_search_mlp.best_estimator_.predict(test_data)

# Prepare submission file
sample_submission_mlp = pd.DataFrame({'LoanID': test_loan_ids, 'Default': test_predictions_mlp})
sample_submission_mlp.to_csv('sample_submission_mlp_final.csv', index=False)

print("Submission file 'sample_submission_mlp_final.csv' created successfully.")