
# Import Python libraries.

In [None]:
pip install imbalanced-learn
pip install scikit-learn==0.23.1
pip install --upgrade scikit-learn
pip install delayed

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Disable truncation when DataFrames are displayed: show every column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Import Datasets

In [None]:
# Both CSV files are read from the notebook's working directory.
TRAIN_CSV = r"Training Data.csv"
TEST_CSV = r"Test Data.csv"

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [None]:
# Preview the first rows of the raw test table.
test_data.head()

In [None]:
# Preview the first rows of the raw training table.
train_data.head()

# Cleaning the data

In [None]:
# One-Hot Encoding - Train Data
# Every categorical column is expanded into indicator columns (first level dropped)
# and the indicators are appended to the frame by index. The raw columns stay in place
# for now. 'state' is not encoded (its encoding was disabled in the original cell).
for categorical_col in ('married', 'house_ownership', 'car_ownership', 'profession', 'city'):
    indicator_cols = pd.get_dummies(train_data[categorical_col], drop_first=True)
    train_data = train_data.merge(indicator_cols, left_index=True, right_index=True)

In [None]:
# One-Hot Encoding - Test Data
# Same treatment as the training frame: expand each categorical column into indicator
# columns (first level dropped) and append them by index. 'state' is not encoded
# (its encoding was disabled in the original cell).
for categorical_col in ('married', 'house_ownership', 'car_ownership', 'profession', 'city'):
    indicator_cols = pd.get_dummies(test_data[categorical_col], drop_first=True)
    test_data = test_data.merge(indicator_cols, left_index=True, right_index=True)

In [None]:
# Normalise Numbers - Train Data
#
# Min-max scale the numeric columns and append the scaled copies as the LAST five
# columns of the frame, in the order listed below (a later cell renames the trailing
# five columns by position, so the order matters).
#
# The scaled columns get unique '<name>_scaled' labels. The original cell merged five
# single-column frames that all carried the default label 0; pandas then had to add
# '_x'/'_y' suffixes and, by the fourth merge, produced duplicate labels, which recent
# pandas versions reject with a MergeError.

from sklearn import preprocessing

numeric_cols = ['income', 'age', 'experience', 'current_job_years', 'current_house_years']

# MinMaxScaler scales each feature independently, so one fit over all five columns
# gives the same values as fitting each column separately.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_train = pd.DataFrame(
    min_max_scaler.fit_transform(train_data[numeric_cols].values.astype(float)),
    columns=[f'{col}_scaled' for col in numeric_cols],
    index=train_data.index,
)

train_data = train_data.merge(scaled_train, left_index=True, right_index=True)

In [None]:
# Normalise Numbers - Test Data
#
# The test columns are scaled with the min/max learned from the TRAINING columns.
# The original cell re-fitted the scaler on the test data, which maps train and test
# onto different scales for the same raw value. Scaled test values may therefore fall
# slightly outside [0, 1]; that is expected.
#
# Relies on: `preprocessing` imported in the train-normalisation cell, and the raw
# numeric columns still being present in `train_data` (they are dropped in a later cell).
#
# The scaled columns are appended as the LAST five columns, in the order listed below,
# under unique '<name>_scaled' labels so the merge never has to disambiguate
# clashing column names.
numeric_cols = ['income', 'age', 'experience', 'current_job_years', 'current_house_years']

train_fitted_scaler = preprocessing.MinMaxScaler().fit(
    train_data[numeric_cols].values.astype(float)
)
scaled_test = pd.DataFrame(
    train_fitted_scaler.transform(test_data[numeric_cols].values.astype(float)),
    columns=[f'{col}_scaled' for col in numeric_cols],
    index=test_data.index,
)

test_data = test_data.merge(scaled_test, left_index=True, right_index=True)

In [None]:
# Dropping useless columns - Train Data
# Removed in one call: the row identifier, the raw categorical columns (their indicator
# columns were appended earlier; 'state' is simply discarded) and the raw numeric
# columns (their scaled copies were appended earlier).
train_columns_to_drop = [
    'city', 'Id',
    'married', 'house_ownership', 'car_ownership', 'profession', 'state',
    'income', 'age', 'experience', 'current_job_years', 'current_house_years',
]
train_data = train_data.drop(columns=train_columns_to_drop)

In [None]:
# Dropping useless columns - Test Data
# Removed in one call: the row identifier (lower-case 'id' in this frame), the raw
# categorical columns (their indicator columns were appended earlier; 'state' is simply
# discarded) and the raw numeric columns (their scaled copies were appended earlier).
test_columns_to_drop = [
    'city', 'id',
    'married', 'house_ownership', 'car_ownership', 'profession', 'state',
    'income', 'age', 'experience', 'current_job_years', 'current_house_years',
]
test_data = test_data.drop(columns=test_columns_to_drop)

In [None]:
# Change column names
# The five trailing columns of each frame are the scaled numeric columns (they were
# merged last); relabel them, by position, with the names of the raw columns they replace.
scaled_names = ['income', 'age', 'experience', 'current_job_years', 'current_house_years']
for frame in (train_data, test_data):
    frame.columns = list(frame.columns[:-5]) + scaled_names

In [None]:
# Align the test frame with the training features.
# Columns present in train but absent from test (kept here for inspection).
missing_cols = set(train_data.columns) - set(test_data.columns)

# One reindex does everything the original loop + reorder + drop did:
#   * columns missing from the test set are created and filled with 0,
#   * columns that exist only in the test set are discarded,
#   * the column order matches the training frame,
#   * the target 'risk_flag' is left out.
# Inserting the missing columns one at a time in a Python loop fragments the frame
# when many indicator columns are missing.
feature_columns = train_data.columns.drop('risk_flag')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Separate the target column from the predictor columns.
target_col = 'risk_flag'
X_train = train_data.drop(columns=[target_col])
y_train = train_data[target_col]

In [None]:
# Bar chart of how many training rows fall into each risk_flag class.
# Series.value_counts is used because the top-level pd.value_counts is deprecated in
# recent pandas releases.
count_classes = train_data['risk_flag'].value_counts(sort=True)
ax = count_classes.plot(kind='bar', rot=0)
# The trailing semicolon keeps the notebook from echoing the return value of the call.
ax.set(title="Loan Risk Distribution", xlabel="Class", ylabel="Frequency");

In [None]:
# Training rows split by target value: risk_flag == 1 and risk_flag == 0.
is_default = train_data['risk_flag'] == 1
is_normal = train_data['risk_flag'] == 0

default = train_data[is_default]
normal = train_data[is_normal]

# Training and Testing

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly discard majority-class rows until both classes have the same number of
# samples (sampling_strategy=1 is a 1:1 minority/majority ratio).
# random_state is fixed so the same rows are kept on every run; 42 matches the seed
# already used for the train/test split in this notebook.
undersample = RandomUnderSampler(sampling_strategy=1, random_state=42)
X_under, y_under = undersample.fit_resample(X_train, y_train)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the undersampled rows for evaluation (fixed seed for repeatability).
# NOTE: X_train / y_train are rebound here to the training part of the undersampled data.
split_parts = train_test_split(X_under, y_under, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extremely-randomised-trees classifier (the variable is called `rf`, and later cells
# refer to it by that name, but the estimator is ExtraTreesClassifier).
# random_state is fixed so that re-running the notebook yields the same model; 42
# matches the seed already used for the train/test split.
rf = ExtraTreesClassifier(random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out split; used by the evaluation cells below.
y_pred = rf.predict(X_test)

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC measures how well the model RANKS samples, so it must be computed from a
# continuous score. Feeding it hard 0/1 predictions collapses the ROC curve to a
# single operating point and misreports the metric.
# Use the predicted probability of the positive class (risk_flag == 1); its column is
# looked up in rf.classes_ rather than assumed.
positive_class_idx = list(rf.classes_).index(1)
positive_class_proba = rf.predict_proba(X_test)[:, positive_class_idx]

print(roc_auc_score(y_test, positive_class_proba))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hard predictions on the held-out split
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Predict the class for every row of the aligned test frame.
y_submit = rf.predict(test_data)

In [None]:
# Assemble the submission table: one row per prediction, with ids numbered 1..n in
# prediction order (the ids are generated here, not taken from the test file).
prediction_count = len(y_submit)
y_submit_csv = pd.DataFrame(
    {
        'id': range(1, prediction_count + 1),
        'risk_flag': y_submit,
    }
)

In [None]:
# Eyeball the first 50 submission rows before writing the file.
y_submit_csv.head(50)

In [None]:
# Write the submission to the working directory; index=False keeps the DataFrame
# index out of the file so it contains only the 'id' and 'risk_flag' columns.
y_submit_csv.to_csv('submission.csv',index=False)