In [10]:
import pandas as pd
import xgboost as xg 
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [27]:
data = pd.read_csv('clean_cs-training.csv')

data

Unnamed: 0,gender,marital status,no_of_dependent,type of residence,educational_attainment,employment_status,sector_of_employment,current_employer,monthly net income,work_start_date,...,loan_requests,failed_loan_requests,successful_loan_requests,loans,lenders,first_account,last_account,phone_numbers,created_on,status_id
0,Male,Single,1,Rented,MSc,Employed,Others,Hancock-Bradshaw,5411,2019-07-30,...,1,2,7,3,19,2014-08-01,2023-07-05,2,2022-01-30,78
1,Male,Married,1,Rented,PhD,Employed,Banking,"Gutierrez, Blair and Robinson",8508,2017-07-06,...,8,1,2,3,2,2018-12-20,2023-09-27,1,2020-07-09,52
2,Male,Married,3,Rented,PhD,Unemployed,Other Financial,Ingram-Williams,1664,2015-04-02,...,9,2,7,0,6,2021-12-04,2023-08-10,1,2023-01-20,41
3,Male,Single,2,Rented,MSc,Unemployed,Wholesale and Retail Trade,Perry Group,3350,2014-12-21,...,1,4,0,2,8,2017-08-03,2023-05-24,2,2020-04-17,10
4,Male,Married,1,Own House,BSc,Employed,Others,Smith-Gardner,2951,2023-04-30,...,7,0,2,4,19,2021-01-03,2023-10-22,1,2021-04-28,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,Female,Married,0,Rented,PhD,Unemployed,,Rodriguez and Sons,7821,2020-01-27,...,8,3,6,4,7,2018-11-25,2023-11-22,1,2022-06-23,64
1028,Male,Divorced,1,With Parents,BSc,Unemployed,,"Wilson, Jackson and Ryan",3004,2024-02-15,...,6,2,3,3,3,2014-10-19,2024-01-22,1,2023-08-17,65
1029,Female,Divorced,2,Rented,MSc,Self-employed,,Wilson Inc,5967,2023-11-14,...,0,4,0,1,6,2016-10-27,2022-07-23,2,2022-06-19,91
1030,Female,Single,4,Own House,PhD,Unemployed,,Farmer-Sutton,6294,2023-05-09,...,0,2,6,0,13,2023-02-24,2023-08-24,2,2023-10-29,82


In [28]:
info = data.info()
summary = data.columns

print(info)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   gender                    1032 non-null   object
 1   marital status            1032 non-null   object
 2   no_of_dependent           1032 non-null   int64 
 3   type of residence         1032 non-null   object
 4   educational_attainment    1032 non-null   object
 5   employment_status         1032 non-null   object
 6   sector_of_employment      32 non-null     object
 7   current_employer          1032 non-null   object
 8   monthly net income        1032 non-null   int64 
 9   work_start_date           1032 non-null   object
 10  work_email_validated      1032 non-null   bool  
 11  country                   1032 non-null   object
 12  city                      1032 non-null   object
 13  requested_amount          1032 non-null   int64 
 14  purpose                 

In [29]:
# columns_to_drop = ['sector_of_employment','org_id', 'user_id', 'loan id', 'photo url', 'work_email', 'bank', 'os_version', 'mobile_os', 'device_name', 'logins', 'LGA', 'phone_network', 'lending_lenders', 'emails', 'bank']

drop = ['sector_of_employment']

data = data.drop(columns=drop)
data.shape

(1032, 30)

In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   gender                    1032 non-null   object
 1   marital status            1032 non-null   object
 2   no_of_dependent           1032 non-null   int64 
 3   type of residence         1032 non-null   object
 4   educational_attainment    1032 non-null   object
 5   employment_status         1032 non-null   object
 6   sector_of_employment      32 non-null     object
 7   current_employer          1032 non-null   object
 8   monthly net income        1032 non-null   int64 
 9   work_start_date           1032 non-null   object
 10  work_email_validated      1032 non-null   bool  
 11  country                   1032 non-null   object
 12  city                      1032 non-null   object
 13  requested_amount          1032 non-null   int64 
 14  purpose                 

In [30]:
# data.to_csv('clean_cs-training.csv', index=False)

In [48]:
format_list = ['%Y-%m-%d']

date_columns = ['work_start_date', 'proposed_payday', 'loan_request_day', 'first_account', 'last_account', 'created_on']

for col in data[date_columns]: 
    for fmt in format_list:
        try:
            data[col] = pd.to_datetime(data[col]).dt.year
            break 
        except ValueError:
            pass 

In [51]:
# categorical_columns = ['gender', 'country', 'city', 'language', 'selfie_bvn_check', 'selfie_id_check', 'type of residence', 'marital status', 'educational_attainment', 'employment_status','sector_of_employment', 'current_employer', ]

le = LabelEncoder()

for column in data.columns:
    if data[column].dtype == 'object':
        data[column] = le.fit_transform(data[column])
        
data['selfie_bvn_check'] = data['selfie_bvn_check'].astype(int)
data['selfie_id_check'] = data['selfie_id_check'].astype(int)
data['work_email_validated'] = data['work_email_validated'].astype(int)


In [52]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   gender                    1032 non-null   int64
 1   marital status            1032 non-null   int64
 2   no_of_dependent           1032 non-null   int64
 3   type of residence         1032 non-null   int64
 4   educational_attainment    1032 non-null   int64
 5   employment_status         1032 non-null   int64
 6   current_employer          1032 non-null   int64
 7   monthly net income        1032 non-null   int64
 8   work_start_date           1032 non-null   int32
 9   work_email_validated      1032 non-null   int64
 10  country                   1032 non-null   int64
 11  city                      1032 non-null   int64
 12  requested_amount          1032 non-null   int64
 13  purpose                   1032 non-null   int64
 14  proposed_payday           1032 non-null 

In [53]:
x = data.drop('status_id', axis=1)
y = data['status_id']



In [54]:
xg_model = xg.XGBRegressor(
    objective='reg:squarederror',
    eval_method='rmse',
    n_estimators=1000,
    max_depth=12,
    learning_rate=0.01,
    colsample_bytree=0.5
)

xg_model.fit(x, y)

Parameters: { "eval_method" } are not used.



In [56]:
booster = xg_model.get_booster()

feature_importance = booster.get_score(importance_type='gain')

feature_importance

{'gender': 52.34090042114258,
 'marital status': 65.08482360839844,
 'no_of_dependent': 90.91046142578125,
 'type of residence': 78.7470474243164,
 'educational_attainment': 86.10330200195312,
 'employment_status': 87.21261596679688,
 'current_employer': 143.7548828125,
 'monthly net income': 157.5016326904297,
 'work_start_date': 123.1712417602539,
 'work_email_validated': 107.9237060546875,
 'country': 202.4521026611328,
 'city': 200.348876953125,
 'requested_amount': 210.59812927246094,
 'purpose': 157.66380310058594,
 'loan_request_hour': 180.3955535888672,
 'loan_request_day': 153.43634033203125,
 'age': 183.27256774902344,
 'selfie_bvn_check': 164.5871124267578,
 'selfie_id_check': 157.8338623046875,
 'loan_requests': 184.450439453125,
 'failed_loan_requests': 178.2594451904297,
 'successful_loan_requests': 170.5683135986328,
 'loans': 184.64950561523438,
 'lenders': 222.9583740234375,
 'first_account': 204.12782287597656,
 'last_account': 177.77162170410156,
 'phone_numbers': 18

#### Testing/Accuracy check

In [80]:
test_data = pd.read_csv('new_cs-test.csv')
columns_to_drop = ['sector_of_employment','org_id', 'user_id', 'loan id', 'photo url', 'work_email', 'bank', 'os_version', 'mobile_os', 'device_name', 'logins', 'LGA', 'phone_network', 'lending_lenders', 'emails', 'bank']

test_data.drop(columns=columns_to_drop, inplace=True)

test_data.to_csv('clean_cs-test.csv', index=False)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   gender                    32 non-null     object
 1   marital status            32 non-null     object
 2   no_of_dependent           32 non-null     int64 
 3   type of residence         32 non-null     object
 4   educational_attainment    32 non-null     object
 5   employment_status         32 non-null     object
 6   current_employer          32 non-null     object
 7   monthly net income        32 non-null     object
 8   work_start_date           32 non-null     object
 9   work_email_validated      32 non-null     bool  
 10  country                   32 non-null     object
 11  city                      32 non-null     object
 12  requested_amount          32 non-null     int64 
 13  purpose                   32 non-null     object
 14  proposed_payday           32

In [85]:

categorical_columns = ['gender', 'country', 'city', 'selfie_bvn_check', 'selfie_id_check', 'type of residence', 'marital status', 'educational_attainment', 'employment_status','sector_of_employment', 'current_employer', ]

le = LabelEncoder()

for column in test_data.columns:
    if test_data[column].dtype == 'object':
        test_data[column] = le.fit_transform(test_data[column])
        
test_data['selfie_bvn_check'] = test_data['selfie_bvn_check'].astype(int)
test_data['selfie_id_check'] = test_data['selfie_id_check'].astype(int)
test_data['work_email_validated'] = test_data['work_email_validated'].astype(int)




In [86]:
format_list = ['%Y-%m-%d']

date_columns = ['work_start_date', 'proposed_payday', 'loan_request_day', 'first_account', 'last_account', 'created_on']

for col in test_data[date_columns]: 
    for fmt in format_list:
        try:
            test_data[col] = pd.to_datetime(test_data[col]).dt.year
            break 
        except ValueError:
            pass 
        
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   gender                    32 non-null     int64
 1   marital status            32 non-null     int64
 2   no_of_dependent           32 non-null     int64
 3   type of residence         32 non-null     int64
 4   educational_attainment    32 non-null     int64
 5   employment_status         32 non-null     int64
 6   current_employer          32 non-null     int64
 7   monthly net income        32 non-null     int64
 8   work_start_date           32 non-null     int32
 9   work_email_validated      32 non-null     int64
 10  country                   32 non-null     int64
 11  city                      32 non-null     int64
 12  requested_amount          32 non-null     int64
 13  purpose                   32 non-null     int64
 14  proposed_payday           32 non-null     in

In [89]:
# x_test = test_data.drop(columns=['status_id'], axis=1)
# y_test = test_data['status_id']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,shuffle=True)


model_accuracy = 100 * xg_model.score(x_test, y_test)

print(f'The model has an accuracy of {model_accuracy:.2f}%')

The model has an accuracy of 100.00%
