In [1]:
import pandas as pd
import numpy as np
import torch
import sklearn

### Importing data

In [81]:
from sklearn.model_selection import train_test_split

#Read in data
train = pd.read_csv('data/train.csv')
#Remove any rows with missing values
df = train.dropna(axis=0, how='any') # figure out how to bootstrap
#Drop columns
df = df.drop(['ID', 'Customer_ID', 'Month', 'Name', 'SSN','Num_Credit_Inquiries', 'Payment_Behaviour'], axis=1)

#Removing extra underscores from columns
df['Age'] = df['Age'].str.replace('_', '')
df['Annual_Income'] = df['Annual_Income'].str.replace('_', '')
df['Num_of_Loan'] = df['Num_of_Loan'].str.replace('_', '')
df['Num_of_Delayed_Payment'] = df['Num_of_Delayed_Payment'].str.replace('_', '')
df['Outstanding_Debt'] = df['Outstanding_Debt'].str.replace('_', '')
df['Amount_invested_monthly'] = df['Amount_invested_monthly'].str.replace('_', '')
df['Monthly_Balance'] = df['Monthly_Balance'].str.replace('_', '')
df.dropna(axis=0, how='any', inplace=True)


df.drop(df[df['Occupation'] == '_______'].index, inplace=True)
df.drop(df[df['Credit_Mix'] == '_'].index, inplace=True)
df.drop(df[df['Payment_of_Min_Amount'] == 'NM'].index, inplace=True)

print(df)



  train = pd.read_csv('data/train.csv')


      Age Occupation      Annual_Income  Monthly_Inhand_Salary  \
6      23  Scientist           19114.12            1824.843333   
9      28    Teacher           34847.84            3037.986667   
12     28    Teacher           34847.84            3037.986667   
13     28    Teacher           34847.84            3037.986667   
15     28    Teacher           34847.84            3037.986667   
...    ..        ...                ...                    ...   
98279  24     Lawyer  59868.93000000001            5111.077500   
98280  31  Developer           28796.82            2378.735000   
98285  32  Developer           28796.82            2378.735000   
98300  38     Lawyer           41015.55            3152.962500   
98303  38     Lawyer           41015.55            3152.962500   

       Num_Bank_Accounts  Num_Credit_Card  Interest_Rate Num_of_Loan  \
6                      3                4              3           4   
9                      2                4              6       

In [82]:
#Finding unique loan types and making a new column for each
unique_loans = set()

for row in df['Type_of_Loan']:
    vals = row.split(',')
    for loan in vals:
        unique = loan.lstrip(' and').strip()
        unique_loans.add(unique)

unique_loans.remove('Not Specified')

temp = pd.DataFrame(columns=list(unique_loans), data=np.zeros(shape=(len(df), len(unique_loans)), dtype=np.int8))
# print(temp)

# Updating the columns with correct loans
for i, row in enumerate(df['Type_of_Loan']):
    for loan in unique_loans:
        temp.at[i, loan] = row.count(loan)

# print(temp)
df = pd.concat([df.drop('Type_of_Loan', axis=1).reset_index(drop=True), temp.reset_index(drop=True)], axis=1)

print(df)

      Age Occupation      Annual_Income  Monthly_Inhand_Salary  \
0      23  Scientist           19114.12            1824.843333   
1      28    Teacher           34847.84            3037.986667   
2      28    Teacher           34847.84            3037.986667   
3      28    Teacher           34847.84            3037.986667   
4      28    Teacher           34847.84            3037.986667   
...    ..        ...                ...                    ...   
34103  24     Lawyer  59868.93000000001            5111.077500   
34104  31  Developer           28796.82            2378.735000   
34105  32  Developer           28796.82            2378.735000   
34106  38     Lawyer           41015.55            3152.962500   
34107  38     Lawyer           41015.55            3152.962500   

       Num_Bank_Accounts  Num_Credit_Card  Interest_Rate Num_of_Loan  \
0                      3                4              3           4   
1                      2                4              6       

In [83]:
#Changing credit history to be in terms of months
#df.dropna(axis=0, how='any', inplace=True)
temp = pd.DataFrame(columns=['Credit_History_Age'], data=np.zeros(shape=(len(df), 1), dtype=np.int8))
zero_count = 0

for i, row in enumerate(df['Credit_History_Age']):
        age_series = row.split(' and ')
        years = int(age_series[0].rstrip('Years').strip())
        months = int(age_series[1].rstrip('Months').strip())
        temp.at[i, 'Credit_History_Age'] = years*12+months

df = pd.concat([df.drop('Credit_History_Age', axis=1).reset_index(drop=True), temp.reset_index(drop=True)], axis=1)

print(df)

      Age Occupation      Annual_Income  Monthly_Inhand_Salary  \
0      23  Scientist           19114.12            1824.843333   
1      28    Teacher           34847.84            3037.986667   
2      28    Teacher           34847.84            3037.986667   
3      28    Teacher           34847.84            3037.986667   
4      28    Teacher           34847.84            3037.986667   
...    ..        ...                ...                    ...   
34103  24     Lawyer  59868.93000000001            5111.077500   
34104  31  Developer           28796.82            2378.735000   
34105  32  Developer           28796.82            2378.735000   
34106  38     Lawyer           41015.55            3152.962500   
34107  38     Lawyer           41015.55            3152.962500   

       Num_Bank_Accounts  Num_Credit_Card  Interest_Rate Num_of_Loan  \
0                      3                4              3           4   
1                      2                4              6       

In [84]:

#Get columns ready to encode
encoding_columns = ['Occupation', 'Credit_Mix', 'Payment_of_Min_Amount']
data_to_encode = df[encoding_columns]


#Creating a OneHotEncoder   
ohe = sklearn.preprocessing.OneHotEncoder()

ohe.fit(data_to_encode)
encoded = ohe.transform(data_to_encode)

#Get feature names
feature_names = ohe.get_feature_names_out(input_features=encoding_columns)
ohe_df = pd.DataFrame.sparse.from_spmatrix(encoded, columns=feature_names)


encoded_df = pd.DataFrame(encoded.toarray(), columns=feature_names)

final_df = pd.concat([df.drop(encoding_columns, axis=1), encoded_df], axis=1)

print(final_df)
print(final_df.shape)

      Age      Annual_Income  Monthly_Inhand_Salary  Num_Bank_Accounts  \
0      23           19114.12            1824.843333                  3   
1      28           34847.84            3037.986667                  2   
2      28           34847.84            3037.986667                  2   
3      28           34847.84            3037.986667                  2   
4      28           34847.84            3037.986667                  2   
...    ..                ...                    ...                ...   
34103  24  59868.93000000001            5111.077500                  4   
34104  31           28796.82            2378.735000                  6   
34105  32           28796.82            2378.735000                  6   
34106  38           41015.55            3152.962500               1194   
34107  38           41015.55            3152.962500                  0   

       Num_Credit_Card  Interest_Rate Num_of_Loan  Delay_from_due_date  \
0                    4              3

In [85]:
# Change variable types of columns
# df = df.astype({'Age': int, 'Occupation': 'string', 'Annual_Income' : float, 'Num_of_Loan': int, 'Type_of_Loan': 'string', 'Num_of_Delayed_Payment': int, 'Outstanding_Debt': float, 'Credit_Mix': 'string', 'Amount_invested_monthly': float, 'Monthly_Balance': float})

x = final_df.drop(['Credit_Score'], axis=1).values
y = final_df['Credit_Score'].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, train_size=0.7)

### Decision Tree

In [86]:
from sklearn import tree, metrics
import seaborn as sns 
import matplotlib.pyplot as plt

# makes a decision tree using x/y train data
dtc = tree.DecisionTreeClassifier().fit(x_train, y_train)

# use model for pred
y_pred_dtc = dtc.predict(x_test)

# getting statistics
accuracy = metrics.accuracy_score(y_test, y_pred_dtc)
precision = metrics.precision_score(y_test, y_pred_dtc)
recall = metrics.recall_score(y_test, y_pred_dtc)
f1 = metrics.f1_score(y_test, y_pred_dtc)
cm = metrics.confusion_matrix(y_test, y_pred_dtc)

print(f'accuracy : {accuracy}')
print(f'precision : {precision}')
print(f'recall : {recall}')
print(f'f1 : {f1}')
print(cm)





ValueError: could not convert string to float: '_'