# Given data about German loans, detect high-risk loans in the data
We will use a logistic regression model to make our predictions.


In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [20]:
# Load the raw German credit dataset from the notebook's working directory
data = pd.read_csv('./german_credit_data.csv')

In [21]:
# Display the loaded DataFrame
data

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,996,40,male,3,own,little,little,3857,30,car,good
997,997,38,male,2,own,little,,804,12,radio/TV,good
998,998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [23]:
# Summarize column dtypes and non-null counts to spot missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
Unnamed: 0          1000 non-null int64
Age                 1000 non-null int64
Sex                 1000 non-null object
Job                 1000 non-null int64
Housing             1000 non-null object
Saving accounts     817 non-null object
Checking account    606 non-null object
Credit amount       1000 non-null int64
Duration            1000 non-null int64
Purpose             1000 non-null object
Risk                1000 non-null object
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


# Preprocessing

In [45]:
def preprocess_inputs(df):
    """Return a cleaned copy of the raw loan DataFrame.

    Steps performed, in order:
      1. Drop the 'Unnamed: 0' column.
      2. Replace missing values in 'Saving accounts' and 'Checking account'
         with the string 'none'.
      3. Binary-encode the 'Sex' and 'Risk' columns via ``binary_encode``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain the columns 'Unnamed: 0', 'Saving accounts',
        'Checking account', 'Sex' and 'Risk'.

    Returns
    -------
    pandas.DataFrame
        The processed frame. The function works on a copy of ``df``.
    """
    df = df.copy()

    # Drop the duplicate id column
    df = df.drop('Unnamed: 0', axis=1)

    # Encode missing values as their own 'none' category
    for column in ['Saving accounts', 'Checking account']:
        df[column] = df[column].fillna('none')

    # Binary encode the Sex and Risk columns.
    # 'Risk' only takes the values 'good' and 'bad'; 'bad' is used as the
    # positive class because the goal is to detect high-risk loans.
    # NOTE(review): binary_encode is defined outside this cell; presumably it
    # maps each column's positive value to 1 and everything else to 0 - confirm.
    df = binary_encode(
        df,
        columns_with_positive_values=[
            ('Sex', 'male'),
            ('Risk', 'bad')
        ]
    )
    return df

In [38]:
# Run the preprocessing step on the raw data (the result still contains the 'Risk' column)
X = preprocess_inputs(data) 

In [39]:
# Display the preprocessed DataFrame
X

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,none,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,none,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,none,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,none,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [40]:
# Distinct values of every remaining object-dtype column
{
    name: list(X[name].unique())
    for name in X.select_dtypes('object').columns
}

{'Sex': ['male', 'female'],
 'Housing': ['own', 'free', 'rent'],
 'Saving accounts': ['none', 'little', 'quite rich', 'rich', 'moderate'],
 'Checking account': ['little', 'moderate', 'none', 'rich'],
 'Purpose': ['radio/TV',
  'education',
  'furniture/equipment',
  'car',
  'business',
  'domestic appliances',
  'repairs',
  'vacation/others'],
 'Risk': ['good', 'bad']}

In [41]:
# Number of distinct values of every remaining object-dtype column
{
    name: len(X[name].unique())
    for name in X.select_dtypes('object').columns
}

{'Sex': 2,
 'Housing': 3,
 'Saving accounts': 5,
 'Checking account': 4,
 'Purpose': 8,
 'Risk': 2}