In [1]:
from pathlib import Path

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

Info on dataset: [South German Credit Dataset](https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29)

In [2]:
# Location of the credit CSV, relative to the notebook's working directory
file = Path('../Resources/german_credit.csv')

In [3]:
# Load the raw CSV into a DataFrame; kept unmodified so later cells can derive from it
credit_df = pd.read_csv(file)
# Preview the first rows (head() shows five by default) as the cell's rich output
credit_df.head()

Unnamed: 0,laufkont,laufzeit,moral,verw,hoehe,sparkont,beszeit,rate,famges,buerge,...,verm,alter,weitkred,wohn,bishkred,beruf,pers,telef,gastarb,kredit
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,2,1,2,1.0
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,1,1,2,1.0
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,2,1,2,1.0
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,1,1,1,1.0
4,1,10,4,0,2241,1,2,1,3,1,...,1,48,3,1,2,2,1,1,1,1.0


In [4]:
# Count the non-null values in each column; any column whose count is lower
# than the number of rows contains missing values
credit_df.count()

laufkont    1000
laufzeit    1000
moral       1000
verw        1000
hoehe       1000
sparkont    1000
beszeit     1000
rate        1000
famges      1000
buerge      1000
wohnzeit    1000
verm        1000
alter       1000
weitkred    1000
wohn        1000
bishkred    1000
beruf       1000
pers        1000
telef       1000
gastarb     1000
kredit       800
dtype: int64

In [5]:
# Keep only the rows in which every column is populated; the result is a new
# frame, so the raw `credit_df` stays available unchanged
credit_df_clean = credit_df.dropna(axis="index", how="any")
# Re-check the per-column non-null counts: they should now all be equal
credit_df_clean.count()

laufkont    800
laufzeit    800
moral       800
verw        800
hoehe       800
sparkont    800
beszeit     800
rate        800
famges      800
buerge      800
wohnzeit    800
verm        800
alter       800
weitkred    800
wohn        800
bishkred    800
beruf       800
pers        800
telef       800
gastarb     800
kredit      800
dtype: int64

In [6]:
# The `kredit` column is the target; every remaining column is a feature
y = credit_df_clean["kredit"]
X = credit_df_clean.drop(columns=["kredit"])

In [7]:
# Hold out a test set for the final evaluation; the fixed seed makes the
# shuffle, and therefore the split, repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default when no size is given, spelled out
    random_state=42,
)

In [8]:
# Modelling pipeline: standardise features -> reduce with PCA -> logistic regression.
# The step names ("scaler", "pca", "lr") are the prefixes used to address
# each step's parameters from a parameter grid (e.g. "lr__C").
steps = [
    ("scaler", StandardScaler()),
    # A float n_components keeps as many components as are needed to explain
    # that fraction (90%) of the variance
    ("pca", PCA(n_components=0.9)),
    # Fixed seed: the 'sag' solver tried in the grid search shuffles the data,
    # so without random_state the fitted models (and the selected best
    # parameters) can differ from run to run. Same seed as the train/test split.
    ("lr", LogisticRegression(random_state=42)),
]

pipe = Pipeline(steps)

In [9]:
# Hyper-parameter grid for the search. Keys follow scikit-learn's
# "<step name>__<parameter>" convention, so both entries target the "lr" step.
regularization_strengths = [0.001, 0.01, 0.1, 1, 10, 100, 1000]  # candidate C values
solvers = ['sag', 'lbfgs']  # candidate optimisation algorithms
params = {'lr__C': regularization_strengths, 'lr__solver': solvers}

In [10]:
# Cross-validated search over every combination in `params`, using
# GridSearchCV's default cross-validation and scoring settings
grid_clf = GridSearchCV(estimator=pipe, param_grid=params)
# Fit on the training split only; the fitted search object is the cell output
grid_clf.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA(n_components=0.9)),
                                       ('lr', LogisticRegression())]),
             param_grid={'lr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'lr__solver': ['sag', 'lbfgs']})

In [11]:
# Evaluate the fitted search on the held-out test set.
# `classification_report` is imported in the notebook's top import cell.
print(grid_clf.score(X_test, y_test))  # overall score on the test split
predictions = grid_clf.predict(X_test)
# Per-class precision / recall / F1 shows how each class is handled,
# which the single overall score hides
print(classification_report(y_test, predictions))

0.765
              precision    recall  f1-score   support

         0.0       0.73      0.28      0.41        57
         1.0       0.77      0.96      0.85       143

    accuracy                           0.77       200
   macro avg       0.75      0.62      0.63       200
weighted avg       0.76      0.77      0.73       200



In [12]:
# Show the parameter combination selected by the grid search
print(grid_clf.best_params_)

{'lr__C': 10, 'lr__solver': 'sag'}
