In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [80]:
df = pd.read_csv('cleaned_data.csv')
cat_mapping2 = {'graduate school':1,'university':2,'high school':3,'others':4}
df['ED']=df['EDUCATION_CAT'].map(cat_mapping2)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26664 entries, 0 to 26663
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   ID                          26664 non-null  object
 1   LIMIT_BAL                   26664 non-null  int64 
 2   SEX                         26664 non-null  int64 
 3   EDUCATION                   26664 non-null  int64 
 4   MARRIAGE                    26664 non-null  int64 
 5   AGE                         26664 non-null  int64 
 6   PAY_1                       26664 non-null  int64 
 7   PAY_2                       26664 non-null  int64 
 8   PAY_3                       26664 non-null  int64 
 9   PAY_4                       26664 non-null  int64 
 10  PAY_5                       26664 non-null  int64 
 11  PAY_6                       26664 non-null  int64 
 12  BILL_AMT1                   26664 non-null  int64 
 13  BILL_AMT2                   26664 non-null  in

In [81]:
Non_features = ['ID','default payment next month', 'EDUCATION_CAT','graduate school','high school','others','university']
X = df.drop(columns=Non_features)
Y = df.iloc[:,24]

In [89]:
scale = StandardScaler()

X_arr = scale.fit_transform(X)
Y_arr = Y

print(X_arr.shape,Y_arr.shape)

(26664, 24) (26664,)


In [288]:
from sklearn.decomposition import PCA

pca = PCA(n_components= 2)

X_pca = pca.fit_transform(X_arr)
print(X_pca.shape)

(26664, 2)


In [289]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y_arr, test_size=0.2, random_state=42)

In [273]:
def update_weights_sgd(X_train, y_train, weights, learning_rate):
    for X_each, y_each in zip(X_train, y_train):
        prediction = compute_prediction(X_each, weights)
        weights_delta = X_each.T * (y_each - prediction)
        weights += learning_rate * weights_delta
    return weights

def train_logistic_regression_sgd(X_train, y_train, max_iter, learning_rate, fit_intercept=False):
    if fit_intercept:
        intercept = np.ones((X_train.shape[0], 1))
        X_train = np.hstack((intercept, X_train))
    weights = np.zeros(X_train.shape[1])+0.5
    for iteration in range(max_iter):
        weights = update_weights_sgd(X_train, y_train, weights, learning_rate)
        # Check the cost for every 2 (for example) iterations
        if iteration % 2 == 0:
            print(compute_cost(X_train, y_train, weights))
    return weights

def sigmoid(input):
    return 1.0 / (1 + np.exp(-input))

def compute_prediction(X, weights):
    z = np.dot(X, weights)
    predictions = sigmoid(z)
    return predictions

def compute_cost(X, y, weights):
    predictions = compute_prediction(X, weights)
    cost = np.mean(-y * np.log(predictions) - (1 - y) * np.log(1 - predictions))
    return cost

def predict(X, weights):
    if X.shape[1] == weights.shape[0] - 1:
        intercept = np.ones((X.shape[0], 1))
        X = np.hstack((intercept, X))
    return compute_prediction(X, weights)

def classification(threshold_P, X, weights):
    vec = predict(X, weights)
    vec = np.where(vec > threshold_P, 1, 0)
    return vec

In [281]:
weights = train_logistic_regression_sgd(X_train, Y_train, 20, 0.001, fit_intercept=True)

0.477872293305873
0.4689399257164773
0.4683245603213618
0.46813368556740637
0.46803519760897777
0.4679785471075477
0.46794550293759524
0.46792628841510625
0.4679151658383863
0.467908741150087


In [284]:
predictions = classification(0.5,X_test,weights)

In [285]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
print(confusion_matrix(Y_test, predictions))
print(accuracy_score(Y_test, predictions))

[[4042  141]
 [ 848  302]]
0.8145509094318395


In [290]:
clf = LogisticRegression(class_weight='balanced', random_state=42)

parameters = {'penalty': ['l1','l2','none'],
              'C': [0.1, 1, 10],
              'fit_intercept': [True, False],
             'solver': ['newton-cg','lbfgs','liblinear','sag','saga']}

grid_search = GridSearchCV(clf, parameters, n_jobs=-1, cv=5)

grid_search.fit(X_train, Y_train)
print('The best model:\n', grid_search.best_params_)

The best model:
 {'C': 0.1, 'fit_intercept': True, 'penalty': 'l1', 'solver': 'saga'}


120 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/haoli/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/haoli/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/haoli/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got

In [291]:
clf_best = grid_search.best_estimator_
accuracy = clf_best.score(X_test, Y_test)
print(accuracy)

0.6394149634352148




