# Logistic Regression

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [229]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0).fit(X, y)

In [28]:
clf.predict(X[:2, :])

array([0, 0])

In [29]:
clf.predict_proba(X[:2, :])

array([[9.81797141e-01, 1.82028445e-02, 1.44269293e-08],
       [9.71725476e-01, 2.82744937e-02, 3.01659208e-08]])

In [30]:
clf.score(X, y)

0.9733333333333334

# Confusion Matrix

In [31]:
from sklearn.metrics import confusion_matrix

In [24]:
confusion_matrix(y, clf.predict(X))

array([[50,  0,  0],
       [ 0, 47,  3],
       [ 0,  1, 49]], dtype=int64)

In [22]:
confusion_matrix(y, clf.predict(X), normalize='true')

array([[1.  , 0.  , 0.  ],
       [0.  , 0.94, 0.06],
       [0.  , 0.02, 0.98]])

# 1. Solve classification problem using 'classification.csv' dataset

##### target variable is 'default'. Apply feature selection, feature scaling, cross validation etc. (anything you think is needed)

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
cd C:\Users\Dell\Desktop\Machine Learning\Programming

C:\Users\Dell\Desktop\Machine Learning\Programming


In [4]:
df = pd.read_csv('Class 10/classification.csv')
df.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,college degree,17,12,176,9.3,11.359392,5.008608,1
1,27,no high school,10,6,31,17.3,1.362202,4.000798,0
2,40,no high school,15,14,55,5.5,0.856075,2.168925,0
3,41,no high school,15,14,120,2.9,2.65872,0.82128,0
4,24,high school,2,0,28,17.3,1.787436,3.056564,1


In [5]:
df.nunique()

age          37
ed            5
employ       32
address      31
income      114
debtinc     231
creddebt    695
othdebt     699
default       2
dtype: int64

In [6]:
df['ed'].value_counts()

no high school    372
high school       198
college degree     87
undergraduate      38
postgraduate        5
Name: ed, dtype: int64

In [7]:
df = df.replace({'college degree':'graduate','undergraduate':'graduate','postgraduate': 'graduate'})
df['ed'].value_counts()

no high school    372
high school       198
graduate          130
Name: ed, dtype: int64

In [8]:
y=df['default']
y.head()

0    1
1    0
2    0
3    0
4    1
Name: default, dtype: int64

In [9]:
df1=pd.get_dummies(df['ed'])
df1.head()

Unnamed: 0,graduate,high school,no high school
0,1,0,0
1,0,0,1
2,0,0,1
3,0,0,1
4,0,1,0


In [10]:
df=pd.merge(df,df1, on=df.index)
df.head()

Unnamed: 0,key_0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default,graduate,high school,no high school
0,0,41,graduate,17,12,176,9.3,11.359392,5.008608,1,1,0,0
1,1,27,no high school,10,6,31,17.3,1.362202,4.000798,0,0,0,1
2,2,40,no high school,15,14,55,5.5,0.856075,2.168925,0,0,0,1
3,3,41,no high school,15,14,120,2.9,2.65872,0.82128,0,0,0,1
4,4,24,high school,2,0,28,17.3,1.787436,3.056564,1,0,1,0


In [11]:
df.drop(['key_0','ed'], axis = 1, inplace=True)
df.head()

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,default,graduate,high school,no high school
0,41,17,12,176,9.3,11.359392,5.008608,1,1,0,0
1,27,10,6,31,17.3,1.362202,4.000798,0,0,0,1
2,40,15,14,55,5.5,0.856075,2.168925,0,0,0,1
3,41,15,14,120,2.9,2.65872,0.82128,0,0,0,1
4,24,2,0,28,17.3,1.787436,3.056564,1,0,1,0


### Normalization

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

In [13]:
df.corr()

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,default,graduate,high school,no high school
age,1.0,0.536497,0.597591,0.47871,0.016398,0.295207,0.340217,-0.137657,0.049278,-0.059646,0.015432
employ,0.536497,1.0,0.322334,0.619681,-0.031182,0.403694,0.406091,-0.282978,-0.102431,-0.073388,0.146061
address,0.597591,0.322334,1.0,0.316245,0.011323,0.208435,0.226514,-0.164451,0.059675,0.001322,-0.047697
income,0.47871,0.619681,0.316245,1.0,-0.026777,0.570199,0.610659,-0.07097,0.211881,0.014392,-0.178107
debtinc,0.016398,-0.031182,0.011323,-0.026777,1.0,0.501767,0.58487,0.389575,0.032429,-0.034308,0.005693
creddebt,0.295207,0.403694,0.208435,0.570199,0.501767,1.0,0.633104,0.24474,0.117123,-0.038828,-0.056229
othdebt,0.340217,0.406091,0.226514,0.610659,0.58487,0.633104,1.0,0.145713,0.178752,-0.012977,-0.127588
default,-0.137657,-0.282978,-0.164451,-0.07097,0.389575,0.24474,0.145713,1.0,0.092082,0.052241,-0.118909
graduate,0.049278,-0.102431,0.059675,0.211881,0.032429,0.117123,0.178752,0.092082,1.0,-0.299927,-0.508591
high school,-0.059646,-0.073388,0.001322,0.014392,-0.034308,-0.038828,-0.012977,0.052241,-0.299927,1.0,-0.668829


In [14]:
df.drop(['default'], axis = 1, inplace=True)
df.head()

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,graduate,high school,no high school
0,41,17,12,176,9.3,11.359392,5.008608,1,0,0
1,27,10,6,31,17.3,1.362202,4.000798,0,0,1
2,40,15,14,55,5.5,0.856075,2.168925,0,0,1
3,41,15,14,120,2.9,2.65872,0.82128,0,0,1
4,24,2,0,28,17.3,1.787436,3.056564,0,1,0


In [15]:
kf = KFold(n_splits=10)
ts_rmse = []
tr_rmse = []

for train_index, test_index in kf.split(df):
    xTrain, xTest = df.iloc[train_index], df.iloc[test_index]    
    yTrain, yTest = y.iloc[train_index], y.iloc[test_index]
    
    norm=StandardScaler().fit(xTrain)
    X_ts_normalized = norm.transform(xTest)
    X_tr_normalized = norm.transform(xTrain)
    
    lr = LogisticRegression()
    model = lr.fit(X_tr_normalized, yTrain)
    predictions = lr.predict(X_ts_normalized)
    predictions_train = lr.predict(X_tr_normalized)
    
    ts_rmse.append(np.sqrt(mean_squared_error(yTest, predictions)))
    tr_rmse.append(np.sqrt(mean_squared_error(yTrain, predictions_train)))

In [27]:
model.coef_

array([[ 0.27953833, -1.52771811, -0.72614123, -0.20565646,  0.58600515,
         1.13415016,  0.09645625,  0.06469351,  0.08053975, -0.12396313]])

In [16]:
print("Test RMSE", np.mean(ts_rmse))
print("Train RMSE", np.mean(tr_rmse)) 

Test RMSE 0.4456543250542116
Train RMSE 0.43238776262111606


# 2. Print accuracy, confusion matrix, precision, recall, sensitivity and specifity on train and test (and maybe validation) datasets.

##### do not use any libraries for metrics, implement yourself

In [17]:
df.head()

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,graduate,high school,no high school
0,41,17,12,176,9.3,11.359392,5.008608,1,0,0
1,27,10,6,31,17.3,1.362202,4.000798,0,0,1
2,40,15,14,55,5.5,0.856075,2.168925,0,0,1
3,41,15,14,120,2.9,2.65872,0.82128,0,0,1
4,24,2,0,28,17.3,1.787436,3.056564,0,1,0


In [18]:
X_Train, X_Test, y_Train, y_Test = train_test_split(df[:int(0.8*len(df))], y[:int(0.8*len(y))], test_size=0.2)

In [19]:
norm1=StandardScaler().fit(X_Train)
X_test_normalized = norm1.transform(X_Test)
X_train_normalized = norm1.transform(X_Train)

In [20]:
reg=LogisticRegression(random_state=3).fit(X_train_normalized, y_Train)
test_pred=reg.predict(X_test_normalized)
train_pred=reg.predict(X_train_normalized)
print('Test RMSE:', np.sqrt(mean_squared_error(y_Test, test_pred)))
print('Train RMSE:', np.sqrt(mean_squared_error(y_Train, train_pred)))

Test RMSE: 0.4629100498862757
Train RMSE: 0.43042752832835535


In [21]:
x_norm=norm1.transform(df[int(0.8*len(df)):])
y_predictions=reg.predict(x_norm)
print('Final test RMSE:', np.sqrt(mean_squared_error(y[int(0.8*len(y)):], y_predictions)))

Final test RMSE: 0.4225771273642583


In [23]:
m=np.array([test_pred, train_pred, y_predictions])
p=np.array([y_Test,y_Train,  y[int(0.8*len(y)):]])
TP=0
FP=0
FN=0
TN=0
for j in range(len(m)):
    for i in range(len(m[j])):
        if list(m[j])[i]==list(p[j])[i]==1:
            TP=TP+1
        elif list(m[j])[i]==list(p[j])[i]==0:
            TN=TN+1
        elif list(m[j])[i]==1 and list(p[j])[i]==0:
            FP=FP+1
        else:
            FN=FN+1
    print("True Positives:", TP,"True Negatives:", TN, "False Positives:", FP, "False Negatives:", FP)
    Accuracy=(TP+TN)/(TP+TN+FP+FN)
    print(Accuracy)
    Sens=TP/(TP+FN)
    Spec=TN/(TN+FP)
    Prec=TP/(TP+FP)
    NPV=TN/(TN+FN)
    print('Sensitivity or Recall:', Sens,'\n' 'Specifity:',Spec,'\n' 'Precision', Prec)
    cf=np.array([[TP, FN], [FP, TN]]) 
    print('Confusion Matrix: \n', cf)

True Positives: 18 True Negatives: 70 False Positives: 8 False Negatives: 8
0.7857142857142857
Sensitivity or Recall: 0.5294117647058824 
Specifity: 0.8974358974358975 
Precision 0.6923076923076923
Confusion Matrix: 
 [[18 16]
 [ 8 70]]
True Positives: 66 True Negatives: 387 False Positives: 29 False Negatives: 29
0.8089285714285714
Sensitivity or Recall: 0.4583333333333333 
Specifity: 0.9302884615384616 
Precision 0.6947368421052632
Confusion Matrix: 
 [[ 66  78]
 [ 29 387]]
True Positives: 85 True Negatives: 483 False Positives: 34 False Negatives: 34
0.8114285714285714
Sensitivity or Recall: 0.4644808743169399 
Specifity: 0.9342359767891683 
Precision 0.7142857142857143
Confusion Matrix: 
 [[ 85  98]
 [ 34 483]]
