## Logistic Regression Model
**Predicting Stage 4 Liver Cirrhosis Based Upon Key Features**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [3]:
cirr = pd.read_csv('../datasets/cirr_clean.csv')

In [5]:
cirr.head()

Unnamed: 0.1,Unnamed: 0,target,id,n_days,age,bilirubin,cholesterol,albumin,copper,alk_phos,sgot,tryglicerides,platelets,prothrombin,status_C,status_CL,status_D,drug_D-penicillamine,drug_Placebo,sex_F,sex_M,ascites_N,ascites_Y,hepatomegaly_N,hepatomegaly_Y,spiders_N,spiders_Y,edema_N,edema_S,edema_Y
0,0,1,1,400,21464,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,0,0,1,1,0,1,0,0,1,0,1,0,1,0,0,1
1,1,0,2,4500,20617,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,1,0,0,1,0,1,0,1,0,0,1,0,1,1,0,0
2,2,1,3,1012,25594,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,0,0,1,1,0,0,1,1,0,1,0,1,0,0,1,0
3,3,1,4,1925,19994,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,0,0,1,1,0,1,0,1,0,0,1,0,1,0,1,0
4,4,0,5,1504,13918,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,0,1,0,0,1,1,0,1,0,0,1,0,1,1,0,0


### Lets look at the correlations of the variables to target

In [6]:
cirr.corr()[['target']].abs().sort_values('target', ascending=False)

Unnamed: 0,target
target,1.0
hepatomegaly_Y,0.438077
hepatomegaly_N,0.438077
ascites_Y,0.37834
ascites_N,0.37834
albumin,0.3686
n_days,0.338234
prothrombin,0.315713
spiders_N,0.282322
spiders_Y,0.282322


## It looks like the presence of hepatomegaly and ascites are the 2 most correlated variables to determining the stage of liver disease.  In addition, labs such as albumin and bilirubin are quite significant.

In [7]:
features = ['hepatomegaly_Y', 'hepatomegaly_N', 'ascites_Y', 'ascites_N', 'albumin', 'prothrombin', 
            'spiders_Y', 'spiders_N', 'edema_Y', 'edema_N', 'copper', 'platelets', 'bilirubin', 'age']

# Define y and X.
y = cirr[['target']]
X = cirr[features]

# Import train_test_split.
from sklearn.model_selection import train_test_split
# Create training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.33,
                                                    random_state = 42)

In [8]:
ss = StandardScaler()
ss.fit(X_train) # ONLY fit to X_train, NEVER to X_test

X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [9]:
# Step 1: Instantiate our model.
logreg = LogisticRegression(solver = 'liblinear', class_weight= {1: 0.1})

# Step 2: Fit our model.
logreg.fit(X_train_sc, y_train)

print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')

Logistic Regression Intercept: [-2.30829294]
Logistic Regression Coefficient: [[ 0.3191071  -0.3191071   0.71020614 -0.71020614 -0.33133885  0.13530003
   0.16661104 -0.16661104 -0.21010094 -0.03033622  0.33337928 -0.1478768
  -0.10701121  0.33523945]]


  y = column_or_1d(y, warn=True)


In [10]:
# Step 4: Evaluate model.
logreg.score(X_test_sc, y_test)

0.7934782608695652

### My best test score has 79% accuracy.  I adjusted the threshold via the class weight parameter to 0.1 to minimize false negatives and thereby maximize sensitivity.
***
### Creating a confusion matrix to evaluate Sensitivity and Specificity

In [11]:
# Generate predictions.
preds = logreg.predict(X_test_sc)

In [12]:
def nice_conmat(y_test, preds, classes):
    conmat = confusion_matrix(y_test, preds)
    return pd.DataFrame(conmat, columns=['Predicted ' +class_ for class_ in classes], \
                index=['Actually '+ class_ for class_ in classes])
nice_conmat(y_test, preds, ['stages 1-3', 'stage 4'])

Unnamed: 0,Predicted stages 1-3,Predicted stage 4
Actually stages 1-3,65,0
Actually stage 4,19,8


In [13]:
# Save TN/FP/FN/TP values.
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

## Sensitivity

In [14]:
sens = tp / (tp + fn)
print(f'Sensitivity: {round(sens,4)}')

Sensitivity: 0.2963


## Specificity

In [16]:
spec = tn / (tn + fp)
print(f'Specificity: {round(spec,4)}')

Specificity: 1.0
