In [403]:
%matplotlib inline
import numpy as np
import pandas as pd
import cufflinks as cf  
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
cf.go_offline()

In [404]:
liver_patient = pd.read_csv("duomenys/Indian_Liver_Patient.csv")
liver_patient.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [405]:
#pervadiname stulpelį dataset į liver_disease
liver_patient.rename(columns={'Dataset': 'Liver_Disease'}, inplace=True)
liver_patient['Gender'] = liver_patient['Gender'].apply(lambda x: 1 if x =='Male' else 0)
#Female=0 and Male = 1
liver_patient['Albumin_and_Globulin_Ratio'].mean()
liver_patient=liver_patient.fillna(0.94)

In [406]:
liver_patient.groupby('Liver_Disease').size()

Liver_Disease
1    416
2    167
dtype: int64

In [407]:
columns = liver_patient.columns # Pasižiūrime visų stulpelių pavadinimus

In [408]:
# Pasidarome X ir Y, kurį prognozuosime
X = liver_patient[['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio']]
y = liver_patient['Liver_Disease']

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [409]:
# Pasidarome test ir train rinkinius

Xtrain, Xtest, ytrain, ytest = train_test_split( 
        X, y, test_size = 0.3, random_state = 0)


PIRMAS MODELIS

In [410]:
from sklearn.utils.class_weight import compute_sample_weight

weights = compute_sample_weight(class_weight='balanced', y = ytrain)
pd.Series(weights).head(10)

0    1.789474
1    0.693878
2    0.693878
3    0.693878
4    0.693878
5    0.693878
6    1.789474
7    0.693878
8    0.693878
9    0.693878
dtype: float64

In [411]:
model = LogisticRegression()

model.fit(Xtrain, ytrain, sample_weight=weights)
pred = model.predict(Xtest)
prob = model.predict_proba(Xtest)





In [412]:
print('Log-loss = {0:.02f}'.format(
    log_loss(ytest, prob)))
print('Hit rate = {0:.02%}'.format(
    (ytest == pred).mean()))

Log-loss = 0.58
Hit rate = 66.29%


In [413]:
pd.DataFrame(confusion_matrix(ytest, pred),
             columns = model.classes_, index = model.classes_)

Unnamed: 0,1,2
1,72,50
2,9,44


In [414]:
print(classification_report(ytest, pred))

              precision    recall  f1-score   support

           1       0.89      0.59      0.71       122
           2       0.47      0.83      0.60        53

    accuracy                           0.66       175
   macro avg       0.68      0.71      0.65       175
weighted avg       0.76      0.66      0.68       175



In [415]:
np.array([columns[0:-1]]).T
model.coef_.T

array([[-0.35492901],
       [-0.03787205],
       [-0.18330738],
       [-0.7388045 ],
       [-0.10798907],
       [-1.29267415],
       [-1.13414811],
       [-0.75578336],
       [ 0.75870293],
       [-0.13012083]])

In [416]:
feature_importance = pd.DataFrame(np.hstack((np.array([columns[0:-1]]).T, model.coef_.T)),
                                  columns=['feature', 'importance'])
feature_importance['importance'] = pd.to_numeric(feature_importance['importance'])
feature_importance.sort_values(by='importance', ascending=False)


Unnamed: 0,feature,importance
8,Albumin,0.758703
1,Gender,-0.037872
4,Alkaline_Phosphotase,-0.107989
9,Albumin_and_Globulin_Ratio,-0.130121
2,Total_Bilirubin,-0.183307
0,Age,-0.354929
3,Direct_Bilirubin,-0.738804
7,Total_Protiens,-0.755783
6,Aspartate_Aminotransferase,-1.134148
5,Alamine_Aminotransferase,-1.292674


ANTRAS MODELIS 
(Pridėjus naujus stulpelius)


In [329]:
#Apsiskaiciuojame Globulina
liver_patient['Globulin'] = liver_patient['Albumin'] / liver_patient['Albumin_and_Globulin_Ratio']

#Susiskaiciuojam netiesiogini bilirubina (nekonjuguotas) (jis lygus is bendro atemus tiesiogini)
liver_patient['Indirect_Bilirubin'] = liver_patient['Total_Bilirubin'] - liver_patient['Direct_Bilirubin']
liver_patient['Protiens_Ratio'] = liver_patient['Total_Protiens'] / liver_patient['Albumin']

liver_patient.head()


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Disease,Globulin,Indirect_Bilirubin,Protiens_Ratio
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1,3.666667,0.6,2.060606
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1,4.324324,5.4,2.34375
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1,3.707865,3.2,2.121212
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1,3.4,0.6,2.0
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1,6.0,1.9,3.041667


In [330]:
liver_patient.columns

X = liver_patient[['Age', 'Gender', 'Total_Bilirubin', 'Alkaline_Phosphotase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin', 
        'Globulin', 'Indirect_Bilirubin', 'Protiens_Ratio']]
y = liver_patient['Liver_Disease']

scaler = StandardScaler()
X = scaler.fit_transform(X)

Xtrain, Xtest, ytrain, ytest = train_test_split( 
        X, y, test_size = 0.3, random_state = 0)


In [331]:
model2 = LogisticRegression(class_weight='balanced')

model2.fit(Xtrain, ytrain)
pred2 = model2.predict(Xtest)
prob2 = model2.predict_proba(Xtest)

print('Log-loss = {0:.02f}'.format(
    log_loss(ytest, prob2)))
print('Hit rate = {0:.02%}'.format(
    (ytest == pred2).mean()))

Log-loss = 0.59
Hit rate = 62.86%






In [332]:
# suskaičiuojame Cofusion matrix
pd.DataFrame(confusion_matrix(ytest, pred2),
             columns=model2.classes_, index=model2.classes_)

Unnamed: 0,1,2
1,69,53
2,12,41


In [333]:
print(classification_report(ytest, pred2))

              precision    recall  f1-score   support

           1       0.85      0.57      0.68       122
           2       0.44      0.77      0.56        53

    accuracy                           0.63       175
   macro avg       0.64      0.67      0.62       175
weighted avg       0.73      0.63      0.64       175



TREČIAS MODELIS (išmetus stulpelius, kurie stipriai koreliuoja su kitais)

In [334]:
liver_patient = liver_patient.drop(['Direct_Bilirubin','Alamine_Aminotransferase','Albumin_and_Globulin_Ratio'],axis=1)
liver_patient.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Alkaline_Phosphotase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Liver_Disease,Globulin,Indirect_Bilirubin,Protiens_Ratio
0,65,0,0.7,187,18,6.8,3.3,1,3.666667,0.6,2.060606
1,62,1,10.9,699,100,7.5,3.2,1,4.324324,5.4,2.34375
2,62,1,7.3,490,68,7.0,3.3,1,3.707865,3.2,2.121212
3,58,1,1.0,182,20,6.8,3.4,1,3.4,0.6,2.0
4,72,1,3.9,195,59,7.3,2.4,1,6.0,1.9,3.041667


In [335]:
liver_patient.columns

X = liver_patient[['Age', 'Gender', 'Total_Bilirubin', 'Alkaline_Phosphotase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
    'Globulin', 'Indirect_Bilirubin', 'Protiens_Ratio']]
y = liver_patient['Liver_Disease']

scaler = StandardScaler()
X = scaler.fit_transform(X)

Xtrain, Xtest, ytrain, ytest = train_test_split( 
        X, y, test_size = 0.3, random_state = 0)

In [336]:
model3 = LogisticRegression(class_weight='balanced')

model3.fit(Xtrain, ytrain)
pred3 = model3.predict(Xtest)
prob3 = model3.predict_proba(Xtest)

print('Log-loss = {0:.02f}'.format(
    log_loss(ytest, prob3)))
print('Hit rate = {0:.02%}'.format(
    (ytest == pred3).mean()))

Log-loss = 0.59
Hit rate = 62.86%






In [337]:
# suskaičiuojame Cofusion matrix
pd.DataFrame(confusion_matrix(ytest, pred3),
             columns = model3.classes_, index = model3.classes_)

Unnamed: 0,1,2
1,69,53
2,12,41


In [338]:
print(classification_report(ytest, pred3))

              precision    recall  f1-score   support

           1       0.85      0.57      0.68       122
           2       0.44      0.77      0.56        53

    accuracy                           0.63       175
   macro avg       0.64      0.67      0.62       175
weighted avg       0.73      0.63      0.64       175

