In [34]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler, LabelEncoder

import warnings
warnings.filterwarnings('ignore')

In [35]:
lr = LogisticRegression()
svc = LinearSVC()
rf = RandomForestClassifier()

In [36]:
df = pd.read_csv('hepatitis.csv')

In [37]:
df.head(5)

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85.0,18.0,4.0,,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135.0,42.0,3.5,,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96.0,32.0,4.0,,False,live
3,31,female,,True,False,False,False,True,False,False,False,False,False,0.7,46.0,52.0,4.0,80.0,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200.0,4.0,,False,live


In [38]:
df.isnull().sum()

age                 0
sex                 0
steroid             1
antivirals          0
fatigue             1
malaise             1
anorexia            1
liver_big          10
liver_firm         11
spleen_palpable     5
spiders             5
ascites             5
varices             5
bilirubin           6
alk_phosphate      29
sgot                4
albumin            16
protime            67
histology           0
class               0
dtype: int64

In [39]:
#for back fill
df.fillna(method='bfill',inplace=True)
#for forward-fill
df.fillna(method='ffill',inplace=True)

In [40]:
class_le = LabelEncoder()
# M -> 1 and B -> 0

df.rename(columns={'class':'classification'}, inplace=True)
df['sex'] = class_le.fit_transform(df.sex.values)
df['steroid'] = class_le.fit_transform(df.steroid.values)
df['fatigue'] = class_le.fit_transform(df.fatigue.values)
df['malaise'] = class_le.fit_transform(df.malaise.values)
df['anorexia'] = class_le.fit_transform(df.anorexia.values)
df['liver_big'] = class_le.fit_transform(df.liver_big.values)
df['liver_firm'] = class_le.fit_transform(df.liver_firm.values)
df['spleen_palpable'] = class_le.fit_transform(df.spleen_palpable.values)
df['spiders'] = class_le.fit_transform(df.spiders.values)
df['ascites'] = class_le.fit_transform(df.ascites.values)
df['varices'] = class_le.fit_transform(df.varices.values)
df['antivirals'] = class_le.fit_transform(df.antivirals.values)
df['classification'] = class_le.fit_transform(df.classification.values)
df['histology'] = class_le.fit_transform(df.histology.values)

In [41]:
df.groupby('classification').size()

classification
0     32
1    123
dtype: int64

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              155 non-null    int64  
 1   sex              155 non-null    int64  
 2   steroid          155 non-null    int64  
 3   antivirals       155 non-null    int64  
 4   fatigue          155 non-null    int64  
 5   malaise          155 non-null    int64  
 6   anorexia         155 non-null    int64  
 7   liver_big        155 non-null    int64  
 8   liver_firm       155 non-null    int64  
 9   spleen_palpable  155 non-null    int64  
 10  spiders          155 non-null    int64  
 11  ascites          155 non-null    int64  
 12  varices          155 non-null    int64  
 13  bilirubin        155 non-null    float64
 14  alk_phosphate    155 non-null    float64
 15  sgot             155 non-null    float64
 16  albumin          155 non-null    float64
 17  protime         

In [43]:
df.shape

(155, 20)

In [44]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size = 0.2)

In [45]:
train.shape
train.head(4)

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,classification
6,51,0,0,0,1,0,1,1,0,1,1,0,0,1.0,78.0,48.0,4.4,85.0,0,0
133,72,0,1,1,1,0,0,1,1,0,0,0,0,1.0,115.0,52.0,3.4,50.0,1,1
44,34,0,1,0,0,0,0,1,0,0,0,0,0,0.7,74.0,86.0,4.4,46.0,0,1
87,30,0,1,0,1,1,1,1,1,0,1,1,1,2.5,165.0,64.0,2.8,84.0,1,0


In [46]:
test.shape

(31, 20)

In [14]:
# train.iloc[:,:32].head(5)

In [60]:
train_feature_names = train.columns[:-1]
train_feat = train[train_feature_names]
# "diagnosis" feature is our class which I wanna predict
train_tar = train.classification

In [61]:
test_feature_names = test.columns[:-1]
test_feat = test[test_feature_names]
# "diagnosis" feature is our class which I wanna predict
test_tar = test.classification

In [62]:
train_feat.shape

(124, 19)

In [63]:
train_tar.shape

(124,)

In [64]:
test_feat.shape

(31, 19)

In [65]:
test_tar.shape

(31,)

In [66]:
lr.fit(train_feat, train_tar)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [67]:
lr.score(train_feat, train_tar)

0.8951612903225806

In [68]:
lr.score(test_feat, test_tar)

0.9032258064516129

In [69]:
svc.fit(train_feat, train_tar)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [70]:
svc.score(train_feat, train_tar)

0.7983870967741935

In [71]:
svc.score(test_feat, test_tar)

0.8387096774193549