In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy  as np
import pandas as pd
from copy import deepcopy
from random import randint
from sklearn.metrics import classification_report, confusion_matrix,precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn import metrics
import random as rnd

In [3]:
data = pd.read_csv("/content/drive/My Drive/hcvdata.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [4]:
data['Category'].unique()

array(['0=Blood Donor', '0s=suspect Blood Donor', '1=Hepatitis',
       '2=Fibrosis', '3=Cirrhosis'], dtype=object)

In [5]:
data['Sex'].unique()

array(['m', 'f'], dtype=object)

### Data Feature Engineering and Missing Values

In [6]:
df = data.drop(['Unnamed: 0'],axis =1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Category  615 non-null    object 
 1   Age       615 non-null    int64  
 2   Sex       615 non-null    object 
 3   ALB       614 non-null    float64
 4   ALP       597 non-null    float64
 5   ALT       614 non-null    float64
 6   AST       615 non-null    float64
 7   BIL       615 non-null    float64
 8   CHE       615 non-null    float64
 9   CHOL      605 non-null    float64
 10  CREA      615 non-null    float64
 11  GGT       615 non-null    float64
 12  PROT      614 non-null    float64
dtypes: float64(10), int64(1), object(2)
memory usage: 62.6+ KB


In [8]:
df.isnull().sum()

Category     0
Age          0
Sex          0
ALB          1
ALP         18
ALT          1
AST          0
BIL          0
CHE          0
CHOL        10
CREA         0
GGT          0
PROT         1
dtype: int64

In [9]:
def impute_nan(df,variable,median):
    df[variable+"_median"]=df[variable].fillna(median)
    df[variable+"_random"]=df[variable]
    ##It will have the random sample to fill the na
    random_sample=df[variable].dropna().sample(df[variable].isnull().sum(),random_state=0)
    ##pandas need to have same index in order to merge the dataset
    random_sample.index=df[df[variable].isnull()].index
    df.loc[df[variable].isnull(),variable+'_random']=random_sample

In [10]:
median=df.ALP.median()
median=df.ALB.median()
median=df.ALT.median()
median=df.CHOL.median()
median=df.PROT.median()

In [11]:
impute_nan(df,'ALP',median)
impute_nan(df,'ALB',median)
impute_nan(df,'ALT',median)
impute_nan(df,'CHOL',median)
impute_nan(df,'PROT',median)

In [12]:
print('------------ALP--------')
print(df['ALP'].std())
print(df['ALP_median'].std())
print(df['ALP_random'].std())
print('------------ALB--------')
print(df['ALB'].std())
print(df['ALB_median'].std())
print(df['ALB_random'].std())
print('------------ALT--------')
print(df['ALT'].std())
print(df['ALT_median'].std())
print(df['ALT_random'].std())
print('------------CHOL--------')
print(df['CHOL'].std())
print(df['CHOL_median'].std())
print(df['CHOL_random'].std())
print('------------PROT--------')
print(df['PROT'].std())
print(df['PROT_median'].std())
print(df['PROT_random'].std())

------------ALP--------
26.028315300123676
25.652462679110126
25.891276597748643
------------ALB--------
5.7806294041030775
5.9060799975961045
5.77747160043219
------------ALT--------
25.46968881387094
25.510011883263825
25.45716861228685
------------CHOL--------
1.132728431159735
8.533679735037218
1.1307378041704204
------------PROT--------
5.402635737104956
5.398238062526439
5.3987465675293995


In [13]:
df.columns

Index(['Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE',
       'CHOL', 'CREA', 'GGT', 'PROT', 'ALP_median', 'ALP_random', 'ALB_median',
       'ALB_random', 'ALT_median', 'ALT_random', 'CHOL_median', 'CHOL_random',
       'PROT_median', 'PROT_random'],
      dtype='object')

In [14]:
df=df.drop(['ALB','ALP','ALT','CHOL','PROT','ALP_median','ALB_random','ALT_random','CHOL_median','PROT_median'],axis=1)

In [15]:
df['Category'] = df['Category'].map({"0=Blood Donor":0,"0s=suspect Blood Donor":4,"1=Hepatitis":1,"2=Fibrosis":2,'3=Cirrhosis':3})

In [16]:
X_sex = pd.get_dummies(df['Sex'],drop_first=True)

In [17]:
df = pd.concat([df,X_sex], axis=1)

In [18]:
df = df.drop('Sex', 1)

In [19]:
df.rename(columns={'m': 'Sex'}, inplace=True)

In [20]:
df = df[['Age', 'AST', 'BIL', 'CHE', 'CREA', 'GGT', 'ALP_random','ALB_median', 'ALT_median', 'CHOL_random', 'PROT_random', 'Sex','Category']]

In [21]:
 X_train, X_test = train_test_split(df, test_size=0.3, random_state=42)

In [22]:
types = X_test['Category']
X_test = X_test.iloc[:,:-1]

In [23]:
test = types.tolist()

### C 4.5 Rule Based Classifier

In [24]:
targetEntropy = 0
def getUniqueClasses(dataFrameColumn):
    results = {}
    for row in dataFrameColumn:
        if row not in results: results[row] = 0 
        results[row] += 1 
    return results
def getEntropy(data, column):
    entropy = 0.0
    results = getUniqueClasses(data[column]) 
    for row in results.values():
        p = float(row) / len(data[column]) 
        entropy -= p * np.log2(p) 
    return entropy
def findSplitPoints(data, column):
    sorted = data.sort_values([column], ascending=True)
    sorted_matrix = sorted[[column, 'Category']].to_numpy()
    splitPoints = []
    previous = sorted_matrix[0][1] 
    index = sorted.index.values; 
    counter = 0
    for row in sorted_matrix:
        if row[1] != previous: 
            splitPoints.append([index[counter - 1], sorted_matrix[counter - 1][0]])
        counter += 1
        previous = row[1]
    return splitPoints
def splitSets(data, column, splitPoints):
    sets_below = []
    sets_above = []
    for i in range(len(splitPoints)):
        df1 = data[data[column] <= data[column][splitPoints[i][0]]]  
        df2 = data[data[column] > data[column][splitPoints[i][0]]]  
        # add to the lists
        sets_below.append(df1)
        sets_above.append(df2)
    return sets_below, sets_above
def getInformationGain(data, column):
    splitpoints = findSplitPoints(data, column)  
    sets_below, sets_above = splitSets(data, column, splitpoints) 
    instances_above = []
    instances_below = []
    entropy_above = []
    entropy_below = []
    target_entropy = getEntropy(data, 'Category') 
    for set in sets_below:
        entropy_below.append(getEntropy(set, 'Category'))
        instances_below.append(len(set))
    for set in sets_above:
        entropy_above.append(getEntropy(set, 'Category'))
        instances_above.append(len(set))
    totalInstances = []
    infoGains = []
    for i in range(len(instances_below)):
        totalInstances.append(instances_below[i] + instances_above[i])
        probA = (instances_above[i] / float(totalInstances[i]))
        probB = (instances_below[i] / float(totalInstances[i]))
        infoGains.append(target_entropy - ((entropy_below[i] * probB) + (entropy_above[i] * probA)))
    best_gain = i = counter = 0
    for gain in infoGains:
        if best_gain < gain:
            best_gain = gain
            counter = i 
        i += 1
    return best_gain, sets_below[counter], sets_above[counter], splitpoints[counter]
def train(data):
    optimal_gain = -1
    best = {}
    columns = []
    i = 0
    for column in data:  # loop over each attribute
        if column != 'Category':
            try:
                ig, set1, set2, split = getInformationGain(data, column)  
                columns.append({"ig": ig, "left": set1, "right": set2, 'col': i, 'split': split,'colName': column})
            except IndexError:
                columns.append({"ig": 0, "left": [], "right": [], 'col': column, })
        i += 1
    for val in range(len(columns)):
        if columns[val]['ig'] > optimal_gain:
            best = columns[val]
            optimal_gain = columns[val]['ig']
    left = best['left']
    right = best['right']
    if len(best['left']) != 0 and len(best['right']) != 0:
        return (treeNode(col=best['col'], colName=best['colName'], value=best['split'][1], results=None,
                              rb=train(right), lb=train(left)))
    else:
        label = list(getUniqueClasses(data['Category']).keys()); 
        return (treeNode(results=label[0]))
def classify(target_row, tree):
    if tree.results != None:
        return tree.results
    else:
        val = target_row[tree.col]
        branch = None
        if isinstance(val, int) or isinstance(val, float):
            if val >= tree.value:
                branch = tree.rb
            else:
                branch = tree.lb
        return classify(target_row, branch)
def printTree(tree, space=''):
    if tree.results != None:
        print(str(tree.results))
    else:
        print(str(tree.colName) + ' : ' + str(tree.value)) 
        print(space + 'L ', end="") 
        printTree(tree.lb, space + ' ') 
        print(space + 'R ', end="") 
        printTree(tree.rb, space + ' ')
def test_tree(data, labels, tree):
    values = []
    pred = []
    for index, row in data.iterrows():
        values.append([index, classify(row, tree)])
        pred.append(classify(row, tree))
    indexes = labels.index.values
    correct = incorrect = 0
    for l in range(len(values)):
        if values[l][0] == indexes[l] and values[l][1] == labels[indexes[l]]:
            correct += 1 
        else:
            incorrect += 1 
    return incorrect, correct, np.round(100 - (incorrect / (incorrect + correct)) * 100),pred
def main():
    results = [];
    tests = 1
    for i in range(tests):
        tree = train(X_train)
        incorrect, correct, accuracy,pred = test_tree(X_test, types, tree)
        results.append(accuracy)
        print("Test " + str(i + 1) + "\n------------")
        print("Tree Generated:" + "\n")
        printTree(tree)
        print()
        print("Correctly Classified: " + str(correct) + " / " + str(correct+incorrect))
        print("Accuracy: " + str(accuracy))
        print()
    sum = 0
    for r in range(len(results)):
        sum += results[r]
    average = sum/tests
    print(metrics.confusion_matrix(types,pred))
    print(metrics.classification_report(types,pred, digits=3))
class treeNode():
    def __init__(self, col=-1, colName='', value=None, results=None, rb=None, lb=None):
        self.col = col 
        self.colName = colName; 
        self.value = value 
        self.results = results 
        self.rb = rb 
        self.lb= lb 
if __name__ == '__main__':
    main()

Test 1
------------
Tree Generated:

AST : 52.6
L ALT_median : 9.5
 L ALP_random : 42.2
  L CREA : 60.8
   L 1
   R Age : 49
    L 4
    R 2
  R CREA : 82.0
   L 0
   R 3
 R AST : 31.0
  L CHE : 3.44
   L 4
   R 0
  R Age : 30
   L 1
   R ALP_random : 41.6
    L 1
    R CHOL_random : 3.1
     L 2
     R PROT_random : 63.0
      L CHE : 6.06
       L 0
       R 1
      R 0
R ALB_median : 36.0
 L BIL : 9.8
  L 4
  R Age : 64
   L 3
   R 2
 R ALP_random : 47.5
  L ALP_random : 39.6
   L Age : 32
    L 1
    R CHE : 9.24
     L 2
     R 1
   R AST : 187.9
    L 2
    R 0
  R AST : 70.0
   L AST : 69.2
    L 0
    R 2
   R AST : 285.8
    L 3
    R 1

Correctly Classified: 162 / 185
Accuracy: 88.0

[[150   0   0   2   1]
 [  3   5   0   0   2]
 [  3   1   2   2   0]
 [  2   0   3   5   1]
 [  1   1   0   1   0]]
              precision    recall  f1-score   support

           0      0.943     0.980     0.962       153
           1      0.714     0.500     0.588        10
           2      