In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

# Read dataset
data = pd.read_csv('Indian_liver_patient.csv')

In [13]:
# Encode the gender column
data.gender = data.gender.map({'Male': 1, 'Female': 0})

In [14]:
data.fillna(data.median(), inplace=True)
data.isnull().sum()

age                 0
gender              0
tot_bilirubin       0
direct_bilirubin    0
tot_proteins        0
albumin             0
ag_ratio            0
sgpt                0
sgot                0
alkphos             0
is_patient          0
dtype: int64

In [15]:
# Assuming the last column contains labels
labels = data.is_patient.values

# Dropping labels from features
features = data.drop(data.columns[-1], axis=1)

In [16]:
class WOA_LiverDisease:
    def __init__(self, max_iter=200, population_size=20, a=0.5, b=1):
        self.max_iter = max_iter
        self.population_size = population_size
        self.a = a
        self.b = b

    def fit(self, data, labels):
        self.data = data
        self.labels = labels
        # Initialize population with all features selected
        self.population = np.ones((self.population_size, data.shape[1]))
        
        for epoch in range(self.max_iter):
            # Update population
            self.update_population()

        # Select best solution
        best_fitness = min(self.evaluate_fitness())
        best_index = np.argmin(self.evaluate_fitness())
        self.best_solution = self.population[best_index]
        self.best_features = self.data.columns[self.best_solution.astype(bool)].tolist()

    def update_population(self):
        for i in range(self.population_size):
            r = np.random.rand()
            A = 2 * self.a * r - self.a
            C = 2 * r
            p = np.random.rand()

            if p < 0.5:
                if np.abs(A) < 1:
                    self.population[i] = self.search_preys(A, C, i)
                else:
                    rand_leader_index = np.random.randint(0, self.population_size)
                    rand_leader = self.population[rand_leader_index]
                    self.population[i] = rand_leader + A * (rand_leader - self.population[i])
            else:
                distance_to_leader = np.abs(self.population[i] - self.population[0])
                self.population[i] = distance_to_leader * np.exp(self.b * C) * np.cos(2 * np.pi * C) + self.population[0]

    def evaluate_fitness(self):
        fitness = []
        for solution in self.population:
            selected_features = self.data.columns[solution.astype(bool)].tolist()
            # Here, you would apply your liver disease prediction model to evaluate the fitness of each solution
            # For demonstration purposes, let's assume a simple fitness based on the difference between predicted and actual labels
            # Random prediction for demonstration
            predicted_labels = np.random.randint(0, 2, size=len(self.labels))  
            fitness.append(np.abs(predicted_labels - self.labels).sum())
        return fitness
        

    def search_preys(self, A, C, i):
        # Ensure that the solution remains unchanged
        return self.population[i]

In [17]:
# Initialize and fit the model
woa_liver_disease = WOA_LiverDisease()
woa_liver_disease.fit(features, labels)

In [18]:
# Get the best solution and selected features
best_solution = woa_liver_disease.best_solution
selected_features = woa_liver_disease.best_features

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features[selected_features], labels, test_size=0.2, random_state=42)

# Train your classification model using the selected features
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict labels for the test set
predicted_labels = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)

In [20]:
print("Best solution:", best_solution)
print("Selected features:", selected_features)
print("Accuracy:", accuracy*100)

Best solution: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Selected features: ['age', 'gender', 'tot_bilirubin', 'direct_bilirubin', 'tot_proteins', 'albumin', 'ag_ratio', 'sgpt', 'sgot', 'alkphos']
Accuracy: 73.50427350427351
