In [1]:
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
# Define genetic algorithm parameters
population_size = 100
num_generations = 50
mutation_rate = 0.01

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('/content/latest_lung_cancer.csv')

In [5]:
X = df.drop(columns=['lung_cancer'])  # Assuming 'lung_cancer' is the target variable
y = df['lung_cancer']

In [6]:
X.fillna(X.mean(numeric_only=True), inplace=True)


In [7]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
categorical_columns = ['gender', 'smoking', 'yellow_fingers', 'anxiety', 'peer_pressure', 'chronic_disease','fatigue', 'allergy', 'alcohol_consuming', 'coughing', 'shortness_of_breath', 'swallowing_difficulty', 'chest_pain']
for col in categorical_columns:
    X[col] = label_encoder.fit_transform(X[col])


In [8]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Scale the features (optional, depending on your model)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Define the initial population
def initialize_population(population_size, num_features):
    population = []
    for _ in range(population_size):
        chromosome = [random.randint(0, 1) for _ in range(num_features)]
        population.append(chromosome)
    return population

In [11]:
# Define a fitness function (using Gaussian Naive Bayes classifier and accuracy as the metric)
def fitness(chromosome, X_train, y_train, X_test, y_test):
    selected_features = [i for i, bit in enumerate(chromosome) if bit]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature set
    X_train_selected = X_train[:, selected_features]
    X_test_selected = X_test[:, selected_features]
    # Train a Gaussian Naive Bayes classifier
    nb_classifier = GaussianNB()
    nb_classifier.fit(X_train_selected, y_train)
    # Make predictions on the test set
    y_pred = nb_classifier.predict(X_test_selected)

    # Calculate accuracy as the fitness
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [12]:
    # Define the selection function (roulette wheel selection)
def select_parents(population, X_train, y_train, X_test, y_test):
  fitness_scores = [fitness(chromosome, X_train, y_train, X_test, y_test) for chromosome in population]
  total_fitness = sum(fitness_scores)
  probabilities = [score / total_fitness for score in fitness_scores]
  selected_indices = np.random.choice(range(len(population)), size=len(population), p=probabilities)
  parents = [population[i] for i in selected_indices]
  return parents

In [13]:
# Define the crossover function (one-point crossover)
def crossover(parent1, parent2):
    crossover_point = random.randint(1, len(parent1) - 1)
    child1 = parent1[:crossover_point] + parent2[crossover_point:]
    child2 = parent2[:crossover_point] + parent1[crossover_point:]
    return child1, child2

In [14]:
# Define the mutation function (bit-flip mutation)
def mutate(chromosome):
    mutated_chromosome = [bit ^ 1 if random.random() < mutation_rate else bit for bit in chromosome]
    return mutated_chromosome
    # Main genetic algorithm loop
population = initialize_population(population_size, X.shape[1])

In [15]:
for generation in range(num_generations):
    # Evaluate the fitness of each chromosome
    fitness_scores = [fitness(chromosome, X_train, y_train, X_test, y_test) for chromosome in population]

    # Select parents for the next generation
    parents = select_parents(population, X_train, y_train, X_test, y_test)

    # Create a new population through crossover and mutation
    new_population = []
    while len(new_population) < population_size:
        parent1 = random.choice(parents)
        parent2 = random.choice(parents)
        child1, child2 = crossover(parent1, parent2)
        child1 = mutate(child1)
        child2 = mutate(child2)
        new_population.extend([child1, child2])
         # Replace the old population with the new population
    population = new_population
    # Find the best chromosome in the final population
best_chromosome = population[np.argmax([fitness(chromosome, X_train, y_train, X_test, y_test) for chromosome in population])]
# Use the best chromosome for feature selection
selected_features = [i for i, bit in enumerate(best_chromosome) if bit]
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]
# Train the final Gaussian Naive Bayes model on the selected features and evaluate it
final_nb_classifier = GaussianNB()
final_nb_classifier.fit(X_train_selected, y_train)
y_pred = final_nb_classifier.predict(X_test_selected)
final_accuracy = accuracy_score(y_test, y_pred)

print("Final Accuracy (Gaussian Naive Bayes):", final_accuracy)

Final Accuracy (Gaussian Naive Bayes): 0.967741935483871


In [16]:
import pickle
import os

In [17]:
# Save the final Gaussian Naive Bayes model to a PKL file
# # Define the file path
# model_path = 'C:/Users/personal/Desktop/Lung Cancer FYP/final_gaussian_nb_model.pkl'

# # Ensure the directory exists
# os.makedirs(os.path.dirname(model_path), exist_ok=True)

# # Save the final Gaussian Naive Bayes model to a PKL file
# print("Saving the model...")
# with open(model_path, 'wb') as model_file:
#     pickle.dump(final_nb_classifier, model_file)
# print(f"Model saved to {model_path}")

# # Verify the file exists after saving
# if os.path.exists(model_path):
#     print(f"Model file successfully saved at {model_path}")
# else:
#     print(f"Failed to save model file at {model_path}")

# # List files in the directory
# print("Files in the directory:")
# print(os.listdir(os.path.dirname(model_path)))
model_data = {
    'model': final_nb_classifier,
    'selected_features': selected_features
}

model_filename = 'final_gaussian_nb_model.pkl'
print("Saving the model and selected features...")
with open(model_filename, 'wb') as model_file:
    pickle.dump(model_data, model_file)
print(f"Model saved to {model_filename}")

Saving the model and selected features...
Model saved to final_gaussian_nb_model.pkl


In [21]:
# Load the trained Gaussian Naive Bayes model from the PKL file
# with open('C:/Users/personal/Desktop/Lung Cancer FYP/final_gaussian_nb_model.pkl', 'rb') as model_file:
with open('final_gaussian_nb_model.pkl', 'rb') as model_file:
    # loaded_gaussian_nb_model = pickle.load(model_file)
    model_data = pickle.load(model_file)
    loaded_gaussian_nb_model = model_data['model']
    selected_features = model_data['selected_features']

# Use the loaded model and selected features for predictions
X_test_selected = X_test[:, selected_features]  # Ensure you select the same features as used during training
y_pred = loaded_gaussian_nb_model.predict(X_test_selected)


# # Use the loaded model for predictions
# y_pred = loaded_gaussian_nb_model.predict(X_test_selected)