In [None]:
import pandas as pd
import numpy as np
import category_encoders as ce
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the data from the CSV file
data = pd.read_csv('diseases.csv')

# Every column after the first is treated as a symptom column
# (assumes the first column is 'Disease' -- TODO confirm against the CSV)
symptom_columns = data.columns[1:]

# Concatenate each row's non-missing symptom values into one space-separated string
data['symptoms'] = data[symptom_columns].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

# Drop the original symptom columns
data = data.drop(symptom_columns, axis=1)

# Group the data by disease and concatenate the symptoms of all its rows
grouped_data = data.groupby('Disease')['symptoms'].apply(lambda x: ' '.join(x)).reset_index()

# Rename the 'symptoms' column to 'combined_symptoms';
# from here on `data` holds exactly one row per disease
data = grouped_data.rename(columns={'symptoms': 'combined_symptoms'})

# Binary encoding
# Split the combined string into one column per symptom. The pattern is a raw
# string so that `\s` reaches the regex engine instead of being parsed as an
# (invalid) string escape sequence.
# NOTE(review): splitting on exactly two whitespace characters only separates
# individual symptoms if the raw CSV values carry their own leading/trailing
# space -- confirm against the data.
symptoms = data['combined_symptoms'].str.split(r'\s{2}', expand=True)

# Encode every symptom column using binary encoding
encoder = ce.BinaryEncoder(cols=symptoms.columns)
symptoms_encoded = encoder.fit_transform(symptoms)

# Put the disease column back in front of the encoded symptoms
disease = data['Disease']
encoded_data = pd.concat([disease, symptoms_encoded], axis=1)

# Group by disease and sum the encoded columns.
# `data` already has one row per disease, so this leaves the values unchanged.
grouped_data = encoded_data.groupby('Disease').sum().reset_index()

# Drop the disease column and convert the symptom data to an array
X = grouped_data.drop('Disease', axis=1).values

# Create a dictionary mapping disease names to integer class ids
disease_map = {name: i for i, name in enumerate(grouped_data['Disease'].values)}

# Encode the target variable as integers
y = data['Disease'].map(disease_map).values

# Split the dataset into training and testing sets.
# random_state pins the shuffle so that repeated runs are comparable.
# NOTE(review): the feature matrix holds one row per disease and every row has
# its own label, so no class in the test split is ever seen during training.
# The reported test accuracy is therefore not meaningful until the data keeps
# several samples per disease.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.29, random_state=42)

# The labels are integer class ids (one per disease), so the output layer
# needs one unit per disease.
num_classes = len(disease_map)

# Define the model
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(16, activation='relu'))
# Softmax yields one probability per disease; a single sigmoid unit can only
# model a binary target.
model.add(Dense(num_classes, activation='softmax'))

# Compile the model.
# sparse_categorical_crossentropy takes integer class ids directly, so y does
# not need to be one-hot encoded.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit the model on the training data
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on the testing data
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

# Make predictions on new cases.
# A new case must have exactly as many features as the encoded training data
# (X_train.shape[1]); a hand-written vector of any other length cannot be fed
# to the model. Held-out rows are used as example inputs here. Real new cases
# have to be split and passed through the fitted `encoder` the same way the
# training data was.
new_cases = X_test[:2]
probabilities = model.predict(new_cases)
print('Probabilities:', probabilities)

# Translate the most probable class id of each case back to its disease name
index_to_disease = {index: name for name, index in disease_map.items()}
predicted_diseases = [index_to_disease[int(i)] for i in probabilities.argmax(axis=1)]
print('Predicted diseases:', predicted_diseases)