In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

tf.config.run_functions_eagerly(True)

In [2]:
# load the csv file into a pandas DataFrame
df = pd.read_csv('diseases.csv')

# group the DataFrame by the disease column and concatenate the symptom columns
df_grouped = df.groupby('Disease').agg(lambda x: ','.join(set(x.dropna())))

# reset the index to make the disease column a regular column
df_grouped = df_grouped.reset_index()

# Loop through each row
for index, row in df_grouped.iterrows():
    # Create an empty list to store unique symptoms for the current disease
    unique_symptoms = []
    
    # Loop through each symptom column for the current disease
    for col in df_grouped.columns[1:]:
        # Check if the symptom column has multiple symptoms separated by a comma
        if ',' in row[col]:
            # Split the symptoms by comma and add them to the unique symptoms list
            for symptom in row[col].split(', '):
                if symptom not in unique_symptoms:
                    unique_symptoms.append(symptom)
        else:
            # If the symptom column has only one symptom, add it to the unique symptoms list
            if row[col] not in unique_symptoms:
                unique_symptoms.append(row[col])
    
    # Update the symptom columns for the current disease with the unique symptoms
    df_grouped.loc[index, df_grouped.columns[1:]] = ','.join(unique_symptoms)

# Drop others columns
df_grouped = df_grouped.iloc[:, :2]

# Rename Disease to disease and Symptom_1 to symptoms
df_grouped = df_grouped.rename(columns={'Disease': 'disease'})
df_grouped = df_grouped.rename(columns={'Symptom_1': 'symptoms'})

#Clean blankspaces in symptoms
df_grouped['symptoms'] = df_grouped['symptoms'].str.split(',').apply(lambda x: [s.strip() for s in x]).str.join(',')

In [7]:
# Split the symptoms column into separate columns for each symptom
symptoms_df = df_grouped['symptoms'].str.split(',', expand=True)

# Count the occurrences of each symptom
symptoms_counts = symptoms_df.apply(pd.Series.value_counts)

# Rename the columns to indicate that they represent symptom counts
symptoms_counts.columns = [f'symptom_{col}_count' for col in symptoms_counts.columns]

# Merge the symptom counts with the disease column
df_encoded = pd.concat([df_grouped['disease'], symptoms_counts], axis=1)

In [15]:
X = df_encoded.iloc[:, 1:]
y = df_encoded.iloc[:, 0]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.29)

# remove rows with missing values
X_train = X_train.dropna()
X_test = X_test.dropna()

# convert any float columns to integer
X_train = X_train.astype(int)
X_test = X_test.astype(int)

# Define the model
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model on the training data
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on the testing data
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

# Make predictions on new cases
new_cases = np.array([[1, 0, 0, 1, 0], [0, 0, 1, 1, 1]])
new_cases = pd.get_dummies(pd.DataFrame(new_cases, columns=df_encoded.columns))
probabilities = model.predict(new_cases)
print('Probabilities:', probabilities)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [36]:
# One hot encoding
# separate symptoms into a list
symptoms = data.iloc[:, 1].str.split('\s{2}', expand=True)

# one-hot encode the symptoms
symptoms_encoded = pd.get_dummies(symptoms, prefix='', prefix_sep='')

# add disease column to the encoded symptoms
disease = data.iloc[:, 0]
encoded_data = pd.concat([disease, symptoms_encoded], axis=1)

# group by disease and sum the symptom counts
grouped_data = encoded_data.groupby('Disease').sum().reset_index()

# drop the disease column and convert the symptom data to an array
X = grouped_data.drop('Disease', axis=1).values

# create a dictionary mapping disease names to integers
disease_map = {disease: i for i, disease in enumerate(grouped_data['Disease'].values)}

# encode the target variable as integers
y = data.iloc[:, 0].map(disease_map).values

In [None]:
# Bynary encoding
# separate symptoms into a list
symptoms = data.iloc[:, 1].str.split('\s{2}', expand=True)

# encode the symptoms using binary encoding
encoder = ce.BinaryEncoder(cols=symptoms.columns)
symptoms_encoded = encoder.fit_transform(symptoms)

# add disease column to the encoded symptoms
disease = data.iloc[:, 0]
encoded_data = pd.concat([disease, symptoms_encoded], axis=1)

# group by disease and sum the symptom counts
grouped_data = encoded_data.groupby('Disease').sum().reset_index()

# drop the disease column and convert the symptom data to an array
X = grouped_data.drop('Disease', axis=1).values

# create a dictionary mapping disease names to integers
disease_map = {disease: i for i, disease in enumerate(grouped_data['Disease'].values)}

# encode the target variable as integers
y = data.iloc[:, 0].map(disease_map).values

In [None]:
# Count encode
# count encode the symptoms
encoder = ce.CountEncoder(cols=symptoms.columns)
symptoms_encoded = encoder.fit_transform(symptoms)

# add disease column to the encoded symptoms
disease = data.iloc[:, 0]
encoded_data = pd.concat([disease, symptoms_encoded], axis=1)

# group by disease and sum the symptom counts
grouped_data = encoded_data.groupby('Disease').sum().reset_index()

# drop the disease column and convert the symptom data to an array
X = grouped_data.drop('Disease', axis=1).values

# create a dictionary mapping disease names to integers
disease_map = {disease: i for i, disease in enumerate(grouped_data['Disease'].values)}

# encode the target variable as integers
y = data.iloc[:, 0].map(disease_map).values
