In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# load the csv file into a pandas DataFrame
df = pd.read_csv('diseases.csv')

# group the DataFrame by the disease column and concatenate the symptom columns
df_grouped = df.groupby('Disease').agg(lambda x: ','.join(set(x.dropna())))

# reset the index to make the disease column a regular column
df_grouped = df_grouped.reset_index()

# Loop through each row
for index, row in df_grouped.iterrows():
    # Create an empty list to store unique symptoms for the current disease
    unique_symptoms = []
    
    # Loop through each symptom column for the current disease
    for col in df_grouped.columns[1:]:
        # Check if the symptom column has multiple symptoms separated by a comma
        if ',' in row[col]:
            # Split the symptoms by comma and add them to the unique symptoms list
            for symptom in row[col].split(', '):
                if symptom not in unique_symptoms:
                    unique_symptoms.append(symptom)
        else:
            # If the symptom column has only one symptom, add it to the unique symptoms list
            if row[col] not in unique_symptoms:
                unique_symptoms.append(row[col])
    
    # Update the symptom columns for the current disease with the unique symptoms
    df_grouped.loc[index, df_grouped.columns[1:]] = ','.join(unique_symptoms)

# Drop others columns
df_grouped = df_grouped.iloc[:, :2]

# Rename Disease to disease and Symptom_1 to symptoms
df_grouped = df_grouped.rename(columns={'Disease': 'disease'})
df_grouped = df_grouped.rename(columns={'Symptom_1': 'symptoms'})

#Clean blankspaces in symptoms
df_grouped['symptoms'] = df_grouped['symptoms'].str.split(',').apply(lambda x: [s.strip() for s in x]).str.join(',')

# Instantiate the CountVectorizer
count_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))

# Fit and transform the symptoms column
symptoms_encoded = count_vectorizer.fit_transform(df_grouped['symptoms'])

# Create a new DataFrame with the encoded symptoms
symptoms_df = pd.DataFrame(symptoms_encoded.toarray(), columns=count_vectorizer.get_feature_names_out())

# Concatenate the new symptoms DataFrame with the original DataFrame
df_encoded = pd.concat([df_grouped.drop(columns=['symptoms']), symptoms_df], axis=1)

print(df_encoded.dtypes)


disease                                                                                                                                                                                                                                                                                 object
back_pain,weakness_in_limbs,neck_pain,dizziness,dizziness,loss_of_balance,loss_of_balance,                                                                                                                                                                                               int64
bladder_discomfort,burning_micturition,foul_smell_of urine,continuous_feel_of_urine,                                                                                                                                                                                                     int64
breathlessness,vomiting,sweating,chest_pain,                                                                                               

In [None]:
X = df_encoded.iloc[:, 1:]
y = df_encoded.iloc[:, 0]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.29)

# Define the model
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model on the training data
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on the testing data
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

# Make predictions on new cases
new_cases = np.array([[1, 0, 0, 1, 0], [0, 0, 1, 1, 1]])
new_cases = pd.get_dummies(pd.DataFrame(new_cases, columns=symptoms_encoded.columns))
probabilities = model.predict(new_cases)
print('Probabilities:', probabilities)