DATA LOADING AND EXPLORATION

In [None]:
# Import Pandas, numpy, matplotlib and seaborn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Read the Disease Symptom CSV into a DataFrame and preview its first rows.
csv_path = "disease_symptoms_dataset.csv"
disease = pd.read_csv(csv_path)
print(disease.head())

In [None]:
# Basic data set parameters: shape, column dtypes, and missing values.
print("Shape: ", disease.shape)
print("\n")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
disease.info()
print("\n")

# 790 entries, 10 columns.

# Exact missing value count per column
missing_values = disease.isnull().sum()
print("Missing Values: \n", missing_values)

# Symptom columns gain more null values with each column iteration (expected).
# Severity has a high number of null values.  Investigate this further.

# Visualize missing values with seaborn: one row per record, one column per
# field, with null cells drawn in a contrasting colour.
sns.heatmap(disease.isnull(), cbar=False, cmap='viridis')
plt.title("Heatmap of Missing Values")
plt.show()

In [None]:
# Based on the missing values distribution and data type of the column, it would appear that
# Symptom_4 is holding data for column Severity.  Missing Severity values are filled from
# Symptom_4, and Symptom_4 is then removed.
#
# The guard makes the cell safe to re-run: once Symptom_4 has been dropped, the merge is
# skipped instead of raising a KeyError.
if "Symptom_4" in disease.columns:
    disease = (
        disease
        .assign(Severity=disease["Severity"].fillna(disease["Symptom_4"]))
        .drop(columns=["Symptom_4"])
    )
print(disease.head())
print("\n")

# Check missing values for Severity now, and make sure Severity is still a float type.
missing_values = disease.isnull().sum()
print("Missing Values: \n", missing_values)
print("\n")
# DataFrame.info() prints its report itself and returns None; do not wrap it in print().
disease.info()

# Author's observation: all missing values of Severity are gone, and it is still a float column.

In [None]:
# Show how often each value occurs in every symptom predictor column.
for position, column in enumerate(("Symptom_1", "Symptom_2", "Symptom_3"), start=1):
    print(f"Symptom {position}: \n")
    print(disease[column].value_counts())
    print("\n")

# Some symptoms are shared between columns, so the set of possible symptoms
# in one column is not distinct from the sets in the others.

# Replace the remaining missing values with the string "None".
# NOTE(review): fillna is applied to the whole frame, not only the symptom
# columns -- confirm that no other column still holds nulls at this point.
disease = disease.fillna("None")
remaining_nulls = disease.isnull().sum()
print("Missing Values: \n", remaining_nulls)

In [None]:
# Look at how the records are spread across the disease classes.
target = "Disease"
print(disease[target].value_counts())

# Bar chart of records per disease; x tick labels are hidden.
count_ax = sns.countplot(data=disease, x=target)
count_ax.set_title("Disease Variable Distribution")
count_ax.set_xticks([])
plt.show()

# Author's observation from the plot and the value counts: many diseases have
# only a single entry, and many more than that have under 10.

VARIABLE PREPROCESSING

In [None]:
# Majority of variables are categorical.  Import sklearn label encoder for binary variables, use
# one-hot encoding for multiclass.
from sklearn.preprocessing import LabelEncoder

# Create a real duplicate of disease as a checkpoint variable.  A plain
# assignment would only bind a second name to the same DataFrame, so the
# in-place column encoding that follows would also overwrite `disease`.
disease_tr = disease.copy()

In [None]:
# Encode Gender with its own encoder so its fitted class mapping stays
# available (gender_enc.classes_ / gender_enc.inverse_transform) instead of
# being overwritten when the Disease labels are fitted below.
gender_enc = LabelEncoder()
disease_tr["Gender"] = gender_enc.fit_transform(disease_tr["Gender"])

# One-hot encode the multiclass symptom columns with get_dummies
multiclass = ["Symptom_1", "Symptom_2", "Symptom_3"]
disease_tr = pd.get_dummies(disease_tr, columns=multiclass)

# Encode the disease label.  `lb_enc` keeps its original name and ends up
# fitted on Disease, so integer labels can be mapped back to disease names
# with lb_enc.inverse_transform.
lb_enc = LabelEncoder()
disease_tr["Disease"] = lb_enc.fit_transform(disease_tr["Disease"])

In [None]:
# Standardize the latitude and longitude columns with a StandardScaler.
from sklearn.preprocessing import StandardScaler

lat_long = ["Latitude", "Longitude"]
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row of disease_tr, before the
# train/test split is made -- confirm this is acceptable for the evaluation.
scaler.fit(disease_tr[lat_long])
disease_tr[lat_long] = scaler.transform(disease_tr[lat_long])

In [None]:
# In order to train the model with a stratified split, classes with only 1 entry must be
# dropped.  Keep only rows whose disease class is among the 10 most common diseases AND has
# at least MIN_SAMPLES_PER_CLASS rows, so a singleton class can never reach the split even
# if it happens to rank in the top 10.
TOP_N_DISEASES = 10
MIN_SAMPLES_PER_CLASS = 2

top_counts = disease_tr['Disease'].value_counts().nlargest(TOP_N_DISEASES)
top_10_diseases = top_counts[top_counts >= MIN_SAMPLES_PER_CLASS].index

# Filter the dataset to the retained classes; .copy() gives an independent frame rather
# than a slice of the unfiltered one.
disease_tr = disease_tr[disease_tr['Disease'].isin(top_10_diseases)].copy()

TRAINING

In [None]:
# Separate the target vector from the feature matrix.
y = disease_tr["Disease"]
x = disease_tr.drop(columns=["Disease"])

In [None]:
# Hold out 20% of the rows for testing, stratified on the disease label,
# with a fixed seed so the split is repeatable.
from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.2, "stratify": y, "random_state": 76}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)


In [None]:
# Fit a random forest classifier on the training split.
from sklearn.ensemble import RandomForestClassifier

# 100 trees, fixed seed for repeatable results
rf_settings = {"n_estimators": 100, "random_state": 42}
dis_rf = RandomForestClassifier(**rf_settings)

# Train the model (last expression, so the fitted estimator is displayed)
dis_rf.fit(x_train, y_train)

In [None]:
# Score the fitted forest on the held-out test split.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predicted labels for the test features
y_pred = dis_rf.predict(x_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
report = classification_report(y_test, y_pred)
print(report)

# Author's observation: the model perfectly identifies the test cases.

In [None]:
# Create the confusion matrix with an explicit, sorted label list so that the
# matrix rows/columns and the tick labels are guaranteed to line up.
# (Disease.unique() returns labels in order of appearance and includes every
# class in disease_tr, while confusion_matrix orders its axes by the sorted
# labels that actually occur in y_test / y_pred.)
labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)

# Map the integer labels back to disease names for readable ticks; lb_enc is
# the LabelEncoder that was last fitted on the Disease column.
class_names = list(lb_enc.inverse_transform(labels))

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)

# Add labels and title
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Pull the per-feature importances out of the fitted forest.
feature_importances = dis_rf.feature_importances_

# Pair each feature name with its importance, most important first.
importance_table = pd.DataFrame({'Feature': x.columns, 'Importance': feature_importances})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

# Keep the ten highest-ranked features
top_10_features = feature_importance.head(10)

# Display the top 10 features
print("top 10 features: \n", top_10_features)

# Horizontal bar chart of the top 10 features
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_10_features['Feature'], top_10_features['Importance'])
ax.invert_yaxis()  # most important feature at the top
ax.set_title('Top 10 Most Important Features')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
plt.show()

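The takeaway appears to be that the most important predictors for distinguishing between diseases are the severity of the symptoms, location (latitude and longitude), age of the patient, and the presence of several common symptoms (nausea, cough, headache, itchy eyes, sneezing, and joint pain).  Note that random forest feature importances are computed across all classes, so they show which features separate the diseases overall rather than which features predict any single disease.  It is also important to note that these are very generic symptoms often associated with the common cold, yet different combinations of them could also indicate several other diseases (flu, allergies, etc.).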