In [2]:
# pip install scikit-learn

In [3]:
# Importing Dependencies
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Read Raw Dataset
df = pd.read_excel(r'C:\Users\21099\Downloads\Disease Prediction System ML\Disease Prediction System ML\raw_data.xlsx')

In [5]:
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [6]:
# Fill all NaN with the values above
data = df.fillna(method='ffill')

In [7]:
df = pd.DataFrame(data)

# Function to summarize DataFrame
def summarize_dataframe(df):
    summary = {
        'Number of Rows': df.shape[0],
        'Number of Columns': df.shape[1],
        'Column Names': df.columns.tolist(),
        'Data Types': df.dtypes.tolist(),
        'Summary Statistics': df.describe().transpose(),
        'Missing Values': df.isnull().sum(),
        'Unique Values': df.nunique(),
        'Top Values': df.mode().iloc[0],
    }
    return summary

df_summary = summarize_dataframe(df)

for key, value in df_summary.items():
    print(f'{key}:')
    print(value)
    print('\n')

# The data below summarises the data of the.xls sheet

Number of Rows:
1866


Number of Columns:
3


Column Names:
['Disease', 'Count of Disease Occurrence', 'Symptom']


Data Types:
[dtype('O'), dtype('float64'), dtype('O')]


Summary Statistics:
                              count        mean         std   min   25%  \
Count of Disease Occurrence  1866.0  253.405145  354.214115  42.0  92.0   

                               50%    75%     max  
Count of Disease Occurrence  144.0  280.0  3363.0  


Missing Values:
Disease                        0
Count of Disease Occurrence    0
Symptom                        0
dtype: int64


Unique Values:
Disease                        134
Count of Disease Occurrence    103
Symptom                        401
dtype: int64


Top Values:
Disease                           UMLS:C0005586_bipolar disorder
Count of Disease Occurrence                                 68.0
Symptom                        UMLS:C0392680_shortness of breath
Name: 0, dtype: object




In [8]:
data.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall


In [9]:
# Process Disease and Symptom Names
def process_data(data):
    data_list = []
    data_name = data.replace('^','_').split('_')
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names)
        n += 1
    return data_list

In [10]:
# Data Cleanup
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in data.iterrows():
    
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_data(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

In [11]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list,
            {'hypertensive disease': ['pain chest',
              'shortness of breath',
              'dizziness',
              'asthenia',
              'fall',
              'syncope',
              'vertigo',
              'sweat',
              'sweating increased',
              'palpitation',
              'nausea',
              'angina pectoris',
              'pressure chest'],
             'diabetes': ['polyuria',
              'polydypsia',
              'shortness of breath',
              'pain chest',
              'asthenia',
              'nausea',
              'orthopnea',
              'rale',
              'sweat',
              'sweating increased',
              'unresponsiveness',
              'mental status changes',
              'vertigo',
              'vomiting',
              'labored breathing'],
             'depression mental': ['feeling suicidal',
              'suicidal',
              'hallucinations auditory',
              'feel

In [12]:
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count

{'hypertensive disease': 3363.0,
 'diabetes': 1421.0,
 'depression mental': 1337.0,
 'depressive disorder': 1337.0,
 'coronary arteriosclerosis': 1284.0,
 'coronary heart disease': 1284.0,
 'pneumonia': 1029.0,
 'failure heart congestive': 963.0,
 'accident\xa0cerebrovascular': 885.0,
 'asthma': 835.0,
 'myocardial infarction': 759.0,
 'hypercholesterolemia': 685.0,
 'infection': 630.0,
 'infection urinary tract': 597.0,
 'anemia': 544.0,
 'chronic obstructive airway disease': 524.0,
 'dementia': 504.0,
 'insufficiency renal': 445.0,
 'confusion': 408.0,
 'degenerative\xa0polyarthritis': 405.0,
 'hypothyroidism': 398.0,
 'anxiety state': 390.0,
 'malignant neoplasms': 354.0,
 'primary malignant neoplasm': 354.0,
 'acquired\xa0immuno-deficiency syndrome': 350.0,
 'HIV': 350.0,
 'hiv infections': 350.0,
 'cellulitis': 341.0,
 'gastroesophageal reflux disease': 325.0,
 'septicemia': 311.0,
 'systemic infection': 311.0,
 'sepsis (invertebrate)': 311.0,
 'deep vein thrombosis': 310.0,
 'deh

In [14]:
# Save cleaned data as CSV
f = open(r'C:\Users\21099\Downloads\Disease Prediction System ML\Disease Prediction System ML\cleaned_data.csv', 'w')

with f:
    writer = csv.writer(f)
    for key, val in disease_symptom_dict.items():
        for i in range(len(val)):
            writer.writerow([key, val[i], disease_symptom_count[key]])

In [20]:
# Read Cleaned Data as DF
df = pd.read_csv('C:\\Users\\21099\\Downloads\\Disease Prediction System ML\\Disease Prediction System ML\\cleaned_data.csv', encoding='latin1')
df.columns = ['disease', 'symptom', 'occurence_count']
df.head()

Unnamed: 0,disease,symptom,occurence_count
0,hypertensive disease,shortness of breath,3363.0
1,hypertensive disease,dizziness,3363.0
2,hypertensive disease,asthenia,3363.0
3,hypertensive disease,fall,3363.0
4,hypertensive disease,syncope,3363.0


In [21]:
# Remove any rows with empty values
df.replace(float('nan'), np.nan, inplace=True)
df.dropna(inplace=True)

In [22]:
n_unique = len(df['symptom'].unique())
n_unique

404

In [23]:
df.dtypes

disease             object
symptom             object
occurence_count    float64
dtype: object

In [24]:
# Converting data into numerical data so that it can be used for processing.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['symptom'])
print(integer_encoded)

[328  87  28 ... 361 130 122]


In [None]:
# One-hot encoding is a technique used to represent categorical variables as binary vectors. In one-hot encoding, each category is represented as a binary vector where all elements are zero except for the index corresponding to the category, which is set to one.

# For example, consider a categorical variable "Color" with three categories: "Red", "Green", and "Blue". After one-hot encoding, the categories are represented as follows:

# "Red" is represented as [1, 0, 0]
# "Green" is represented as [0, 1, 0]
# "Blue" is represented as [0, 0, 1]

In [26]:
onehot_encoder = OneHotEncoder()
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

  (0, 328)	1.0
  (1, 87)	1.0
  (2, 28)	1.0
  (3, 112)	1.0
  (4, 359)	1.0
  (5, 393)	1.0
  (6, 355)	1.0
  (7, 356)	1.0
  (8, 263)	1.0
  (9, 235)	1.0
  (10, 20)	1.0
  (11, 289)	1.0
  (12, 283)	1.0
  (13, 281)	1.0
  (14, 328)	1.0
  (15, 257)	1.0
  (16, 28)	1.0
  (17, 235)	1.0
  (18, 250)	1.0
  (19, 304)	1.0
  (20, 355)	1.0
  (21, 356)	1.0
  (22, 384)	1.0
  (23, 219)	1.0
  (24, 393)	1.0
  :	:
  (2101, 168)	1.0
  (2102, 86)	1.0
  (2103, 402)	1.0
  (2104, 76)	1.0
  (2105, 392)	1.0
  (2106, 54)	1.0
  (2107, 353)	1.0
  (2108, 365)	1.0
  (2109, 38)	1.0
  (2110, 396)	1.0
  (2111, 194)	1.0
  (2112, 200)	1.0
  (2113, 15)	1.0
  (2114, 110)	1.0
  (2115, 331)	1.0
  (2116, 104)	1.0
  (2117, 273)	1.0
  (2118, 118)	1.0
  (2119, 148)	1.0
  (2120, 35)	1.0
  (2121, 35)	1.0
  (2122, 296)	1.0
  (2123, 361)	1.0
  (2124, 130)	1.0
  (2125, 122)	1.0


In [27]:
onehot_encoded[0]

<1x404 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [28]:
cols = np.asarray(df['symptom'].unique())
cols

array(['shortness of breath', 'dizziness', 'asthenia', 'fall', 'syncope',
       'vertigo', 'sweat', 'sweating increased', 'palpitation', 'nausea',
       'angina pectoris', 'pressure chest', 'polyuria', 'polydypsia',
       'pain chest', 'orthopnea', 'rale', 'unresponsiveness',
       'mental status changes', 'vomiting', 'labored breathing',
       'feeling suicidal', 'suicidal', 'hallucinations auditory',
       'feeling hopeless', 'weepiness', 'sleeplessness',
       'motor retardation', 'irritable mood', 'blackout',
       'mood depressed', 'hallucinations visual', 'worry', 'agitation',
       'tremor', 'intoxication', 'verbal auditory hallucinations',
       'energy increased', 'difficulty', 'nightmare',
       'unable to concentrate', 'homelessness', 'hypokinesia',
       'dyspnea on exertion', 'chest tightness', 'cough', 'fever',
       'decreased translucency', 'productive cough', 'pleuritic pain',
       'yellow sputum', 'breath sounds decreased', 'chill', 'rhonchus',
       '

In [29]:
# Creating a new dataframe to save OHE labels
df_ohe = pd.DataFrame(columns = cols)
df_ohe.head()

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts


In [30]:
for i in range(onehot_encoded.shape[0]):
    df_ohe.loc[i] = onehot_encoded[i].toarray()[0]


In [32]:
df_ohe.head()

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
len(df_ohe)

2126

In [34]:
# Disease Dataframe
df_disease = df['disease']
df_disease.head()

0    hypertensive disease
1    hypertensive disease
2    hypertensive disease
3    hypertensive disease
4    hypertensive disease
Name: disease, dtype: object

In [35]:
# Concatenate OHE Labels with the Disease Column
df_concat = pd.concat([df_disease,df_ohe], axis=1)
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
df_concat.drop_duplicates(keep='first',inplace=True)

In [37]:
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
len(df_concat)

2097

In [39]:
cols = df_concat.columns
cols

Index(['disease', 'shortness of breath', 'dizziness', 'asthenia', 'fall',
       'syncope', 'vertigo', 'sweat', 'sweating increased', 'palpitation',
       ...
       'feces in rectum', 'prodrome', 'hypoproteinemia',
       'alcohol binge episode', 'abdomen acute', 'air fluid level',
       'catching breath', 'large-for-dates fetus', 'immobile',
       'homicidal thoughts'],
      dtype='object', length=405)

In [40]:
cols = cols[1:]

In [41]:
# Since, every disease has multiple symptoms, combine all symptoms per disease per row
df_concat = df_concat.groupby('disease').sum()
df_concat = df_concat.reset_index()
df_concat[:5]

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,Alzheimer's disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,HIV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Pneumocystis carinii pneumonia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,accident cerebrovascular,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,acquired immuno-deficiency syndrome,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
len(df_concat)

149

In [44]:
df_concat.to_csv(r"C:\Users\21099\Downloads\Disease Prediction System ML\Disease Prediction System ML\training_data.csv", index=False)

In [45]:
# One Hot Encoded Features
X = df_concat[cols]

# Labels
y = df_concat['disease']

In [46]:
# MODEL TRAINING

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [48]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [49]:
len(X_train), len(y_train)

(119, 119)

In [50]:
len(X_test), len(y_test)

(30, 30)

In [51]:
dt = DecisionTreeClassifier()
clf_dt=dt.fit(X, y)

In [52]:
clf_dt.score(X, y)

0.9731543624161074

In [44]:
export_graphviz(dt, 
                out_file='./tree.dot', 
                feature_names=cols)

In [53]:
# from graphviz import Source
# from sklearn import tree

# # graph = Source(export_graphviz(dt,
#                 out_file=None, 
#                 feature_names=cols))

# png_bytes = graph.pipe(format='png')

# with open('/Users/shubham/Desktop/Disease Prediction System ML/tree.png', 'wb') as f:
#     f.write(png_bytes)

In [54]:
# from IPython.display import Image
# Image(png_bytes)

In [55]:
# disease_pred = clf_dt.predict(X)

In [48]:
# disease_real = y.values

In [49]:
# for i in range(0, len(disease_real)):
#     if disease_pred[i]!=disease_real[i]:
#         print ('Pred: {0}\nActual: {1}\n'.format(disease_pred[i], disease_real[i]))

In [64]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import os

# Increase figure size for better readability
plt.figure(figsize=(50, 40))

# Plot the decision tree
plot_tree(dt, 
          feature_names=cols, 
          class_names=dt.classes_, 
          filled=True, 
          fontsize=10, 
          max_depth=3, 
          rounded=True, 
          precision=2, 
          proportion=False, 
          node_ids=True)

# Define the path to save the image
output_path = 'C:\\Users\\21099\\Downloads\\Disease Prediction System ML\\Disease Prediction System ML\\tree_visualization_improved.png'

# Ensure the output directory exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save the plot with high DPI (dots per inch) for better resolution
plt.savefig(output_path, dpi=600, bbox_inches='tight')

# Optionally, display the plot. This might be impractical for very large trees
plt.show()


InvalidParameterError: The 'feature_names' parameter of plot_tree must be an instance of 'list' or None. Got Index(['shortness of breath', 'dizziness', 'asthenia', 'fall', 'syncope',
       'vertigo', 'sweat', 'sweating increased', 'palpitation', 'nausea',
       ...
       'feces in rectum', 'prodrome', 'hypoproteinemia',
       'alcohol binge episode', 'abdomen acute', 'air fluid level',
       'catching breath', 'large-for-dates fetus', 'immobile',
       'homicidal thoughts'],
      dtype='object', length=404) instead.

<Figure size 5000x4000 with 0 Axes>

In [65]:
disease_pred = dt.predict(X_test)

# Define disease_real using the actual labels in the test set
disease_real = y_test.values

# Compare predicted and actual labels
for i in range(len(disease_real)):
    if disease_pred[i] != disease_real[i]:
        print('Pred: {0}\nActual: {1}\n'.format(disease_pred[i], disease_real[i]))


Pred: malignant neoplasms
Actual: primary malignant neoplasm



In [66]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict disease labels on the test set
disease_pred = dt.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, disease_pred) * 100
print("Accuracy:", "{:.2f}%".format(accuracy))

# Calculate precision
precision = precision_score(y_test, disease_pred, average='weighted') * 100
print("Precision:", "{:.2f}%".format(precision))

# Calculate recall
recall = recall_score(y_test, disease_pred, average='weighted') * 100
print("Recall:", "{:.2f}%".format(recall))

# Calculate F1-score
f1 = f1_score(y_test, disease_pred, average='weighted') * 100
print("F1-Score:", "{:.2f}%".format(f1))


Accuracy: 96.67%
Precision: 96.67%
Recall: 96.67%
F1-Score: 96.67%


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
# Input symptoms
input_symptoms = ['fever', 'dizziness', 'shortness of breath']

# Initialize a zero vector with the length equal to the number of unique symptoms
symptoms_vector = np.zeros(len(cols))

# Update the vector to have 1s for any symptoms that are input by the user
for symptom in input_symptoms:
    if symptom in label_encoder.classes_:  # Check if the symptom was seen during training
        # Find the index for the symptom and set it to 1
        encoded_index = label_encoder.transform([symptom])[0]
        symptoms_vector[encoded_index] = 1

# Note: The reshaping to 2D array is done because the model expects it
reshaped_vector = symptoms_vector.reshape(1, -1)

# Predict the disease using the decision tree model
predicted_disease = clf_dt.predict(reshaped_vector)

print("Predicted Disease:", predicted_disease[0])

# Checking if the prediction is in the training set is not a valid approach for evaluating accuracy.
# Instead, the accuracy should have been evaluated on the test set during model validation phase.
# Here's how you correctly calculate the accuracy on the test set (this should be done separately, not with each prediction):
accuracy = clf_dt.score(X_test, y_test)
print("Accuracy on test set:", "{:.2f}%".format(accuracy * 100))


Predicted Disease: hepatitis
Accuracy on test set: 96.67%




In [68]:
from sklearn.metrics import classification_report, confusion_matrix

# Generate classification report
print("Classification Report:")
print(classification_report(disease_real, disease_pred))

# Generate confusion matrix
print("Confusion Matrix:")
cm = confusion_matrix(disease_real, disease_pred)
print(cm)


Classification Report:
                                 precision    recall  f1-score   support

                            HIV       1.00      1.00      1.00         1
               bipolar disorder       1.00      1.00      1.00         1
                     cellulitis       1.00      1.00      1.00         1
                      cirrhosis       1.00      1.00      1.00         1
                        colitis       1.00      1.00      1.00         1
                      confusion       1.00      1.00      1.00         1
                       delirium       1.00      1.00      1.00         1
                       delusion       1.00      1.00      1.00         1
                       dementia       1.00      1.00      1.00         1
                   endocarditis       1.00      1.00      1.00         1
gastroesophageal reflux disease       1.00      1.00      1.00         1
                       glaucoma       1.00      1.00      1.00         1
                    hemipar

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [69]:
import pandas as pd
import numpy as np

# Provided classification report in a dictionary format for ease of processing
classification_report_dict = {
    "HIV": {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 1},
    "bipolar disorder": {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 1},
    # Add other diseases similarly...
    "primary malignant neoplasm": {"precision": 0.00, "recall": 0.00, "f1-score": 0.00, "support": 1},
    "sepsis (invertebrate)": {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 1},
    "transient ischemic attack": {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 1},
    "accuracy": 0.97,
    "macro avg": {"precision": 0.94, "recall": 0.94, "f1-score": 0.94, "support": 30},
    "weighted avg": {"precision": 0.97, "recall": 0.97, "f1-score": 0.97, "support": 30}
}

# Convert the classification report dictionary to DataFrame for easier analysis
report_df = pd.DataFrame(classification_report_dict).transpose()

# Display overall accuracy
print(f"Overall Accuracy: {classification_report_dict['accuracy'] * 100:.2f}%")

# Display macro and weighted average of precision, recall, f1-score
print(f"Macro Avg Precision: {classification_report_dict['macro avg']['precision'] * 100:.2f}%")
print(f"Macro Avg Recall: {classification_report_dict['macro avg']['recall'] * 100:.2f}%")
print(f"Macro Avg F1-Score: {classification_report_dict['macro avg']['f1-score'] * 100:.2f}%\n")

print(f"Weighted Avg Precision: {classification_report_dict['weighted avg']['precision'] * 100:.2f}%")
print(f"Weighted Avg Recall: {classification_report_dict['weighted avg']['recall'] * 100:.2f}%")
print(f"Weighted Avg F1-Score: {classification_report_dict['weighted avg']['f1-score'] * 100:.2f}%\n")

# Identifying diseases with perfect scores
perfect_scores = report_df[(report_df['precision'] == 1.0) & (report_df['recall'] == 1.0) & (report_df['f1-score'] == 1.0)]
print("Diseases with perfect prediction scores:\n", perfect_scores.index.tolist())

# Identifying diseases with room for improvement
improvement_needed = report_df[(report_df['precision'] < 1.0) | (report_df['recall'] < 1.0) | (report_df['f1-score'] < 1.0)]
print("\nDiseases with room for improvement:\n", improvement_needed.index.tolist())


Overall Accuracy: 97.00%
Macro Avg Precision: 94.00%
Macro Avg Recall: 94.00%
Macro Avg F1-Score: 94.00%

Weighted Avg Precision: 97.00%
Weighted Avg Recall: 97.00%
Weighted Avg F1-Score: 97.00%

Diseases with perfect prediction scores:
 ['HIV', 'bipolar disorder', 'sepsis (invertebrate)', 'transient ischemic attack']

Diseases with room for improvement:
 ['primary malignant neoplasm', 'accuracy', 'macro avg', 'weighted avg']
