In [74]:
import pandas as pd
import yaml
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

#visualization
import seaborn as sn
import matplotlib.pyplot as plt

#model building
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [79]:
class PredictDisease:
    """Train, persist and apply a symptom-based disease classifier.

    The constructor reads a YAML config and loads the training and test CSV
    files named in it. ``train_model`` fits the classifier selected by
    ``model_name`` and saves it with joblib; ``make_prediction`` reloads the
    saved file and predicts on either the loaded test set or caller data.
    """

    # Default locations used by this notebook. They are kept byte-identical to
    # the previously hard-coded strings so existing saved models still resolve.
    DEFAULT_CONFIG_PATH = r"C:\Users\dell\Desktop\projects\Phillps_hac\config.yaml"
    MODEL_FILE_PREFIX = r"C:\Users\dell\Desktop\projects\Phillps_hac\saved_model"
    CORRELATION_FIG_PATH = r'C:\Users\dell\Desktop\projects\Phillps_hac\feature_correlation.png'

    # Values of ``model_name`` that ``select_model`` understands.
    SUPPORTED_MODELS = ('mnb', 'decision_tree', 'random_forest', 'gradient_boost')

    def __init__(self, model_name=None, config_path=None):
        """Load the config and both datasets.

        Args:
            model_name: One of ``SUPPORTED_MODELS``; checked when
                ``select_model`` is called.
            config_path: Optional path to the YAML config. Defaults to
                ``DEFAULT_CONFIG_PATH``.

        Raises:
            RuntimeError: If the config file cannot be read or parsed, or does
                not contain a mapping.
        """
        path = self.DEFAULT_CONFIG_PATH if config_path is None else config_path
        try:
            with open(path, 'r') as file:
                self.config = yaml.safe_load(file)
        except (OSError, yaml.YAMLError) as e:
            # Fail here with the real cause instead of printing and then
            # crashing later with an AttributeError on self.config.
            raise RuntimeError('Error reading the config file: {0}'.format(path)) from e
        if not isinstance(self.config, dict):
            raise RuntimeError('Config file did not contain a mapping: {0}'.format(path))

        # defining the parameters from config file:
        self.verbose = self.config['verbose']

        # loading train parameters:
        self.train_feat, self.train_labels, self.train_df = self._loading_train()

        # loading test parameters:
        self.test_feat, self.test_labels, self.test_df = self._loading_test()

        self.model_name = model_name

        # Read from the config and kept as an attribute; the save/load methods
        # below build their path from MODEL_FILE_PREFIX instead.
        self.model_save_path = self.config['model_save_path']

        # saving data visualization (disabled):
        # self._feature_correlation(data_frame=self.train_df, show_fig=False)

    def _model_file(self, name=None):
        """Return the joblib file path for ``name`` (default: ``self.model_name``)."""
        chosen = self.model_name if name is None else name
        return self.MODEL_FILE_PREFIX + str(chosen) + ".joblib"

    # function to load train dataset:
    def _loading_train(self):
        """Read the training CSV and split it into features and labels.

        Returns:
            Tuple ``(train_feat, train_labels, df_train)``.
        """
        df_train = pd.read_csv(self.config['dataset']['training_data_path'])
        # Per the original note: the last column is not needed and the 2nd last
        # column gives the y-labels, so both are dropped from the features.
        columns = df_train.columns[:-2]

        # the training feature vectors
        train_feat = df_train[columns]

        # the y labels for train set
        train_labels = df_train['prognosis']

        if self.verbose:
            print("Length of Training Data: ", df_train.shape)
            print("Training Features: ", train_feat.shape)
            print("Training Labels: ", train_labels.shape)

        return train_feat, train_labels, df_train

    # function to load the test data
    def _loading_test(self):
        """Read the test CSV and split it into features and labels.

        Returns:
            Tuple ``(test_feat, test_labels, df_test)``.
        """
        df_test = pd.read_csv(self.config['dataset']['test_data_path'])
        # Only the last column is dropped here; the labels are read by name below.
        columns = df_test.columns[:-1]

        # the test feature vectors
        test_feat = df_test[columns]

        # the y labels for test set
        test_labels = df_test['prognosis']

        if self.verbose:
            # These lines used to say "Training", which mislabelled the test shapes.
            print("Length of Test Data: ", df_test.shape)
            print("Test Features: ", test_feat.shape)
            print("Test Labels: ", test_labels.shape)

        return test_feat, test_labels, df_test

    # model selection
    def select_model(self):
        """Instantiate the classifier named by ``self.model_name``.

        Returns:
            The new, unfitted estimator (also stored on ``self.clf``).

        Raises:
            ValueError: If ``self.model_name`` is not in ``SUPPORTED_MODELS``.
        """
        if self.model_name not in self.SUPPORTED_MODELS:
            # Previously this fell through and failed on an unset self.clf.
            raise ValueError(
                "Unknown model_name {0!r}; expected one of {1}".format(
                    self.model_name, ", ".join(self.SUPPORTED_MODELS)))

        if self.model_name == 'mnb':
            self.clf = MultinomialNB()
        elif self.model_name == 'decision_tree':
            self.clf = DecisionTreeClassifier(criterion=self.config['model']['decision_tree']['criterion'])
        elif self.model_name == 'random_forest':
            self.clf = RandomForestClassifier(n_estimators=self.config['model']['random_forest']['n_estimators'])
        else:  # 'gradient_boost'
            self.clf = GradientBoostingClassifier(n_estimators=self.config['model']['gradient_boost']['n_estimators'],
                                                  criterion=self.config['model']['gradient_boost']['criterion'])
        return self.clf

    # Dataset Train Validation Split
    def _train_val_split(self):
        """Split the training data into train and validation parts.

        Returns:
            Tuple ``(X_train, y_train, X_val, y_val)``. Note this order differs
            from the order ``train_test_split`` itself returns.
        """
        X_train, X_val, y_train, y_val = train_test_split(self.train_feat, self.train_labels,
                                                          test_size=self.config['dataset']['validation_size'],
                                                          random_state=self.config['random_state'])

        if self.verbose:
            print("Number of Training Features: {0}\tNumber of Training Labels: {1}".format(len(X_train), len(y_train)))
            print("Number of Validation Features: {0}\tNumber of Validation Labels: {1}".format(len(X_val), len(y_val)))

        return X_train, y_train, X_val, y_val

    # ML Model
    def train_model(self):
        """Fit the selected model, report validation metrics and save it.

        The fitted estimator is written to the path given by ``_model_file()``.

        Raises:
            ValueError: If ``self.model_name`` is not supported.
        """
        # Get the Data
        X_train, y_train, X_val, y_val = self._train_val_split()
        classifier = self.select_model()
        # Training the Model
        classifier = classifier.fit(X_train, y_train)
        # Accuracy on the data the model was fitted on. This used to be scored
        # on the validation split, so the "Training Accuracy" line merely
        # repeated the validation accuracy.
        train_accuracy = classifier.score(X_train, y_train)
        # Validation Data Prediction
        y_pred = classifier.predict(X_val)
        # Model Validation Accuracy
        accuracy = accuracy_score(y_val, y_pred)
        # Model Confusion Matrix
        conf_mat = confusion_matrix(y_val, y_pred)
        # Model Classification Report
        clf_report = classification_report(y_val, y_pred)
        # 3-fold cross-validation run on the validation split only.
        score = cross_val_score(classifier, X_val, y_val, cv=3)

        if self.verbose:
            print('\nTraining Accuracy: ', train_accuracy)
            print('\nValidation Prediction: ', y_pred)
            print('\nValidation Accuracy: ', accuracy)
            print('\nValidation Confusion Matrix: \n', conf_mat)
            print('\nCross Validation Score: \n', score)
            print('\nClassification Report: \n', clf_report)

        # Save Trained Model
        joblib.dump(classifier, self._model_file())

    def _feature_correlation(self, data_frame=None, show_fig=False):
        """Save a heatmap of pairwise feature correlations.

        Args:
            data_frame: Frame whose numeric columns are correlated.
            show_fig: If true, also display the figure after saving it.
        """
        # Restrict to numeric columns: the string 'prognosis' column makes
        # DataFrame.corr() raise on recent pandas versions.
        corr = data_frame.select_dtypes(include='number').corr()
        fig, ax = plt.subplots()
        fig.set_size_inches(14, 14)
        sn.heatmap(corr, square=True, annot=False, cmap="YlGnBu", ax=ax)
        ax.set_title("Feature Correlation")
        fig.tight_layout()
        # Save before showing so the written file is never an emptied canvas.
        fig.savefig(self.CORRELATION_FIG_PATH)
        if show_fig:
            plt.show()
        else:
            plt.close(fig)

    # Function to Make Predictions on Test Data
    def make_prediction(self, saved_model_name=None, test_data=None):
        """Load a saved model and predict.

        Args:
            saved_model_name: Name of the saved model to load. Defaults to
                ``self.model_name``.
            test_data: Optional feature data. When given, only the predictions
                are returned.

        Returns:
            The predictions for ``test_data`` when it is given; otherwise the
            tuple ``(accuracy, classification_report)`` computed on the test
            set loaded in ``__init__``.

        Raises:
            FileNotFoundError: If the saved model file does not exist.
        """
        model_file = self._model_file(saved_model_name)
        try:
            # Load Trained Model
            clf = joblib.load(model_file)
        except FileNotFoundError as e:
            # Previously this printed a message and then failed on unbound `clf`.
            raise FileNotFoundError("Model not found... ({0})".format(model_file)) from e

        if test_data is not None:
            return clf.predict(test_data)

        result = clf.predict(self.test_feat)
        accuracy = accuracy_score(self.test_labels, result)
        clf_report = classification_report(self.test_labels, result)
        return accuracy, clf_report
        
    

In [95]:
if __name__=='__main__':
    # Menu number -> model_name value understood by PredictDisease.select_model.
    model_choices = {
        1: "decision_tree",
        2: "mnb",
        3: "random_forest",
        4: "gradient_boost",
    }

    print("Available models are as follows :")
    print("Press 1 for Decision Trees")
    print("Press 2 for Multinomial NB")
    print("Press 3 for RandomForestClassifier")
    print("Press 4 for GradientBoostingClassifier")

    # Start from None so a stale value left in the kernel is never reused, and
    # keep asking until the answer is valid. An invalid choice used to print a
    # message and then fail on an unbound _model_name.
    _model_name = None
    while _model_name is None:
        try:
            choice = int(input())
        except ValueError:
            # Non-numeric input is treated like any other invalid choice.
            choice = None
        _model_name = model_choices.get(choice)
        if _model_name is None:
            print("please choose valid number between 1 to 4")

    disease_model=PredictDisease(model_name=_model_name)

    #train the model
    disease_model.train_model()

    #make predictions on test data:
    test_acc,clf_report=disease_model.make_prediction(saved_model_name=_model_name)

Available models are as follows :
Press 1 for Decision Trees
Press 2 for Multinomial NB
Press 3 for RandomForestClassifier
Press 4 for GradientBoostingClassifier
1
Length of Training Data:  (4920, 134)
Training Features:  (4920, 132)
Training Labels:  (4920,)
Length of Training Data:  (42, 133)
Training Features:  (42, 132)
Training Labels:  (42,)
Number of Training Features: 3936	Number of Training Labels: 3936
Number of Validation Features: 984	Number of Validation Labels: 984

Training Accuracy:  1.0

Validation Prediction:  ['Migraine' 'Common Cold' 'Hepatitis B' 'Gastroenteritis'
 'Alcoholic hepatitis' 'Heart attack' 'Hepatitis E' 'Hypothyroidism'
 'Malaria' 'Gastroenteritis' '(vertigo) Paroymsal  Positional Vertigo'
 'Osteoarthristis' 'Impetigo' 'Hepatitis D' 'Bronchial Asthma' 'Impetigo'
 'Alcoholic hepatitis' 'Alcoholic hepatitis'
 'Paralysis (brain hemorrhage)' 'Typhoid' 'Bronchial Asthma'
 '(vertigo) Paroymsal  Positional Vertigo' 'Malaria' 'Hepatitis B'
 'Typhoid' 'AIDS' 'Hy

In [100]:
test_acc

0.9761904761904762

In [27]:
# Read the test CSV into `test_` for the per-row symptom listing.
test_ = pd.read_csv(
    r"C:\Users\dell\Desktop\projects\Phillps_hac\Disease-Prediction-from-Symptoms-master\dataset\test_data.csv"
)

In [97]:
#To find the predicted diseases from the test dataset :
# `result` was never assigned at notebook level, so this cell failed on a fresh
# kernel. Compute it here from the trained model.
# NOTE(review): assumes `test_` has the same layout as the configured test CSV
# (feature columns first, label column last) - confirm.
result = disease_model.make_prediction(test_data=test_[test_.columns[:-1]])

for i in range(test_.shape[0]):
    print("The symptoms are as follows ",end=':')
    for j in range(test_.shape[1]):
        # .iat does an explicit positional lookup; `test_.iloc[i][j]` relied on
        # integer-as-position Series indexing, which pandas has deprecated.
        if test_.iat[i, j] == 1:
            print(test_.columns[j],end=" ,")
    print("")
    print("The predicted disease is :", result[i])
    print("\n")

The symptoms are as follows :itching ,skin_rash ,nodal_skin_eruptions ,dischromic _patches ,
The predicted disease is : Fungal infection


The symptoms are as follows :continuous_sneezing ,shivering ,chills ,watering_from_eyes ,
The predicted disease is : Allergy


The symptoms are as follows :stomach_pain ,acidity ,ulcers_on_tongue ,vomiting ,cough ,chest_pain ,
The predicted disease is : GERD


The symptoms are as follows :itching ,vomiting ,yellowish_skin ,nausea ,loss_of_appetite ,abdominal_pain ,yellowing_of_eyes ,
The predicted disease is : Chronic cholestasis


The symptoms are as follows :itching ,skin_rash ,stomach_pain ,burning_micturition ,spotting_ urination ,
The predicted disease is : Drug Reaction


The symptoms are as follows :vomiting ,indigestion ,loss_of_appetite ,abdominal_pain ,passage_of_gases ,internal_itching ,
The predicted disease is : Peptic ulcer diseae


The symptoms are as follows :muscle_wasting ,patches_in_throat ,high_fever ,extra_marital_contacts ,
The