### Traitement de la base

In [None]:
# Import Packages
import pandas as pd
import csv
import seaborn as sns
import numpy as np
import networkx as nx
import graphviz
from IPython.display import display
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer, StandardScaler
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree 
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [None]:
disease_symptom_dataset_url = "http://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html"

# Find table.
table = pd.read_html(disease_symptom_dataset_url)[0]

# Save CSV file
table.to_csv("disease_symptom_dataset_unprocessed.csv", index=False)

In [None]:
table.head()

Unnamed: 0,0,1,2
0,Disease,Count of Disease Occurrence,Symptom
1,UMLS:C0020538_hypertensive disease,3363,UMLS:C0008031_pain chest
2,,,UMLS:C0392680_shortness of breath
3,,,UMLS:C0012833_dizziness
4,,,UMLS:C0004093_asthenia


In [None]:
# Data preprocessing: reload the raw scrape and normalise it into one
# (disease, occurrences, symptom) triple per row.
disease_symptom_dataset_unprocessed = pd.read_csv('disease_symptom_dataset_unprocessed.csv')

# The first CSV row repeats the table's own header ("Disease",
# "Count of Disease Occurrence", "Symptom"), so drop it and renumber from 0.
disease_symptom_dataset_unprocessed = (
    disease_symptom_dataset_unprocessed
    .drop(0, axis=0)
    .reset_index(drop=True)
)
disease_symptom_dataset_unprocessed.columns = ['disease', 'occurrences', 'symptoms']

# Only the first row of each disease carries the disease name and count;
# forward-filling copies them down onto that disease's other symptom rows.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
disease_symptom_dataset_unprocessed = disease_symptom_dataset_unprocessed.ffill()

def listToString(s):
    """Join the strings in *s* into one string separated by single spaces."""
    separator = " "
    return separator.join(s)
        
def remove_umls_code(data):
    """Return the readable names from a UMLS-coded cell, dropping the codes.

    A cell looks like ``CODE_name`` or, for several entries,
    ``CODE_name^CODE_name``. The result is the list of names in order.
    """
    # Once '^' is turned into '_', the fields alternate code, name, code, ...
    # so the names are exactly the fields at odd positions.
    fields = data.replace('^', '_').split('_')
    return fields[1::2]

def _names_or_empty(cell):
    """Return the UMLS-stripped names in *cell*, or [] for a blank cell."""
    if cell == "\xc2\xa0" or cell == "":
        return []
    return remove_umls_code(cell)


# Build three views of the long-format table in a single pass:
#   disease_symptoms             disease -> list of its symptoms
#   disease_symptoms_occurrences disease -> occurrence count
#   disease_symptom_df           one (diseases, symptoms) row per table row,
#                                used as the edge list of the network plot
diseases = []
symptoms = []
occurrences = 0
disease_symptoms = defaultdict(list)
disease_symptoms_occurrences = {}
edge_rows = []
for index, row in disease_symptom_dataset_unprocessed.iterrows():
    # Parse each cell once and reuse the result for both the dicts and the
    # edge list.
    diseases = _names_or_empty(row['disease'])
    occurrences = row['occurrences'] if (row['occurrences'] != "\xc2\xa0") and (row['occurrences'] != "") else []
    symptoms = _names_or_empty(row['symptoms'])
    edge_rows.append({
        "diseases": listToString(diseases),
        "symptoms": listToString(symptoms),
    })
    for d in diseases:
        # extend() registers the disease even when the row has no symptom,
        # so both dicts always share the same keys in the same order.
        disease_symptoms[d].extend(symptoms)
        disease_symptoms_occurrences[d] = occurrences

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; collect the rows and build the frame once instead.
disease_symptom_df = pd.DataFrame(edge_rows, columns=["diseases", "symptoms"])

# Save the forward-filled long-format table (one row per disease/symptom pair).
disease_symptom_dataset_unprocessed.to_csv("disease_symptom_dataset_processed.csv")
# Turn the disease -> symptom-list mapping into a two-column frame
# (column 0: disease, column 1: its list of symptoms) and save it.
disease_symptoms_data = pd.DataFrame.from_dict(disease_symptoms.items())
disease_symptoms_data.to_csv('disease_symptoms.csv')
# Same layout for the disease -> occurrence-count mapping
# (column 0: disease, column 1: count).
disease_symptoms_occurrences_data = pd.DataFrame.from_dict(disease_symptoms_occurrences.items())
disease_symptoms_occurrences_data.to_csv('disease_occurrences.csv')


In [None]:
disease_symptoms_occurrences_data.head() 

Unnamed: 0,0,1
0,hypertensive disease,3363
1,diabetes,1421
2,depression mental,1337
3,depressive disorder,1337
4,coronary arteriosclerosis,1284


### Visualisation réseau des maladies et symptômes

In [None]:
disease_symptom_df.to_csv("disease_symptoms_network.csv")
# 0. Figure size
plt.figure(figsize=(160, 160)).set_facecolor("w")

# 1. Create the graph: one edge per (disease, symptom) row
disease_symptom_network = nx.from_pandas_edgelist(disease_symptom_df, source='diseases', target='symptoms')

# 2. Create a layout for our nodes
layout = nx.spring_layout(disease_symptom_network, iterations=200)

# 3. Draw the parts we want
nx.draw_networkx_edges(disease_symptom_network, layout, edge_color='#AAAAAA')

# Build the two name sets once. Testing each node against a freshly computed
# .unique() array was a linear scan per node, repeated for both lists.
symptom_names = set(disease_symptom_df.symptoms.unique())
disease_names = set(disease_symptom_df.diseases.unique())

# Symptoms (green); node size grows with the number of connected diseases
symptoms_n = [node for node in disease_symptom_network.nodes() if node in symptom_names]
symptoms_size = [disease_symptom_network.degree(node) * 200 for node in symptoms_n]
nx.draw_networkx_nodes(disease_symptom_network, layout, nodelist=symptoms_n, node_size=symptoms_size, node_color='#27ae60')

# Diseases (orange); node size grows with the number of connected symptoms
diseases_n = [node for node in disease_symptom_network.nodes() if node in disease_names]
diseases_size = [disease_symptom_network.degree(node) * 200 for node in diseases_n]
nx.draw_networkx_nodes(disease_symptom_network, layout, nodelist=diseases_n, node_size=diseases_size, node_color='#e67e22')

# Label network: every node is labelled with its own name
symptoms_dict = {node: node for node in symptoms_n}
diseases_dict = {node: node for node in diseases_n}
nx.draw_networkx_labels(disease_symptom_network, layout, labels=symptoms_dict)
nx.draw_networkx_labels(disease_symptom_network, layout, labels=diseases_dict)

# 4. Turn off the axis
plt.axis('off')

# Set the title
plt.title("DISEASE-SYMPTOMS-NETWORK")

# 5. Save the plot (must happen before show())
plt.savefig('disease_symptom_network.png')

# 6. Tell matplotlib to show it
plt.show()



In [None]:
# One row per disease: its name, its list of symptoms and its occurrence count.
# The frame is built in one step; the former `cleaned_data.cloumns = [...]`
# line was a typo that only set a stray attribute on the object and never
# named any column.
cleaned_data = pd.DataFrame({
    'diseases': disease_symptoms_data[0],
    'symptoms': disease_symptoms_data[1],
    'occurrences': disease_symptoms_occurrences_data[1],
})
cleaned_data.head()


  


Unnamed: 0,diseases,symptoms,occurrences
0,hypertensive disease,"[pain chest, shortness of breath, dizziness,...",3363
1,diabetes,"[polyuria, polydypsia, shortness of breath, p...",1421
2,depression mental,"[feeling suicidal, suicidal, hallucinations ...",1337
3,depressive disorder,"[feeling suicidal, suicidal, hallucinations ...",1337
4,coronary arteriosclerosis,"[pain chest, angina pectoris, shortness of ...",1284


In [None]:
# Multi-hot encode the symptom lists: one 0/1 column per distinct symptom,
# one row per disease (same index as cleaned_data).
multi_label_binarizer = MultiLabelBinarizer()
symptoms_encoded = pd.DataFrame(multi_label_binarizer.fit_transform(cleaned_data['symptoms']), columns=multi_label_binarizer.classes_, index=cleaned_data['symptoms'].index)
# Put the disease name in front of its symptom indicator columns.
dataset = pd.concat([cleaned_data['diseases'], symptoms_encoded], axis=1)
# Drop the indicator column whose label is the empty string
# (presumably produced by blank symptom cells -- TODO confirm).
dataset = dataset.drop([''], axis=1)
dataset.to_csv('final_disease_symptom_data1.csv')
dataset.head()

Unnamed: 0,diseases,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abnormally hard consistency.1,...,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum,yellow sputum.1
0,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,diabetes,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,depression mental,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,depressive disorder,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,coronary arteriosclerosis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Hold-out evaluation: 70/30 train/test split, then fit a decision tree.
y = dataset['diseases'].values
X = dataset.drop('diseases', axis = 1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

decision_tree = DecisionTreeClassifier()
decision_tree = decision_tree.fit(X_train, y_train)
decision_tree.score(X_test, y_test)

# This cannot work: the dataset holds one row per disease, so every disease in
# the test split is a label the tree never saw during training. Hence the score is 0.
# TODO: find a way to split the dataset that keeps every disease in the training data.

0.0

### Estimation des modèles

In [None]:
# Decision Tree Classifier
# Using split
y = dataset['diseases'].values
X = dataset.drop('diseases', axis = 1).values
print(y)
decision_tree = DecisionTreeClassifier()
decision_tree = decision_tree.fit(X, y)
decision_tree.score(X, y)

['hypertensive  disease' 'diabetes' 'depression  mental'
 'depressive disorder' 'coronary  arteriosclerosis'
 'coronary heart disease' 'pneumonia' 'failure  heart congestive'
 'accident  cerebrovascular' 'asthma' 'myocardial  infarction'
 'hypercholesterolemia' 'infection' 'infection  urinary tract' 'anemia'
 'chronic  obstructive airway disease' 'dementia' 'insufficiency  renal'
 'confusion' 'degenerative  polyarthritis' 'hypothyroidism'
 'anxiety  state' 'malignant  neoplasms' 'primary malignant neoplasm'
 'acquired  immuno-deficiency  syndrome' 'HIV' 'hiv infections'
 'cellulitis' 'gastroesophageal  reflux disease' 'septicemia'
 'systemic  infection' 'sepsis (invertebrate)' 'deep  vein thrombosis'
 'dehydration' 'neoplasm' 'embolism  pulmonary' 'epilepsy'
 'cardiomyopathy' 'chronic  kidney failure' 'carcinoma' 'hepatitis  C'
 'peripheral  vascular disease' 'psychotic  disorder' 'hyperlipidemia'
 'bipolar  disorder' 'obesity' 'ischemia' 'cirrhosis' 'exanthema'
 'benign  prostatic hyp

0.9054054054054054

In [None]:
features = np.asarray(dataset.columns)
features = np.delete(features, 0)
classes = np.asarray(dataset['diseases'])
len(features)

445

In [None]:
text_representation = tree.export_text(decision_tree, feature_names=features.tolist())
print(text_representation)

|--- underweight <= 0.50
|   |--- feeling hopeless <= 0.50
|   |   |--- weepiness <= 0.50
|   |   |   |--- hydropneumothorax <= 0.50
|   |   |   |   |--- passed stones <= 0.50
|   |   |   |   |   |--- clammy skin <= 0.50
|   |   |   |   |   |   |--- angina pectoris <= 0.50
|   |   |   |   |   |   |   |--- hypoxemia <= 0.50
|   |   |   |   |   |   |   |   |--- productive cough <= 0.50
|   |   |   |   |   |   |   |   |   |--- hyperkalemia <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- stupor <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 36
|   |   |   |   |   |   |   |   |   |   |--- stupor >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |   |--- hyperkalemia >  0.50
|   |   |   |   |   |   |   |   |   |   |--- orthopnea <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |   |   |--- orthopnea >  0.50
|   |   |  

In [None]:
dot_data = tree.export_graphviz(decision_tree, out_file=None, feature_names=features.tolist(), class_names=classes.tolist(), filled=True)
graph = graphviz.Source(dot_data, format="png") 
graph.render("decision_tree_graphivz")

In [None]:
y_pred = decision_tree.predict(X)

for i in range(0, len(y)):
    if y_pred[i]!=y[i]:
        print ('Predicted: {0} --------- Actual:{1}'.format(y_pred[i], y[i]))


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

labelencoder = LabelEncoder()
ye = labelencoder.fit_transform(y)


In [None]:
from sklearn.linear_model import LogisticRegression
Logit = LogisticRegression()
Logit.fit(X, y)
y_predl=Logit.predict(X)
#print(classification_report(y_predl, y))
print(Logit.score(X, y))
for i in range(0, len(y)):
    if y_predl[i]!=y[i]:
        print ('Predicted: {0} --------- Actual:{1}'.format(y_predl[i], y[i]))

In [None]:
from sklearn import svm
SVM = svm.SVC()
SVM.fit(X, y)
print(SVM.score(X, y))

In [None]:
from sklearn.linear_model import SGDClassifier
SDG = SGDClassifier(loss="hinge", penalty="elasticnet", max_iter=5)
SDG.fit(X, y)
print(SDG.score(X, y))

In [None]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=10)
RF.fit(X, y)
print(RF.score(X, y))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
Ada = AdaBoostClassifier(n_estimators=100)
Ada.fit(X,y)
print(Ada.score(X, y))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
Gboost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    random_state=0).fit(X, y)
Gboost.score(X, y)

In [None]:
data=pd.read_csv("/content/sample_data/final_disease_symptom_data.csv", sep=';')
data.head()

In [None]:
y = data['diseases'].values
X = data.drop('diseases', axis = 1).values
DT = DecisionTreeClassifier()
DT = DT.fit(X, y)
DT.score(X, y)

In [None]:
# Evaluate the decision tree on the data it was fitted on.
y_pred = DT.predict(X)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the precision and recall columns.
print(classification_report(y, y_pred))

# List every misclassified sample.
for predicted, actual in zip(y_pred, y):
    if predicted != actual:
        print ('Predicted: {0} --------- Actual:{1}'.format(predicted, actual))

print(DT.predict_proba(X))

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression and evaluate it on the data it was fitted on.
Logit = LogisticRegression()
Logit.fit(X, y)
y_predl=Logit.predict(X)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the precision and recall columns. The report was also printed
# twice; once is enough.
print(classification_report(y, y_predl))
print(Logit.score(X, y))
# List every misclassified sample.
for predicted, actual in zip(y_predl, y):
    if predicted != actual:
        print ('Predicted: {0} --------- Actual:{1}'.format(predicted, actual))
# Highest class probability assigned to the sample at index 5.
print(max(Logit.predict_proba(X)[5]))

In [None]:
# Fit an SVM (probability=True enables predict_proba) and evaluate it on the
# data it was fitted on.
SVM = svm.SVC(probability=True)
SVM.fit(X, y)
print(SVM.score(X, y))
ySVM=SVM.predict(X)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the precision and recall columns.
print(classification_report(y, ySVM))
# List every misclassified sample.
for predicted, actual in zip(ySVM, y):
    if predicted != actual:
        print ('Predicted: {0} --------- Actual:{1}'.format(predicted, actual))
print(SVM.predict_proba(X))

In [None]:
from sklearn.linear_model import SGDClassifier
# Fit a linear SVM trained by SGD (hinge loss, L1 penalty) and evaluate it on
# the data it was fitted on.
SDG = SGDClassifier(loss="hinge", penalty="l1", max_iter=5)
SDG.fit(X, y)
print(SDG.score(X, y))
ySDG=SDG.predict(X)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the precision and recall columns.
print(classification_report(y, ySDG))
# List every misclassified sample.
for predicted, actual in zip(ySDG, y):
    if predicted != actual:
        print ('Predicted: {0} --------- Actual:{1}'.format(predicted, actual))


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 10-tree random forest and evaluate it on the data it was fitted on.
RF = RandomForestClassifier(n_estimators=10)
RF.fit(X, y)
print(RF.score(X, y))
yRF=RF.predict(X)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the precision and recall columns.
print(classification_report(y, yRF))
# List every misclassified sample.
for predicted, actual in zip(yRF, y):
    if predicted != actual:
        print ('Predicted: {0} --------- Actual:{1}'.format(predicted, actual))
print(RF.predict_proba(X))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
Ada = AdaBoostClassifier(n_estimators=100)
Ada.fit(X,y)
print(Ada.score(X, y))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Fit gradient boosting and evaluate it on the data it was fitted on.
Gboost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    random_state=0).fit(X, y)
print(Gboost.score(X, y))
yGboo=Gboost.predict(X)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the precision and recall columns.
print(classification_report(y, yGboo))
# List every misclassified sample.
for predicted, actual in zip(yGboo, y):
    if predicted != actual:
        print ('Predicted: {0} --------- Actual:{1}'.format(predicted, actual))
print(Gboost.predict_proba(X))

### Benchmark des différents modèles

In [None]:
Benchmark=pd.DataFrame({ "Algorithme":["Decision Tree", "Regression Logistique", "SVM", "SDG Classifier","Random Forest", "Gradient Boosting"],
                         "Acurracy":[ 0.95, 0.95, 0.94, 0.94, 0.95, 0.95],
                         "Recall":[0.93, 0.93, 0.92, 0.91, 0.93, 0.93],
                         "F1-score": [0.93, 0.93, 0.93, 0.92, 0.93, 0.93]    
                        }
                       )
Benchmark

### Extraction des symptômes et similarité sémantique

In [None]:
! pip install torch

In [None]:
! pip install transformers

In [None]:
! pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [None]:
model = SentenceTransformer('stsb-roberta-large')

In [None]:
def similarity_fun(description, symptoms=features, top_k=1):
  """Match each free-text sentence to its most similar known symptom(s).

  Args:
    description: list of sentences describing how the patient feels.
    symptoms: corpus of symptom names to match against. The default is the
      value `features` had when this function was defined.
    top_k: number of best-matching symptoms kept per sentence (default 1).

  Returns:
    A tuple ``(symptoms_extrait, symptoms_indice, X)``: the matched symptom
    names, their positions in `symptoms`, and a 0/1 vector of length
    ``len(symptoms)`` with a 1 at every matched position.
  """
  # torch is installed by an earlier cell but never imported at notebook level.
  import torch

  # encode symptoms to get corpus embeddings
  feature_embeddings = model.encode(symptoms, convert_to_tensor=True)
  # encode sentences to get sentence embeddings
  description_embeddings = model.encode(description, convert_to_tensor=True)
  # never ask for more matches than there are symptoms
  top_k = min(top_k, len(symptoms))

  symptoms_extrait = []
  symptoms_indice = []
  symptom_vector = np.zeros(len(symptoms))
  # one row of cosine scores per sentence, one column per symptom
  cos_scores = util.pytorch_cos_sim(description_embeddings, feature_embeddings)
  for i, cos_score in enumerate(cos_scores):
    # torch.topk returns the k best scores in decreasing order and also works
    # when the embeddings live on a GPU; NumPy's argpartition cannot read a
    # CUDA tensor.
    best_scores, best_indices = torch.topk(cos_score, k=top_k)
    print("Sentence:", description[i], "\n")
    print("Top", top_k, "most similar sentences in corpus:")
    for score, idx in zip(best_scores.tolist(), best_indices.tolist()):
      print(symptoms[idx], "(Score: %.4f)" % score)
      symptoms_extrait.append(symptoms[idx])
      symptoms_indice.append(idx)
      symptom_vector[idx] = 1
    print('----------------------------------------')
  return symptoms_extrait, symptoms_indice, symptom_vector

In [None]:
desc=[
      "I have congestion",
      "she has been suctioning yellow discharge from the patient's nares", 
      "she has noticed some mild problems with his breathing while feeding",
      "also noticed a tactile temperature and gave the patient Tylenol",
      "respiratory congestion",
      "fatigue ",
      "fussy over the past 2 days",
      "His urine output has also decreased",
      "diarrhea",
      "His bowel movements are yellow colored and soft in nature"
]

In [None]:
symptom, sympindice, x_to_pred =similarity_fun(desc)
print(symptom)
print(x_to_pred)

In [None]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
ye = labelencoder.fit_transform(y)


In [None]:

from sklearn.linear_model import LogisticRegression
# Refit the logistic regression on integer-encoded labels (ye) so that its
# predictions can be mapped back to disease names with labelencoder.
Logit = LogisticRegression()
Logit.fit(X, ye)
y_predl=Logit.predict(X)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the precision and recall columns. The report was also printed
# twice; once is enough.
print(classification_report(ye, y_predl))
print(Logit.score(X, ye))
# List every misclassified sample (as encoded labels).
for predicted, actual in zip(y_predl, ye):
    if predicted != actual:
        print ('Predicted: {0} --------- Actual:{1}'.format(predicted, actual))
# Highest class probability assigned to the sample at index 5.
print(max(Logit.predict_proba(X)[5]))

In [None]:
y_predxl=labelencoder.inverse_transform(Logit.predict([x_to_pred]))
print(y_predxl)
prob=Logit.predict_proba([x_to_pred])
#print(prob[0])
resumal= np.argpartition(-prob[0], range(5))[:5]
#print(resumal)
Dis=labelencoder.inverse_transform(resumal)
#print(Dis)
for i,j in zip(Dis,resumal):
  print(i,' : ', round(prob[0][j]*100, 2),'%')

In [None]:
import pickle
pkl_filename="pickle_model.pkl"
with open(pkl_filename,'wb') as file:
  pickle.dump(Logit, file)

#with open(pkl_filename,'rb') as file:
#  Logit_model=pickle.load(Logit, file)

In [None]:
# Implementing the Visual Tree
from sklearn.tree import _tree
from sklearn import tree
#tree.plot_tree(classifier)

### Modèle basé sur la description

In [None]:
def preprocess(textDoc):
    """Clean, tokenize and lemmatize every text in *textDoc*.

    Each text is lower-cased, stripped of URLs, punctuation and digits, then
    tokenized; stop words and non-alphabetic tokens are dropped and the rest
    is lemmatized.

    NOTE(review): `WordNetLemmatizer`, `word_tokenize` and `stopwords` are not
    imported anywhere in the visible notebook (presumably they come from
    NLTK); they must be in scope before this function is called.

    Args:
        textDoc: iterable of strings.

    Returns:
        A list with one list of lemmatized tokens per input text.
    """
    # `re` is not imported before this cell, so import it here.
    import re

    # Raw strings avoid the invalid-escape warnings of the former '\s', '\.'
    # and '\d' literals; the patterns match the same characters as before.
    url_pattern = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
    punctuation_pattern = re.compile(r"""[!"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]""")
    digits_pattern = re.compile(r"\d+")

    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once instead of once per token.
    stop_words = set(stopwords.words('english'))

    corpus_lemmatized = []
    for text in textDoc:
        text = text.lower()
        text = url_pattern.sub("", text)          # website links (http/https/www)
        text = punctuation_pattern.sub("", text)  # punctuation and special characters
        text = digits_pattern.sub("", text)       # numerical values

        words = [
            lemmatizer.lemmatize(word.lower())
            for word in word_tokenize(text)
            if word not in stop_words and word.isalpha()
        ]
        corpus_lemmatized.append(words)

    return corpus_lemmatized

In [None]:
desc_data=pd.read_csv("/content/sample_data/Disease_Description_ICD11.csv", encoding = 'ISO-8859-1', sep=';')
desc_data.head()

In [None]:
from networkx.algorithms.traversal.breadth_first_search import descendants_at_distance
import re
def preprocess_text(text):
  """Normalise a description: ASCII-only, lower case, no newlines/tabs/digits.

  Args:
    text: the raw description string.

  Returns:
    The cleaned string. Every newline and tab becomes one space and every
    run of digits is removed.
  """
  # drop every non-ASCII character
  text = text.encode("ascii", errors="ignore").decode("ascii")
  text = text.lower()
  # Replace each newline with a space. The former extra r"\n\n" substitution
  # could never match once single newlines were gone, so it is removed.
  text = re.sub(r"\n", " ", text)
  text = re.sub(r"\t", " ", text)
  # raw string: "\d" in a plain literal is an invalid escape sequence
  text = re.sub(r"\d+", "", text)

  return text


In [None]:
data_description=list(desc_data['Description'])
data_disease=list(desc_data['Disease '])
desc_clean=[]
for i in data_description:
  desc_clean.append(preprocess_text(i))
print(len(data_disease))
print(len(desc_clean))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = 'english')
X_desc=vectorizer.fit_transform(desc_clean)
X_data=X_desc.toarray()
print(len(X_data))

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

classifier_svm =LinearSVC()
classifier_svm.fit(X_desc, data_disease)
desc_disease_predict=classifier_svm.predict(X_desc)
print(classification_report(data_disease,desc_disease_predict))
scores=classifier_svm.score(X_desc,data_disease)
print(scores)

In [None]:
# Logistic regression on the TF-IDF description features, scored on the data
# it was fitted on.
from sklearn.linear_model import LogisticRegression
Log = LogisticRegression()
from sklearn.metrics import classification_report

from sklearn.preprocessing import LabelEncoder
# NOTE(review): this rebinds the notebook-level `labelencoder`, which an
# earlier cell fitted on the symptom dataset's labels for `Logit`. Any later
# code that decodes `Logit` predictions with `labelencoder` will use this
# description-dataset encoding instead -- confirm that is intended.
labelencoder = LabelEncoder()
data_disea= labelencoder.fit_transform(data_disease)
# (this second instantiation replaces the unfitted `Log` created above)
Log = LogisticRegression()
Log.fit(X_desc, data_disea)
desc_disease_predict=Log.predict(X_desc)
#print(classification_report(data_disease,desc_disease_predict))
scores=Log.score(X_desc,data_disea)
print(scores)
#print(Log.predict_proba(X_desc)[5])

In [None]:
# Predict the disease for the sample `x_ech` and list the five most probable ones.
# NOTE(review): `x_ech` is only assigned in the cell that follows this one,
# so that cell has to be run first.
y_predxl=labelencoder.inverse_transform(Log.predict(x_ech))
print(y_predxl)
prob=Log.predict_proba(x_ech)
print(prob[0])
# Indices of the 5 highest probabilities, in decreasing order of probability.
resumal= np.argpartition(-prob[0], range(5))[:5]
print(resumal)
# Map those class indices back to disease names.
Dis=labelencoder.inverse_transform(resumal)
print(prob[0][resumal])
print(Dis)
# Print each candidate disease with its probability as a percentage.
for i,j in zip(Dis,resumal):
  print(i,' : ', round(prob[0][j]*100, 2),'%')

In [None]:
a=''
for i in desc:
  a=a + ' ' + i
print(a)
x_ech=vectorizer.transform([a])
Log.predict(x_ech)

In [None]:

import torch

# Semantic search: rank the disease descriptions by cosine similarity to the
# concatenated patient description `a`.
# Both embeddings were commented out, so this cell raised NameError on a
# fresh kernel; they are restored here.
feature_embeddings = model.encode(desc_clean, convert_to_tensor=True)
description_embeddings = model.encode(a, convert_to_tensor=True)
# top_k results to return (never more than there are descriptions)
top_k = min(5, len(desc_clean))
# The former `X = np.zeros(len(symptoms))` line and the two unused result
# lists are removed: they were never read here and overwrote the notebook's
# global feature matrix `X`.
cos_scores = util.pytorch_cos_sim(description_embeddings, feature_embeddings)

for cos_score in cos_scores:
    # torch.topk returns the best scores in decreasing order and also works
    # when the embeddings live on a GPU.
    best_scores, best_indices = torch.topk(cos_score, k=top_k)
    print("Top", top_k, "most similar sentences in corpus:")
    for score, idx in zip(best_scores.tolist(), best_indices.tolist()):
        print(data_disease[idx], "(Score: %.4f)" % score)

### Projet S2

In [None]:
import csv
# Maps a disease name (first CSV column) to its description (second column).
description_list = dict()
def getDescription(path="/content/sample_data/Disease_Description_ICD11.csv"):
    """Fill `description_list` from a ';'-separated disease description file.

    Args:
        path: CSV file to read, encoded as ISO-8859-1. Defaults to the
            notebook's sample-data location.

    Every row with at least two columns adds (or overwrites) one entry of
    `description_list`; shorter rows such as blank lines are skipped instead
    of raising IndexError. Any header row is stored like a normal row.
    """
    # newline='' is what the csv module expects for files it reads.
    with open(path, encoding='ISO-8859-1', newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=';'):
            if len(row) < 2:
                continue
            description_list[row[0]] = row[1]

getDescription()
print(description_list)

In [None]:
! pip install gradio

In [None]:
import gradio as gr
import re

with open(pkl_filename,'rb') as file:
  Logit_model=pickle.load(file)

def clean_text(text):
  """Normalise the user's text and split it into comma-separated fragments.

  Args:
    text: the raw text entered by the user.

  Returns:
    The list of fragments obtained by splitting the cleaned text on ','.
    The text is reduced to ASCII and lower-cased, every newline and tab
    becomes a space and every run of digits is removed. Fragments are not
    stripped.
  """
  # drop every non-ASCII character
  text = text.encode("ascii", errors="ignore").decode("ascii")
  text = text.lower()
  # Replace each newline with a space. The former extra r"\n\n" substitution
  # could never match once single newlines were gone, so it is removed.
  text = re.sub(r"\n", " ", text)
  text = re.sub(r"\t", " ", text)
  # raw string: "\d" in a plain literal is an invalid escape sequence
  text = re.sub(r"\d+", "", text)

  return text.split(",")

def Disease_classifier(text):
  """Predict a disease from a free-text description of symptoms.

  The text is split into fragments, each fragment is matched to a known
  symptom, and the resulting symptom vector is classified by `Logit_model`.

  NOTE(review): `labelencoder` is rebound in an earlier cell to an encoder
  fitted on the description dataset; confirm that its classes still match the
  label indices `Logit_model` was trained with.

  Args:
    text: the user's description; fragments are separated by commas.

  Returns:
    A tuple ``(Symptoms, disease, score, Desc, Disease_alternatif)``: the set
    of matched symptoms, the predicted disease name, the highest class
    probability in percent, the disease description, and a DataFrame of the
    most probable diseases (at most 5) with their probabilities in percent.
  """
  description = clean_text(text)

  symptom, sympindice, x_to_pred = similarity_fun(description)
  Symptoms = set(symptom)

  Disease = labelencoder.inverse_transform(Logit_model.predict([x_to_pred]))

  # A predicted disease may have no entry in the description file; fall back
  # to a placeholder instead of failing the whole request with a KeyError.
  Desc = description_list.get(Disease[0], "No description available.")

  prob = Logit_model.predict_proba([x_to_pred])

  # Indices of the most probable classes in decreasing order of probability,
  # capped at the number of classes the model knows.
  top_n = min(5, len(prob[0]))
  resumal = np.argpartition(-prob[0], range(top_n))[:top_n]

  Dis = labelencoder.inverse_transform(resumal)
  Prob = [round(prob[0][j]*100, 2) for j in resumal]

  Disease_alternatif = pd.DataFrame(list(zip(Dis, Prob)), columns=["Disease","Probability %"])

  return  Symptoms, Disease[0], Prob[0], Desc, Disease_alternatif




### Gradio interface

In [None]:
# Web form around Disease_classifier: one text input, five text outputs in
# the order of the function's return tuple.
# The gr.inputs / gr.outputs namespaces were removed from current Gradio
# releases (which the unpinned `pip install gradio` above installs); the
# components are used directly as gr.Textbox.
gradio_ui = gr.Interface(
    fn=Disease_classifier,
    title="Prediction of Disease",
    description="Enter the symptoms or description of how you feel yourself  ",
    inputs=gr.Textbox(lines=10, label="Paste some text here"),
    outputs=[
        gr.Textbox(label="Symptom extracted"),
        gr.Textbox(label="Disease predicted"),
        gr.Textbox(label="Disease Score"),
        gr.Textbox(label="Disease description"),
        gr.Textbox(label="Alternative disease that it can be"),
    ],
)


In [None]:
gradio_ui.launch()