In [5]:
import pandas as pd

In [6]:
# Read the diseases CSV into a DataFrame; the relative path is resolved
# against the kernel's current working directory.
df = pd.read_csv("mayo_diseases.csv")

In [7]:
# Sample input for the rest of the notebook: the "Symptoms" value of the row labelled 0.
text = df.loc[0, "Symptoms"]

In [8]:
import re
import spacy

class Text_Preprocessing:
    """Turn a raw text string into a list of lemmas.

    The pipeline is: regex punctuation removal -> whitespace/case
    normalisation -> spaCy lemmatisation with token filtering.
    ``go_on`` runs all three stages in that order.
    """

    def __init__(self):
        # Loading the spaCy pipeline is the expensive part; it is done once per instance.
        self.nlp = spacy.load("en_core_web_sm")

    def work_with_extras(self, text: str) -> list:
        """Replace newlines, periods, commas and colons with spaces and split on whitespace.

        Args:
            text: Raw input string.

        Returns:
            The whitespace-separated pieces of the cleaned string.
        """
        without_marks = re.sub(r"[\n.,:]", " ", text)
        pieces = without_marks.split()
        return pieces

    def work_with_spaces(self, text: list) -> list:
        """Strip and lowercase every word, dropping words that are empty after stripping.

        Args:
            text: List of word strings.

        Returns:
            A new list of stripped, lowercased, non-empty words.
        """
        normalised = []
        for word in text:
            trimmed = word.strip()
            if trimmed:
                normalised.append(trimmed.lower())
        return normalised

    def process_text(self, text: list) -> list:
        """Lemmatise the words with spaCy and keep only the informative tokens.

        The words are joined with single spaces because the spaCy pipeline is
        called on one string. A token's lemma is kept only when the token is
        not a stop word, not punctuation, not number-like, and is alphabetic.

        Args:
            text: List of word strings.

        Returns:
            The lemmas of the tokens that pass the filter, in document order.
        """
        doc = self.nlp(" ".join(text))
        lemmas = []
        for token in doc:
            if token.is_stop or token.is_punct or token.like_num:
                continue
            if not token.is_alpha:
                continue
            lemmas.append(token.lemma_)
        return lemmas

    def go_on(self, text: str) -> list:
        """Run the whole pipeline on a raw string and return the filtered lemmas."""
        return self.process_text(self.work_with_spaces(self.work_with_extras(text)))


In [9]:
# Display the raw symptoms string that will be fed to the preprocessing pipeline.
text

"Symptoms ofAFibmay include:\nFeelings of a fast, fluttering or pounding heartbeat, called palpitations.\nChest pain.\nDizziness.\nFatigue.\nLightheadedness.\nReduced ability to exercise.\nShortness of breath.\nWeakness.\nSome people with atrial fibrillation (AFib) don't notice any symptoms.\nAtrial fibrillation may be:\nOccasional, also called paroxysmal atrial fibrillation.AFibsymptoms come and go. The symptoms usually last for a few minutes to hours. Some people have symptoms for as long as a week. The episodes can happen repeatedly. Symptoms might go away on their own. Some people with occasionalAFibneed treatment.\nPersistent.The irregular heartbeat is constant. The heart rhythm does not reset on its own. If symptoms occur, medical treatment is needed to correct the heart rhythm.\nLong-standing persistent.This type ofAFibis constant and lasts longer than 12 months. Medicines or a procedure are needed to correct the irregular heartbeat.\nPermanent.In this type of atrial fibrillatio

In [10]:
# Build the preprocessor (this loads the spaCy pipeline) and run all cleaning
# stages on the sample text; `pp` is the resulting list of lemmas.
t =  Text_Preprocessing()
pp = t.go_on(text)

In [11]:
# Inspect the lemmas produced by the preprocessing pipeline.
pp

['symptom',
 'ofafibmay',
 'include',
 'feeling',
 'fast',
 'fluttering',
 'pound',
 'heartbeat',
 'call',
 'palpitation',
 'chest',
 'pain',
 'dizziness',
 'fatigue',
 'lightheadedness',
 'reduce',
 'ability',
 'exercise',
 'shortness',
 'breath',
 'weakness',
 'people',
 'atrial',
 'fibrillation',
 'afib',
 'notice',
 'symptom',
 'atrial',
 'fibrillation',
 'occasional',
 'call',
 'paroxysmal',
 'atrial',
 'fibrillation',
 'afibsymptom',
 'come',
 'symptom',
 'usually',
 'minute',
 'hour',
 'people',
 'symptom',
 'long',
 'week',
 'episode',
 'happen',
 'repeatedly',
 'symptom',
 'away',
 'people',
 'occasionalafibneed',
 'treatment',
 'persistent',
 'irregular',
 'heartbeat',
 'constant',
 'heart',
 'rhythm',
 'reset',
 'symptom',
 'occur',
 'medical',
 'treatment',
 'need',
 'correct',
 'heart',
 'rhythm',
 'long',
 'stand',
 'persistent',
 'type',
 'ofafibis',
 'constant',
 'last',
 'long',
 'month',
 'medicine',
 'procedure',
 'need',
 'correct',
 'irregular',
 'heartbeat',
 'per

In [12]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

class RemoveUselessWords:
    """Keep only the words that a biomedical NER model tags with an allowed entity type.

    The constructor loads the ``d4data/biomedical-ner-all`` token-classification
    model and wraps it in a Hugging Face ``ner`` pipeline using the ``simple``
    aggregation strategy.
    """

    def __init__(self):
        self.model_name = "d4data/biomedical-ner-all"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        self.ner = pipeline(
            "ner",
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple",
        )

        # Entity groups whose words are kept; every other group is discarded.
        self.allowed_entities = [
            "Sign_symptom",
            "Disease_disorder",
            "Biological_structure",
            "Medication",
            "Therapeutic_procedure",
            "Duration",
        ]

    def process_entities(self, words: list) -> list:
        """Return the words whose first detected entity group is in ``allowed_entities``.

        Each word is sent to the NER pipeline on its own. A word is dropped when
        the pipeline returns nothing for it; otherwise only the first returned
        entity decides whether the word is kept.

        Args:
            words: Words to classify, one pipeline call per word.

        Returns:
            The kept words, in their original order (duplicates preserved).
        """
        def is_wanted(candidate):
            spans = self.ner(candidate)
            return bool(spans) and spans[0]['entity_group'] in self.allowed_entities

        return [candidate for candidate in words if is_wanted(candidate)]


In [13]:
# Build the NER filter (this loads the model) and keep only the lemmas whose
# first detected entity group is one of the allowed types.
processor = RemoveUselessWords()
filtered = processor.process_entities(pp)
print(filtered)


Device set to use cpu


['symptom', 'ofafibmay', 'fluttering', 'palpitation', 'chest', 'pain', 'dizziness', 'fatigue', 'lightheadedness', 'shortness', 'weakness', 'atrial', 'fibrillation', 'symptom', 'atrial', 'fibrillation', 'atrial', 'fibrillation', 'afibsymptom', 'symptom', 'symptom', 'symptom', 'treatment', 'heart', 'reset', 'symptom', 'treatment', 'heart', 'ofafibis', 'medicine', 'atrial', 'fibrillation', 'heart', 'reset', 'medicine', 'heart', 'blood']


In [15]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pretrained "bert-base-uncased" tokenizer and encoder used to embed
# the filtered words.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")


In [16]:
# Join the NER-filtered words into one space-separated string for the tokenizer.
final = " ".join(filtered)

In [17]:

# Tokenise the joined words. Truncation caps the input at the model's maximum
# sequence length, so an unusually long word list cannot make the forward pass fail.
inputs = tokenizer(final, return_tensors="pt", truncation=True)

# Inference only: disable autograd so no computation graph is built or kept in memory.
with torch.no_grad():
    outputs = model(**inputs)

# Contextual vectors from the last encoder layer. There is one vector per
# tokenizer piece (sub-word pieces plus special tokens), not one per input word.
word_embeddings = outputs.last_hidden_state


In [20]:
# Number of words that survived the NER filter.
len(filtered)

37

In [19]:
# Shape of the BERT output tensor. Its middle dimension counts tokenizer pieces,
# so it is larger than len(filtered) whenever words are split into sub-words.
word_embeddings.shape

torch.Size([1, 80, 768])

In [2]:
import os

# Folders to exclude
EXCLUDED_FOLDERS = {'myenv', 'scipy', '__pycache__', '.git','skin-disease-datasaet'}

def print_tree(start_path, indent="", excluded=None):
    """Print an indented, alphabetically sorted listing of ``start_path`` to stdout.

    Directories are printed with a folder icon and then descended into; all
    other entries are printed with a file icon. Any entry (file or directory)
    whose name is in ``excluded`` is skipped entirely.

    Args:
        start_path: Directory whose contents are listed.
        indent: Prefix printed before every entry at this level; each nested
            level adds four spaces.
        excluded: Collection of entry names to skip. Defaults to the
            module-level ``EXCLUDED_FOLDERS`` when omitted.

    Returns:
        None. The listing is written with ``print``.
    """
    if excluded is None:
        excluded = EXCLUDED_FOLDERS

    try:
        names = sorted(os.listdir(start_path))
    except PermissionError:
        # An unreadable directory should not abort the whole listing.
        print(f"{indent}⛔ [permission denied]")
        return

    for item in names:
        if item in excluded:
            continue
        item_path = os.path.join(start_path, item)
        if os.path.isdir(item_path):
            print(f"{indent}📁 {item}")
            # Symlinked directories are shown but not entered, so a link that
            # points back up the tree cannot cause runaway recursion.
            if not os.path.islink(item_path):
                print_tree(item_path, indent + "    ", excluded)
        else:
            print(f"{indent}📄 {item}")

# Print the tree rooted at the kernel's current working directory.
# Replace '.' with your desired root directory.
print_tree('.')


📄 .gitattributes
📄 .gitignore
📄 LICENSE
📄 README.md
📁 Vector
    📁 symptom_faiss_db
📁 backend
    📁 api
        📄 upload.py
    📄 config.py
    📄 main.py
    📁 models
        📄 skin_disease_model.h5
    📁 services
        📄 evaluate_rag_symptom.py
        📄 image_classifier.py
        📄 symptom_to_disease.py
    📁 utils
        📄 filtering_with_ner.py
        📄 image_preprocessing.py
        📄 text_cleaning.py
📁 data
    📁 Vector
        📁 symptom_faiss_db
            📄 index.faiss
            📄 index.pkl
    📄 labels.json
    📄 test_symptom_cases.csv
📁 evaluation
    📄 rag_model_score.txt
📁 frontend
    📄 index.html
📁 notebook
    📄 llm.ipynb
    📄 mayo_diseases.csv
    📄 skin_disease_prediction.ipynb
    📄 test_symptom_cases.csv
    📄 web_scrapping.ipynb
📄 requirements.txt
📁 scripts
    📁 scrapers
        📄 main.py
        📄 web_scraper2.py
        📄 web_scrapers.py
    📄 symptoms_to_vectordb.py
📄 transfaer.ipynb
