# Step 1: Load Dataset
What: Loads the CSV data into a pandas DataFrame.

Why: This is the starting point for all further data processing.

Why not any other: Loading data via pandas is fast and widely used in data science.


In [None]:
!pip install nltk
!pip install pyspellchecker



Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.2
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manyl

In [None]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


In [None]:
import pandas as pd

# Read the merged doctors/symptoms table, then shuffle every row with a
# fixed seed and renumber the index from zero.
df = (
    pd.read_csv('/content/merged_doctors_symptomss.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the first five shuffled rows
df.head()

Unnamed: 0,Disease,Disease_Category,Symptoms,CommonAgeGroup,Sex,Severity,Specialist,SyntheticTreatment,Name,Category,Address/Details,City,Rating,Mapped_Category
0,Appendicitis,surgeon,"Loss of appetite, Nausea",Child,Male,Moderate,Surgeon,Consultation and standard treatment protocols....,"Dr Muhammad Waqas Khan MBBS, FCPS, FRCS(UK) FA...",surgeon,"Urologist Metro Station New Multan Road, near ...",Kabirwala,4.6,surgeon
1,Eczema,dermatologist,"Red patches, Itchy skin, Swelling",Adult,Any,Moderate,Dermatologist,Consultation and standard treatment protocols,Dr.Salah Hayat ڈاکٹر صالح حیات,dermatologist,Doctor XXR4+4V5,Layyah,4.0,dermatologist
2,Eczema,dermatologist,"Dryness, Swelling",Teen,Female,Moderate,Dermatologist,Consultation and standard treatment protocols,Dr Muhammad Khalid- Dermatologist,dermatologist,"Skin care clinic Aizaz Clinic, Library Chowk, ...",Bahawalpur,4.5,dermatologist
3,Asthma,pulmonologist,"Cough, Chest tightness, Shortness of breath, W...",Adult,Female,Severe,Pulmonologist,"Lifestyle changes, regular medication, and per...",Dr Najeeb Clinic,pulmonologist,"Doctor Q965+3V6 Mingora, Unnamed Road",Swat,5.0,pulmonologist
4,Diabetes,endocrinologist,"Frequent urination, Fatigue",Child,Male,Mild,Endocrinologist,"Lifestyle changes, regular medication, and per...","Dr Arshad Health Associates, Karachi",endocrinologist,"Doctor No. 806, 8th Floor Kashif Center, Shahr...",Karachi,4.8,endocrinologist


In [None]:
# ==================== Step 1: Load Dataset ====================
# Confirm the DataFrame from the previous cell is available and report its size

print("Dataset loaded successfully!", f"Initial shape: {df.shape}", sep="\n")

Dataset loaded successfully!
Initial shape: (23608, 14)


# Step 2: Handle Missing Values
What: Replace missing values with defaults like "Unknown" or 0.
    
Why: Missing data can mess with the analysis, so filling it prevents errors.
    
Why not any other: Dropping rows or columns with missing data could cause loss of important information.

In [None]:

# ==================== Step 2: Handle Missing Values ====================
# Replace missing values in important columns with default values.
# All columns are filled through one DataFrame-level call: filling via
# df['Rating'].fillna(..., inplace=True) operates on an intermediate object
# and stops updating df under pandas Copy-on-Write (pandas 3.0).
df = df.fillna({
    'Disease': 'Unknown',
    'Symptoms': 'Unknown',
    'SyntheticTreatment': 'Unknown',
    'Specialist': 'Unknown',
    'Rating': 0,
})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].fillna(0, inplace=True)


# Step 3: Text Preprocessing
What: Clean and prepare the text for modeling (convert to lowercase, remove punctuation, etc.).
    
Why: Preprocessing standardizes the text, making it easier for models to interpret.
    
Why not any other: Simpler methods like stemming could be used, but lemmatization is more accurate for medical terms because it returns real dictionary words instead of truncated stems.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
import string

class TextPreprocessing(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that normalizes free-text fields.

    Each text is lower-cased, stripped of ASCII punctuation and tokenized.
    The tokens are then rewritten left to right:

    * multi-word phrases listed in ``self.synonyms`` (for example
      ``'shortness of breath'``) are replaced first, on the raw tokens, so
      phrases that contain stop words can still be recognized;
    * every other token is dropped if it is a stop word, otherwise it is
      lemmatized, spell-corrected and mapped through ``self.synonyms`` as a
      single word.

    The NLTK ``stopwords`` corpus is read when the transformer is created;
    tokenization and lemmatization use NLTK's tokenizer and WordNet data.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.spell_checker = SpellChecker()
        self.stop_words = set(stopwords.words('english'))
        self.synonyms = {
            'fever': 'pyrexia',
            'coughing': 'cough',
            'sneezing': 'sternutation',
            'headache': 'cephalalgia',
            'stomachache': 'abdominal pain',
            'runny nose': 'rhinorrhea',
            'vomiting': 'emesis',
            'diarrhea': 'loose stools',
            'rash': 'eruption',
            'chest pain': 'angina',
            'difficulty breathing': 'dyspnea',
            'shortness of breath': 'dyspnea',
            'high blood pressure': 'hypertension',
            'low blood pressure': 'hypotension',
            'heart attack': 'myocardial infarction',
            'stroke': 'cerebrovascular accident',
            'ear pain': 'otalgia',
            'eye redness': 'conjunctival injection',
            'sore throat': 'pharyngitis',
            'toothache': 'dental pain',
            'joint pain': 'arthralgia',
            'muscle pain': 'myalgia',
            'back pain': 'dorsalgia',
            'nausea': 'queasiness',
            'swelling': 'edema',
            'bleeding': 'hemorrhage',
            'urination pain': 'dysuria',
            'frequent urination': 'polyuria',
            'yellow skin': 'jaundice',
            'weight loss': 'cachexia',
            'weight gain': 'obesity',
            'confusion': 'delirium',
            'dizziness': 'vertigo',
            'itching': 'pruritus',
            'cold': 'common cold',
            'flu': 'influenza',
            'anxiety': 'anxiety disorder',
            'depression': 'depressive disorder',
            'constipation': 'obstipation',
            'seizures': 'convulsions',
            'fainting': 'syncope',
            'increased thirst': 'polydipsia',
            'increased hunger': 'polyphagia',
            'night sweats': 'nocturnal hyperhidrosis',
            'fatigue': 'lassitude',
            'insomnia': 'sleeplessness',
            'loss of appetite': 'anorexia',
            'abdominal swelling': 'ascites',
            'irregular heartbeat': 'arrhythmia',
            'bloody cough': 'hemoptysis',
            'blurred vision': 'visual blurring',
            'loss of balance': 'ataxia',
            'memory loss': 'amnesia',
            'tingling sensation': 'paresthesia',
            'burning sensation': 'burning dysesthesia'
        }
        # Longest synonym key, in words; bounds the phrase look-ahead window.
        self._max_phrase_len = max(len(key.split()) for key in self.synonyms)
        # Memoized spell corrections: the same words recur across many rows
        # and each spell-checker lookup is comparatively expensive.
        self._correction_cache = {}

    def fit(self, X, y=None):
        """No-op fit; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a list with ``preprocess_text`` applied to each item of ``X``."""
        return [self.preprocess_text(text) for text in X]

    def _correct(self, word):
        """Return the spell-corrected form of ``word``.

        Falls back to ``word`` itself when the spell checker has no
        suggestion (``correction`` returns ``None``), so out-of-dictionary
        terms are kept instead of being dropped.
        """
        cache = self._correction_cache
        if word not in cache:
            cache[word] = self.spell_checker.correction(word) or word
        return cache[word]

    def _normalize_word(self, word):
        """Normalize one token; return ``''`` for stop words and blank tokens."""
        if word in self.stop_words or not word.strip():
            return ''
        corrected = self._correct(self.lemmatizer.lemmatize(word))
        return self.synonyms.get(corrected, corrected)

    def _match_phrase(self, tokens, start):
        """Find the longest multi-word synonym key beginning at ``tokens[start]``.

        Returns ``(replacement, length_in_tokens)`` or ``None`` if no key of
        two or more words matches at that position.
        """
        longest = min(self._max_phrase_len, len(tokens) - start)
        for length in range(longest, 1, -1):
            phrase = ' '.join(tokens[start:start + length])
            if phrase in self.synonyms:
                return self.synonyms[phrase], length
        return None

    def preprocess_text(self, text):
        """Clean a single text value and return the normalized string.

        ``None`` and NaN yield ``''``; any other non-string value is
        converted with ``str()`` before processing.
        """
        # Missing values arrive as None or as float NaN (NaN != NaN).
        if text is None or (isinstance(text, float) and text != text):
            return ''

        text = str(text).lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(text)

        words = []
        position = 0
        while position < len(tokens):
            # Phrases are matched on the raw tokens, before stop-word removal,
            # because keys such as 'loss of appetite' contain stop words.
            match = self._match_phrase(tokens, position)
            if match is not None:
                replacement, length = match
                words.append(replacement)
                position += length
                continue

            word = self._normalize_word(tokens[position])
            if word:
                words.append(word)
            position += 1

        # Joining an empty list already yields ''.
        return ' '.join(words)



In [None]:
# TextPreprocessing reads the English stop-word list via stopwords.words('english')
# when it is instantiated, so the corpus has to be fetched before that happens.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Fetch the tokenizer models and WordNet data used by word_tokenize and
# WordNetLemmatizer. The final call stays as the cell's last expression so
# its return value is still displayed.
for resource in ('punkt', 'wordnet', 'omw-1.4'):
    nltk.download(resource)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Step 4: Apply Preprocessing Pipeline
What: Apply the text cleaning steps to the 'Symptoms', 'Disease', and 'SyntheticTreatment' columns.
    
Why: This ensures all text data is cleaned and ready for modeling.
    
Why not any other: Other preprocessing methods could be used, but this pipeline is optimized for this specific dataset.

#

In [None]:
# ==================== Step 4: Apply Preprocessing Pipeline ====================
# Build a one-step pipeline and run it over each free-text column in turn

pipeline = Pipeline(steps=[('text_preprocessor', TextPreprocessing())])
for column in ('Symptoms', 'Disease', 'SyntheticTreatment'):
    df[column] = pipeline.fit_transform(df[column])

print("Text preprocessing applied")


Text preprocessing applied to Symptoms and Disease columns.


# Step 5: Label Encoding
What: Convert categorical columns to numeric format.

Why: Machine learning models can’t process text directly, so labels must be numeric.

Why not any other: One-hot encoding is an alternative but less efficient when there are many categories.

# Step 6: Remove Duplicates
What: Remove identical rows from the dataset.
    
Why: Duplicates can bias the model and lead to overfitting.
    
Why not any other: We could de-duplicate based on certain columns, but removing all duplicates ensures data quality.

In [None]:
# ==================== Step 6: Remove Duplicates ====================
# Drop rows that are exact duplicates across every column
df = df.drop_duplicates()
print("Duplicates removed.")


# ==================== Final Output ====================
# Report the final dataset shape and per-column summary.
# df.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
print(f"Final shape: {df.shape}")
df.info()


Duplicates removed.
Final shape: (23608, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23608 entries, 0 to 23607
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Disease             23608 non-null  object 
 1   Disease_Category    23608 non-null  object 
 2   Symptoms            23608 non-null  object 
 3   CommonAgeGroup      23608 non-null  object 
 4   Sex                 23608 non-null  object 
 5   Severity            23608 non-null  object 
 6   Specialist          23608 non-null  object 
 7   SyntheticTreatment  23608 non-null  object 
 8   Name                23608 non-null  object 
 9   Category            23608 non-null  object 
 10  Address/Details     23608 non-null  object 
 11  City                23608 non-null  object 
 12  Rating              23608 non-null  float64
 13  Mapped_Category     23608 non-null  object 
dtypes: float64(1), object(13)
memory usage: 2.5+ MB
None


# Step 7: Save Cleaned Dataset to CSV

In [None]:


# ==================== Step 7: Save Cleaned Dataset to CSV ====================
# Persist the processed frame next to the notebook, omitting the index column
df.to_csv(path_or_buf="pre-processed_Data.csv", index=False)
print("Cleaned dataset saved.")


Cleaned dataset saved to 'cleaned_merged_doctors_symptomss.csv'.


Batches:   0%|          | 0/1450 [00:00<?, ?it/s]

NameError: name 'embeddings' is not defined

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/738 [00:00<?, ?it/s]

Query: loss appetite queasiness appendicitis Consultation and standard treatment protocols. Stronger prescription drugs may be necessary Surgeon

Top 5 similar records:
1. loss appetite queasiness appendicitis Consultation and standard treatment protocols. Stronger prescription drugs may be necessary Surgeon (Distance: 0.0000)
2. loss appetite queasiness appendicitis Consultation and standard treatment protocols. Stronger prescription drugs may be necessary Surgeon (Distance: 0.0000)
3. loss appetite queasiness appendicitis Consultation and standard treatment protocols. Stronger prescription drugs may be necessary Surgeon (Distance: 0.0000)
4. loss appetite queasiness appendicitis Consultation and standard treatment protocols. Stronger prescription drugs may be necessary Surgeon (Distance: 0.0000)
5. loss appetite queasiness appendicitis Consultation and standard treatment protocols. Stronger prescription drugs may be necessary Surgeon (Distance: 0.0000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,merged_text
0,0.067780,-0.019581,0.027891,0.005417,-0.132982,-0.064865,0.009591,0.104205,0.014645,-0.041661,...,0.007168,0.020552,-0.002789,0.035411,-0.003905,0.002368,-0.030133,-0.034819,0.032954,loss appetite queasiness appendicitis Consulta...
1,0.002449,0.084551,0.067195,-0.007846,0.057168,0.034707,0.047531,0.065230,-0.107430,-0.023815,...,-0.028436,-0.001993,0.010754,0.019279,0.032318,-0.041302,-0.094305,-0.018032,0.081371,red patch itchy skin edema eczema Consultation...
2,0.008797,0.098840,0.097749,0.034257,0.045046,-0.008098,0.008618,0.040685,-0.119651,-0.044987,...,-0.003215,0.001757,0.020641,0.021483,-0.012747,-0.034317,-0.045247,0.003779,0.051655,dryness edema eczema Consultation and standard...
3,0.074995,0.024489,-0.012961,-0.005276,-0.000430,-0.041336,-0.069398,0.015965,-0.047688,-0.078393,...,0.063118,-0.071086,-0.026408,0.050151,0.002017,-0.032620,-0.106187,-0.001149,0.088913,cough chest tightness shortness breath wheezin...
4,0.024335,0.009320,0.003181,0.063697,-0.048778,-0.046102,0.051239,0.024591,-0.037199,-0.031355,...,0.024436,-0.040005,-0.026552,0.021365,-0.017529,0.060583,-0.102572,-0.063801,-0.053244,frequent urination lassitude diabetes Lifestyl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23603,-0.036360,0.045778,-0.019514,0.074850,-0.057245,0.020185,0.009215,0.046340,-0.059838,-0.025180,...,-0.010700,0.030369,-0.079616,-0.008549,-0.002302,-0.084960,-0.117064,0.044505,0.054295,"acne weight gain co Hormonal therapy, dietary ..."
23604,0.075132,-0.005410,0.019485,-0.009364,-0.124746,-0.095822,-0.028605,0.082852,0.022597,-0.044229,...,-0.026872,-0.001606,-0.020071,0.050534,0.019404,-0.002928,-0.058327,-0.024048,0.008054,loss appetite abdominal pain appendicitis Cons...
23605,0.038670,0.109182,0.065942,0.026304,0.076845,0.011726,0.062574,0.056635,-0.159934,-0.042123,...,0.001987,0.004568,0.032321,0.011879,-0.031435,-0.034787,-0.090094,0.001895,0.055809,itchy skin dryness eczema Consultation and sta...
23606,0.066794,0.068660,0.075667,0.102919,0.083553,-0.028517,-0.079386,0.011553,-0.039923,-0.060680,...,0.052580,-0.014083,0.094410,0.013074,-0.005463,0.030873,-0.070154,0.001259,0.065820,chest pain shortness breath sweating queasines...
