# Step 1: Load Dataset
What: Loads the CSV data into a pandas DataFrame.

Why: This is the starting point for all further data processing.

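Why not any other: Alternatives such as Python's built-in `csv` module would also work, but pandas is fast, widely used in data science, and returns a DataFrame that every later step operates on directly.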


In [None]:
!pip install nltk
!pip install pyspellchecker


Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.2


In [None]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset is read from under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import pandas as pd

# Quick look at the data: parse only the first five rows.
# The complete file (4.3M rows) is loaded once, by the "Step 1: Load Dataset" cell
# that follows, so reading the whole CSV here as well would only double the load time.
df_preview = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/merged_doctors_symptomss.csv',
    nrows=5,
)
df_preview


Unnamed: 0,Disease,Disease_Category,Symptoms,CommonAgeGroup,Sex,Severity,Specialist,SyntheticTreatment,Name,Category,Address/Details,City,Rating,Mapped_Category
0,Diabetes,endocrinologist,"Increased thirst, Blurred vision",Child,Female,Severe,Endocrinologist,"Lifestyle changes, regular medication, and per...",Dr Arslan Shamim best diabetes stomach and liv...,endocrinologist,"Doctor G8MV+X55 Ammar Medical Complex, Jail Rd...",Lahore,4.9,endocrinologist
1,Diabetes,endocrinologist,"Increased thirst, Blurred vision",Child,Female,Severe,Endocrinologist,"Lifestyle changes, regular medication, and per...",Dr Syed Sibtain Ul Hassan (Internal Medicine S...,"internal medicine specialist, endocrinologist","Hospital Ihsan mumtaz hospital, E BLOCK",Lahore,5.0,endocrinologist
2,Diabetes,endocrinologist,"Increased thirst, Blurred vision",Child,Female,Severe,Endocrinologist,"Lifestyle changes, regular medication, and per...",Dr Muhammad Murtaza Shafqat,endocrinologist,Endocrinologist Hameed Latif Hospital,Lahore,5.0,endocrinologist
3,Diabetes,endocrinologist,"Increased thirst, Blurred vision",Child,Female,Severe,Endocrinologist,"Lifestyle changes, regular medication, and per...",Dr Awais Muhammad Butt,endocrinologist,Endocrinologist Omar Hospital and Cardiac Cent...,Lahore,5.0,endocrinologist
4,Diabetes,endocrinologist,"Increased thirst, Blurred vision",Child,Female,Severe,Endocrinologist,"Lifestyle changes, regular medication, and per...",Dr Fahad Khawaja,endocrinologist,"Endocrinologist Plot 411Block G Gulshan, e Ravi",Lahore,5.0,endocrinologist


In [None]:
# ==================== Step 1: Load Dataset ====================
# Read the merged doctors/symptoms CSV from Drive into a DataFrame and report its size

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/merged_doctors_symptomss.csv')
print("Dataset loaded successfully!", f"Initial shape: {df.shape}", sep="\n")

Dataset loaded successfully!
Initial shape: (4338629, 14)


# Step 2: Handle Missing Values
What: Replace missing values with defaults like "Unknown" or 0.
    
Why: Missing data can mess with the analysis, so filling it prevents errors.
    
Why not any other: Dropping rows or columns with missing data could cause loss of important information.

In [None]:

# ==================== Step 2: Handle Missing Values ====================
# Replace missing values in important columns with default values.
# Every column is filled through one DataFrame-level call. Filling a selected
# column in place (df['Rating'].fillna(..., inplace=True)) acts on an intermediate
# object, and pandas warns that this will no longer update df in pandas 3.0.

df.fillna({'Disease': 'Unknown', 'Symptoms': 'Unknown', 'Rating': 0}, inplace=True)
print("Missing values handled.")

Missing values handled.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].fillna(0, inplace=True)


# Step 3: Text Preprocessing
What: Clean and prepare the text for modeling (convert to lowercase, remove punctuation, etc.).
    
Why: Preprocessing standardizes the text, making it easier for models to interpret.
    
Why not any other: A simpler method like stemming could be used, but it chops words down to crude stems that are often not real words; lemmatization returns dictionary forms, which is more accurate for medical terms.

In [None]:

# ==================== Step 3: Text Preprocessing ====================
# Custom transformer class for text cleaning (lowercasing, punctuation removal, etc.)


class TextPreprocessing(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that normalizes free-text strings.

    Each input string is lowercased, stripped of ASCII punctuation, tokenized,
    filtered for English stopwords, lemmatized, spell-corrected and passed
    through a small synonym map, then re-joined into a single space-separated
    string.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.spell_checker = SpellChecker()
        self.stop_words = set(stopwords.words('english'))
        self.synonyms = {'fever': 'pyrexia', 'coughing': 'cough'}  # Example synonyms for medical terms

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer learns nothing from the data."""
        return self

    def transform(self, X):
        """Clean every string in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw text values.

        Returns
        -------
        list of str
            Cleaned strings, one per input element, in the same order.
        """
        # Cleaning a string is deterministic and comparatively expensive (spell
        # correction per token), so each distinct input is processed only once
        # per call and the result reused for its repeats.
        cleaned_by_text = {}
        results = []
        for text in X:
            if text not in cleaned_by_text:
                cleaned_by_text[text] = self.preprocess_text(text)
            results.append(cleaned_by_text[text])
        return results

    def preprocess_text(self, text):
        """Return the cleaned, space-joined form of a single string ``text``."""
        text = text.lower()  # Lowercasing
        text = text.translate(str.maketrans('', '', string.punctuation))  # Punctuation removal
        words = word_tokenize(text)  # Tokenize text
        words = [word for word in words if word not in self.stop_words]  # Remove stopwords
        words = [self.lemmatizer.lemmatize(word) for word in words]  # Lemmatization

        # Spell correction. The checker returns None when it has no candidate for
        # a token; keep the original token in that case so the join below never
        # receives None.
        corrected = []
        for word in words:
            suggestion = self.spell_checker.correction(word)
            corrected.append(word if suggestion is None else suggestion)

        words = [self.synonyms.get(word, word) for word in corrected]  # Synonym handling
        return ' '.join(words)  # Rejoin words into a clean string

In [None]:
import nltk

# Corpus behind stopwords.words('english'), which TextPreprocessing reads on construction.
STOPWORDS_PACKAGE = 'stopwords'
nltk.download(STOPWORDS_PACKAGE)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Remaining NLTK data packages: the punkt/punkt_tab tokenizer models and the
# wordnet/omw-1.4 data (presumably what word_tokenize and WordNetLemmatizer load).
download_results = [
    nltk.download(package)
    for package in ('punkt', 'wordnet', 'omw-1.4', 'punkt_tab')
]
download_results[-1]  # keep the final download's return value as the cell output

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Step 4: Apply Preprocessing Pipeline
What: Apply the text cleaning steps to the 'Symptoms' and 'Disease' columns.
    
Why: This ensures all text data is cleaned and ready for modeling.
    
Why not any other: Other preprocessing methods could be used, but this pipeline is optimized for this specific dataset.

#

In [None]:
# ==================== Step 4: Apply Preprocessing Pipeline ====================
# Build a one-step pipeline around TextPreprocessing and run it over each text column

pipeline = Pipeline(steps=[('text_preprocessor', TextPreprocessing())])
for column in ('Symptoms', 'Disease'):
    # fit_transform returns the cleaned strings in row order; write them back in place
    df[column] = pipeline.fit_transform(df[column])
print("Text preprocessing applied to Symptoms and Disease columns.")


Text preprocessing applied to Symptoms and Disease columns.


# Step 5: Remove Duplicates
What: Remove identical rows from the dataset.
    
Why: Duplicates can bias the model and lead to overfitting.
    
Why not any other: We could de-duplicate on a subset of columns, but that risks discarding records that legitimately share some fields; dropping only rows that are identical in every column is the safe choice.

In [None]:
# ==================== Step 5: Remove Duplicates ====================
# Drop rows that are identical across every column
df.drop_duplicates(inplace=True)
print("Duplicates removed.")


# ==================== Final Output ====================
# Check final dataset shape
print(f"Final shape: {df.shape}")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
df.info()


Duplicates removed.
Final shape: (4338629, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4338629 entries, 0 to 4338628
Data columns (total 14 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Disease             object 
 1   Disease_Category    object 
 2   Symptoms            object 
 3   CommonAgeGroup      object 
 4   Sex                 object 
 5   Severity            object 
 6   Specialist          object 
 7   SyntheticTreatment  object 
 8   Name                object 
 9   Category            object 
 10  Address/Details     object 
 11  City                object 
 12  Rating              float64
 13  Mapped_Category     object 
dtypes: float64(1), object(13)
memory usage: 463.4+ MB
None


# Step 6: Save Cleaned Dataset to CSV

In [None]:
# ==================== Step 6: Save Cleaned Dataset to CSV ====================

# Write the cleaned frame to Drive, leaving the row index out of the file
cleaned_csv_path = '/content/drive/MyDrive/Colab Notebooks/cleaned_merged_doctors_symptomss.csv'
df.to_csv(cleaned_csv_path, index=False)

# Show the first rows followed by the (rows, columns) tuple
print(df.head(), df.shape, sep='\n')


    Disease Disease_Category                         Symptoms CommonAgeGroup  \
0  diabetes  endocrinologist  increased thirst blurred vision          Child   
1  diabetes  endocrinologist  increased thirst blurred vision          Child   
2  diabetes  endocrinologist  increased thirst blurred vision          Child   
3  diabetes  endocrinologist  increased thirst blurred vision          Child   
4  diabetes  endocrinologist  increased thirst blurred vision          Child   

      Sex Severity       Specialist  \
0  Female   Severe  Endocrinologist   
1  Female   Severe  Endocrinologist   
2  Female   Severe  Endocrinologist   
3  Female   Severe  Endocrinologist   
4  Female   Severe  Endocrinologist   

                                  SyntheticTreatment  \
0  Lifestyle changes, regular medication, and per...   
1  Lifestyle changes, regular medication, and per...   
2  Lifestyle changes, regular medication, and per...   
3  Lifestyle changes, regular medication, and per...   
4  L