## Installations & imports

In [1]:
# !pip install fitz

In [2]:
!pip install --upgrade pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.0


## Extraction of text data

In [38]:
import fitz  # PyMuPDF  # to extract headings too based on fonts and layouts

symptoms data of 12 disease categories used

In [29]:
import os
import fitz  # PyMuPDF

def extract_headings_and_content(pdf_path):
    doc = fitz.open(pdf_path)
    content_dict = {}

    current_heading = None
    buffer = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        for block in blocks:
            if block["type"] != 0:
                continue

            for line in block["lines"]:
                spans = line["spans"]
                if not spans:
                    continue

                line_text = " ".join(span["text"].strip() for span in spans if span["text"].strip())
                norm_text = line_text.strip()

                if not norm_text:
                    continue

                if (
                    len(norm_text.split()) < 10 and
                    norm_text[0].isupper() and
                    not norm_text.endswith(".")
                ):
                    if current_heading and buffer:
                        content_dict[current_heading] = " ".join(buffer).strip()
                        buffer.clear()
                    current_heading = norm_text
                    continue

                if current_heading:
                    buffer.append(norm_text)

    if current_heading and buffer:
        content_dict[current_heading] = " ".join(buffer).strip()

    return content_dict

# Main loop: go from 1 to 12
all_data = []

for i in range(1, 13):
    filename = f"{i}."
    matched_file = next((f for f in os.listdir() if f.startswith(filename) and f.endswith(".pdf")), None)

    if matched_file:
        # Extract disease category from filename (everything after the first dot, without .pdf)
        category = matched_file.split(".", 1)[1].replace(".pdf", "").strip().lower()

        # Extract content
        content_dict = extract_headings_and_content(matched_file)

        # Append with category tag
        for disease, description in content_dict.items():
            all_data.append({
                "disease": disease,
                "description": description,
                "category": category
            })

# Convert to DataFrame
import pandas as pd
disease_df = pd.DataFrame(all_data)

# Preview
disease_df.head()


Unnamed: 0,disease,description,category
0,Impetigo,Common nonbullous impetigo typically begins wi...,skin
1,"Folliculitis, furunculosis, and carbunculosis",Pustules of folliculitis usually appear in a h...,skin
2,Staphylococcal scalded skin syndrome,SSSS can usually be traced to a prodromal uppe...,skin
3,Tinea versicolor,Tinea versicolor typically produces raised or ...,skin
4,Dermatophytosis,Lesions vary in appearance depending on the si...,skin


In [30]:
disease_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316 entries, 0 to 315
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   disease      316 non-null    object
 1   description  316 non-null    object
 2   category     316 non-null    object
dtypes: object(3)
memory usage: 7.5+ KB


# Pre-process content dict

In [33]:
import spacy

# Load SpaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Define additional stopwords (medical + adjectives)
domain_stopwords = {
    "disease", "disorder", "syndrome", "condition", "chronic", "acute",
    "infection", "inflammatory", "clinical", "genetic", "congenital",
    "rare", "common", "serious", "severe", "mild", "symptom", "causes",
    "associated", "involves", "affects", "typically", "usually"
}

stop_words = nlp.Defaults.stop_words.union(domain_stopwords)

def preprocess_text_spacy(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    doc = nlp(text)

    processed_tokens = [
        token.lemma_ for token in doc
        if not token.is_stop
        and token.lemma_ not in stop_words
        and token.pos_ != "ADJ"
        and token.is_alpha
    ]

    return ' '.join(processed_tokens)

# Assuming your DataFrame is called df
disease_df['description_processed'] = disease_df['description'].apply(preprocess_text_spacy)


In [34]:
disease_df.head(20)

Unnamed: 0,disease,description,category,description_processed
0,Impetigo,Common nonbullous impetigo typically begins wi...,skin,impetigo begin macule turn vesicle pustule ves...
1,"Folliculitis, furunculosis, and carbunculosis",Pustules of folliculitis usually appear in a h...,skin,pustule folliculitis appear hair follicle scal...
2,Staphylococcal scalded skin syndrome,SSSS can usually be traced to a prodromal uppe...,skin,ssss trace tract possibly purulent conjunctivi...
3,Tinea versicolor,Tinea versicolor typically produces raised or ...,skin,tinea versicolor produce raise round oval slig...
4,Dermatophytosis,Lesions vary in appearance depending on the si...,skin,lesion vary appearance depend site invasion in...
5,Scabies,"Typically, scabies causes itching, which inten...",skin,scabie cause itching intensify night lesion ex...
6,Cutaneous larva migrans,"A transient rash, tingling, or, possibly, a sm...",skin,tingling possibly vesicle appear point penetra...
7,Pediculosis,Clinical features of pediculosis capitis inclu...,skin,feature pediculosis capitis include itching ex...
8,Acne vulgaris,"The acne plug may appear as a closed comedo, o...",skin,acne plug appear comedo whitehead protrude fol...
9,Hirsutism,Hirsutism typically produces enlarged hair fol...,skin,hirsutism produce enlarge hair follicle enlarg...


# Classification Task

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

## Data preparation for modelling

In [35]:
# 1. Features and target
X = disease_df['description_processed']
y = disease_df['category']

# 2. Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


## Different Model's performance

In [41]:
# 3. TF-IDF Vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.95)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 4. Define and train models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    "Linear SVC": LinearSVC(class_weight='balanced', random_state=42),
    "Multinomial NB": MultinomialNB()
}

# 5. Evaluate each model
for name, model in models.items():
    print(f"\n Evaluating: {name}")
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))



 Evaluating: Logistic Regression
Accuracy: 0.734375
Classification Report:
                   precision    recall  f1-score   support

       endocrine       1.00      1.00      1.00         4
             ent       1.00      0.60      0.75         5
             eye       0.80      1.00      0.89         4
gastrointestinal       0.86      0.86      0.86         7
         genetic       0.50      0.75      0.60         4
          immune       1.00      0.17      0.29         6
       mneoplasm       0.44      0.57      0.50         7
      neurologic       0.80      0.80      0.80         5
           obgyn       0.71      0.83      0.77         6
           renal       1.00      0.75      0.86         4
     respiratory       1.00      0.71      0.83         7
            skin       0.56      1.00      0.71         5

        accuracy                           0.73        64
       macro avg       0.81      0.75      0.74        64
    weighted avg       0.80      0.73      0.72    