<a href="https://colab.research.google.com/github/itsdivgithub/Healthcare_Domain_Classification/blob/main/S5_BALANCED_DATA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scraping data from the MTSamples website


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Scrape the MTSamples browse pages into a DataFrame of (description, label) rows.
#
# Steps:
#   1. Fetch the index page and read the category links out of its 'list-group' <ul>.
#   2. Fetch every category page and pick the first table that has more than
#      five rows and at least one header cell.
#   3. For every <td> that contains a <br>, store the text node that follows the
#      first <br> as the description, labelled with the table's first header.
BASE_URL = 'https://mtsamples.com'
# Seconds. Without a timeout, requests can block forever on a stalled connection.
REQUEST_TIMEOUT = 30

url = "https://mtsamples.com/site/pages/browse.asp"
response = requests.get(url, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the index.
response.raise_for_status()
data = []

soup = BeautifulSoup(response.content, 'html.parser')

ul_element = soup.find('ul', class_='list-group')
if ul_element is None:
    # soup.find returns None when nothing matches; stop with a clear message
    # rather than an AttributeError on the next line.
    raise RuntimeError("Could not find the 'list-group' category list on " + url)

# href=True skips anchors without an href attribute (indexing them would raise KeyError).
a_tags = ul_element.find_all('a', href=True)
href_list = [a_tag['href'] for a_tag in a_tags]

# Print the extracted href attributes
for href in href_list:
    print(BASE_URL + href)

for parameter in href_list:
    response = requests.get(BASE_URL + parameter, timeout=REQUEST_TIMEOUT)
    if not response.ok:
        # Keep the scrape best-effort per page, but say why the page was skipped.
        print(parameter)
        print("Request failed with HTTP status", response.status_code)
        continue

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the table containing the data: the first one with more than five
    # rows and at least one header cell.
    table = next(
        (
            tbl for tbl in soup.find_all('table')
            if len(tbl.find_all('tr')) > 5 and tbl.find_all('th')
        ),
        None,
    )

    if table is None:
        print(parameter)
        print("No suitable table found on the webpage.")
        continue

    # Extract the table headers; the first header is used as the class label.
    label = [th.text for th in table.find_all('th')]

    # Extract the table rows
    for tr in table.find_all('tr'):
        for td in tr.find_all('td'):
            br_tag = td.find('br')
            if br_tag is None:
                continue

            description = br_tag.next_sibling
            # next_sibling is None when <br> is the last node and a Tag when more
            # markup follows; only a text node (a str subclass) has a usable .strip().
            if not isinstance(description, str):
                continue

            data.append([description.strip(), label[0]])


# Create a DataFrame from the extracted data
df = pd.DataFrame(data, columns=['description', 'label'])

https://mtsamples.com/site/pages/browse.asp?type=3-Allergy / Immunology
https://mtsamples.com/site/pages/browse.asp?type=94-Autopsy
https://mtsamples.com/site/pages/browse.asp?type=5-Bariatrics
https://mtsamples.com/site/pages/browse.asp?type=6-Cardiovascular / Pulmonary
https://mtsamples.com/site/pages/browse.asp?type=99-Chiropractic
https://mtsamples.com/site/pages/browse.asp?type=97-Consult - History and Phy.
https://mtsamples.com/site/pages/browse.asp?type=70-Cosmetic / Plastic Surgery
https://mtsamples.com/site/pages/browse.asp?type=17-Dentistry
https://mtsamples.com/site/pages/browse.asp?type=18-Dermatology
https://mtsamples.com/site/pages/browse.asp?type=44-Diets and Nutritions
https://mtsamples.com/site/pages/browse.asp?type=89-Discharge Summary
https://mtsamples.com/site/pages/browse.asp?type=93-Emergency Room Reports
https://mtsamples.com/site/pages/browse.asp?type=21-Endocrinology
https://mtsamples.com/site/pages/browse.asp?type=100-ENT - Otolaryngology
https://mtsamples.com

In [None]:
# Number of scraped rows per label.
df.groupby(by="label").size()


label
Allergy / Immunology                8
Autopsy                             8
Bariatrics                         18
Cardiovascular / Pulmonary        372
Chiropractic                       14
Consult - History and Phy.        515
Cosmetic / Plastic Surgery         27
Dentistry                          27
Dermatology                        30
Diets and Nutritions               10
Discharge Summary                 108
ENT - Otolaryngology               99
Emergency Room Reports             75
Endocrinology                      19
Gastroenterology                  230
General Medicine                  259
Hematology - Oncology              90
Hospice - Palliative Care           6
IME-QME-Work Comp etc.             16
Lab Medicine - Pathology            8
Letters                            24
Nephrology                         81
Neurology                         223
Neurosurgery                       94
Obstetrics / Gynecology           160
Office Notes                       53
Ophtha

In [None]:
# Display the DataFrame of scraped descriptions and their labels.
df

Unnamed: 0,description,label
0,A 23-year-old white female presents with compl...,Allergy / Immunology
1,"Acute allergic reaction, etiology uncertain, h...",Allergy / Immunology
2,Mother states he has been wheezing and coughing.,Allergy / Immunology
3,Patient having severe sinusitis about two to t...,Allergy / Immunology
4,Functional endoscopic sinus surgery with left ...,Allergy / Immunology
...,...,...
5008,Normal vasectomy,Urology
5009,Normal vasectomy,Urology
5010,Desire for sterility. Vasectomy. The vas was i...,Urology
5011,Fertile male with completed family. Elective m...,Urology


# Creating balanced data

In [None]:
!pip install nlpaug
!pip install transformers
import nlpaug.augmenter.word as naw
from nlpaug.util import Action




In [None]:
# Oversample every label up to `desired_samples` rows by appending
# synonym-augmented copies of randomly chosen descriptions, then save the
# combined frame to CSV.
import random  # stdlib; used only to seed the global generator below

# Separate the data into features and labels
X = df['description']
y = df['label']

# Define the desired number of samples per class
desired_samples = 1500

# Seed the global generators so the random choice of source rows is repeatable.
# NOTE(review): nlpaug presumably draws from these same global generators when
# picking words to replace; confirm before relying on identical augmented text.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Upper bound on augmentation calls per label, as a multiple of the number of
# samples still needed, so a label whose texts never augment cannot loop forever.
MAX_ATTEMPTS_FACTOR = 5

# Create empty lists for augmented data
augmented_X = []
augmented_y = []

# The augmenter does not depend on the label, so build it once.
augmenter = naw.SynonymAug(aug_src='wordnet')

# Iterate through each unique label
for label in y.unique():
    # Extract the samples of the current label
    samples = X[y == label]
    num_samples = len(samples)

    # Calculate the number of synthetic samples to generate
    num_synthetic_samples = desired_samples - num_samples
    if num_synthetic_samples <= 0:
        continue

    # Only non-empty text can be augmented. Empty rows stay in `df` and still
    # count towards `num_samples`, but are never picked as a source.
    candidates = samples.dropna()
    candidates = candidates[candidates.astype(str).str.strip() != '']
    if len(candidates) == 0:
        print("No non-empty descriptions to augment for label:", label)
        continue

    # Generate synthetic samples using augmentation. A failed augmentation is
    # retried with another source row, so the label still reaches its target
    # instead of silently ending up short.
    generated = 0
    attempts_left = MAX_ATTEMPTS_FACTOR * num_synthetic_samples
    while generated < num_synthetic_samples and attempts_left > 0:
        attempts_left -= 1
        random_sample_idx = np.random.randint(0, len(candidates))
        original_text = candidates.iloc[random_sample_idx]

        augmented_text = augmenter.augment(original_text)
        if isinstance(augmented_text, str):
            # Guard for an augmenter that returns a plain string: indexing [0]
            # on it would keep only the first character.
            augmented_text = [augmented_text]
        if not augmented_text:
            continue

        augmented_X.append(augmented_text[0])
        augmented_y.append(label)
        generated += 1

    if generated < num_synthetic_samples:
        print("Only generated", generated, "of", num_synthetic_samples,
              "synthetic samples for label:", label)

# Concatenate the original and augmented data
augmented_data = pd.DataFrame({'description': augmented_X, 'label': augmented_y})
balanced_data = pd.concat([df, augmented_data], ignore_index=True)

# Save the balanced data to a CSV file
balanced_data.to_csv('/content/balanced_data.csv', index=False)





In [None]:
# Non-null value count of every column, per label, after balancing.
balanced_data.groupby(by="label").count()

Unnamed: 0_level_0,description
label,Unnamed: 1_level_1
Allergy / Immunology,1500
Autopsy,1500
Bariatrics,1500
Cardiovascular / Pulmonary,1500
Chiropractic,1500
Consult - History and Phy.,1495
Cosmetic / Plastic Surgery,1500
Dentistry,1500
Dermatology,1500
Diets and Nutritions,1500
