#### Step 1: Prepare the dataset
##### Dataset collection
Collect a dataset of anonymized patient feedback categorized by sentimentâ€”positive, neutral, and negative. Preprocessing includes cleaning, tokenizing, and splitting the data.

In [1]:
import pandas as pd
import re
from transformers import AutoTokenizer

# Load the pre-trained BERT tokenizer for "bert-base-uncased" model
# This tokenizer converts text into tokens that the BERT model can understand,
# including handling special tokens, word piece tokenization, and vocabulary mapping
# Note: The generated tokens are specific to BERT's vocabulary and tokenization rules,
# so they're primarily designed for BERT-based models, though some other transformer models
# may use similar tokenization schemes
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Create a noisy dataset
data_dict = {
    "text": [
        "  The staff was very kind and attentive to my needs!!!  ",
        "The waiting time was too long, and the staff was rude. Visit us at http://hospitalreviews.com",
        "The doctor answered all my questions...but the facility was outdated.   ",
        "The nurse was compassionate & made me feel comfortable!! :) ",
        "I had to wait over an hour before being seen.  Unacceptable service! #frustrated",
        "The check-in process was smooth, but the doctor seemed rushed. Visit https://feedback.com",
        "Everyone I interacted with was professional and helpful.  "
    ],
    "label": ["positive", "negative", "neutral", "positive", "negative", "neutral", "positive"]
}

# Convert dataset to a DataFrame
data = pd.DataFrame(data_dict)

# Clean the text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

data["cleaned_text"] = data["text"].apply(clean_text)

# Convert labels to integers
label_map = {"positive": 0, "neutral": 1, "negative": 2}
data["label"] = data["label"].map(label_map)

# Tokenize the cleaned text
# add_special_tokens=True: Tokenize text and add special tokens (CLS(101), SEP(102)) for model compatibility
data['tokenized'] = data['cleaned_text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# Pad or truncate to fixed length (e.g., 128 tokens)
data['padded_tokenized'] = data['tokenized'].apply(
    lambda x: x + [tokenizer.pad_token_id] * (128 - len(x)) if len(x) < 128 else x[:128]
)

# Preview cleaned and labeled data
print(data[['cleaned_text', 'label', 'padded_tokenized']].head())

  from .autonotebook import tqdm as notebook_tqdm


                                        cleaned_text  label  \
0  the staff was very kind and attentive to my needs      0   
1  the waiting time was too long and the staff wa...      2   
2  the doctor answered all my questionsbut the fa...      1   
3  the nurse was compassionate made me feel comfo...      0   
4  i had to wait over an hour before being seen u...      2   

                                    padded_tokenized  
0  [101, 1996, 3095, 2001, 2200, 2785, 1998, 2012...  
1  [101, 1996, 3403, 2051, 2001, 2205, 2146, 1998...  
2  [101, 1996, 3460, 4660, 2035, 2026, 3980, 8569...  
3  [101, 1996, 6821, 2001, 29353, 2081, 2033, 251...  
4  [101, 1045, 2018, 2000, 3524, 2058, 2019, 3178...  
