# Data

In [None]:
!pip install datasets

In [3]:
from datasets import load_dataset

In [36]:
dataset = load_dataset("imdb", split = "train[:25000]")

In [37]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [38]:
df = dataset.to_pandas()
value_counts = df['label'].value_counts()
value_counts

0    12500
1    12500
Name: label, dtype: int64

# Pre Processing

## Removing special characters or HTML tags

In [39]:
from bs4 import BeautifulSoup
import re

In [40]:
#function to clean text and remove special characters and HTML tags

def clean_text(text):

    #Remove HTML tags
    bs = BeautifulSoup(text, "html.parser")
    text = bs.get_text(separator=" ")

    #remove special characters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)

    #remove unecessary whitespaces
    text = " ".join(text.split())

    return text

In [42]:
dataset = dataset.map(lambda training_sample: {"text": clean_text(training_sample["text"])})

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

  bs = BeautifulSoup(text, "html.parser")


In [70]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

# Train-Validation Split

In [57]:
from sklearn.model_selection import train_test_split

In [61]:
features = dataset['text']
labels = dataset['label']

In [62]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

In [63]:
train_dataset = {"text": train_texts, "label": train_labels}
val_dataset = {"text": val_texts, "label": val_labels}

In [66]:
from datasets import Dataset

In [68]:
train_dataset = Dataset.from_dict(train_dataset)
val_dataset = Dataset.from_dict(val_dataset)

In [69]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 20000
})

# Knowledge Distillation

In [47]:
from transformers import BertTokenizer, BertForSequenceClassification

In [48]:
teacher = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
tokenized_data = tokenizer(dataset["text"], truncation=True, padding=True, return_tensors="pt")