In [None]:
import torch
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
import pandas as pd
import matplotlib.pyplot as plt
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, pipeline, get_scheduler
from tqdm.auto import tqdm

In [None]:
# Load the raw Reddit comments and preview the first rows before cleaning.
df = pd.read_csv('Reddit_Data.csv')
print(df.head())
# Discard every row that has a missing value in any column. The index is
# intentionally left as-is (not reset) after the rows are removed.
df = df.dropna()

In [None]:
plt.style.use('ggplot')

# Bar chart of how many rows fall into each value of the `category` column,
# ordered by category value.
category_counts = df['category'].value_counts().sort_index()
ax = category_counts.plot(kind='bar', figsize=(7, 7))
# The categories sit on the x-axis and the counts on the y-axis, so the
# description of the figure belongs in the title, not in the x-axis label.
ax.set_title('Count of different sentiments')
ax.set_xlabel('Sentiment category')
ax.set_ylabel('Number of comments');

In [None]:
# Wrap the cleaned DataFrame in a Hugging Face `Dataset` so it can be
# tokenized and split in the preprocessing cell.
redditData = Dataset.from_pandas(df)
# Bare last expression: the notebook displays the dataset object.
redditData

In [None]:
# Preprocessing
pretrained_model = 'microsoft/deberta-v3-xsmall'

tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

def tokenize(batch):
    """Tokenize the `clean_comment` texts of a batch with truncation enabled.

    No padding is applied here; padding is left to the data collator so each
    batch is only padded to the length it actually needs.
    """
    return tokenizer(batch['clean_comment'], truncation=True)

# Class ids passed to the classification loss must be integers in
# [0, num_labels). The raw `category` values are therefore mapped onto
# 0..K-1 in sorted order. If they are already 0..K-1 the mapping is the
# identity; if the CSV uses another coding (e.g. -1/0/1, or floats produced
# by pandas -- NOTE(review): raw values are not visible here, confirm) this
# makes them valid class ids. `id2label` allows predictions to be mapped back.
label2id = {value: idx for idx, value in enumerate(sorted(redditData.unique('category')))}
id2label = {idx: value for value, idx in label2id.items()}

def encode_labels(batch):
    """Translate the raw `category` values of a batch into integer class ids."""
    return {'labels': [label2id[value] for value in batch['category']]}

tokenizedData = redditData.map(tokenize, batched=True)
tokenizedData = tokenizedData.map(encode_labels, batched=True)

# Keep only the tokenizer outputs and `labels`. `__index_level_0__` exists
# only when a non-default pandas index was carried into the Dataset, so the
# columns are filtered against what is actually present instead of assuming it.
unused_columns = [
    name
    for name in ('clean_comment', 'category', '__index_level_0__')
    if name in tokenizedData.column_names
]
tokenizedData = tokenizedData.remove_columns(unused_columns)
tokenizedData.set_format('torch')

data = tokenizedData.train_test_split(train_size=0.8, seed=42)

# The collator pads the variable-length examples of each batch to a common
# length when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): batch_size=1 is kept from the original; a larger batch would
# train considerably faster if memory allows.
train_dataloader = DataLoader(data['train'], shuffle=True, batch_size=1, collate_fn=data_collator)
# Evaluation order does not affect metrics, so the test loader is not shuffled;
# this keeps evaluation runs reproducible.
test_dataloader = DataLoader(data['test'], shuffle=False, batch_size=1, collate_fn=data_collator)

In [None]:
# Pull a single collated batch from the training loader to inspect its layout.
batch = next(iter(train_dataloader))

# Shape of every tensor in that batch, keyed by field name.
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
# Load the `pretrained_model` checkpoint as a sequence-classification model
# configured for three output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model,
    num_labels=3,
)

In [None]:
# Sanity check on the batch fetched earlier: show its labels ...
print(batch.get('labels'))

# ... then run one forward pass and report the loss and the logits' shape.
outputs = model(**batch)
print(outputs.loss, outputs.logits.size())

In [None]:
num_epochs = 3

# Adam over all model parameters with a base learning rate of 5e-5.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# The training loop steps the scheduler once per batch, so the total number
# of steps is (batches per epoch) x (epochs).
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warmup steps, spanning the whole training run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(num_training_steps)

In [None]:
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)
print(device)

In [None]:
# Fine-tuning loop: one optimizer step and one scheduler step per batch,
# repeated for `num_epochs` passes over the training loader.
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model was moved to.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear
        # the gradients so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)