# Approach:Transfer Learning

In [10]:
!pip install datasets



In [11]:
# Import libraries
import numpy as np
import pandas as pd
from datasets import load_dataset
from datasets import load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

In [12]:
class Config:
  train_data_path = "/content/train.tsv"
  test_data_path = "/content/test.tsv"
  checkpoint = "siebert/sentiment-roberta-large-english"
  n_classes = 5
  batch_size= 64
  learning_rate = 3e-5
  n_epochs = 3
  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [13]:
train_data = pd.read_csv(Config.train_data_path,sep='\t')
test_data = pd.read_csv(Config.test_data_path,sep='\t')
test_data.sample(5)

Unnamed: 0,PhraseId,SentenceId,Phrase
43383,199444,10609,a strong case
7747,163808,8856,it 's churning ground that has long passed the...
7942,164003,8864,alive
26129,182190,9734,supposed to be a gift
22975,179036,9577,the way of insights


In [14]:
tokenizer = AutoTokenizer.from_pretrained(Config.checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(Config.checkpoint,num_labels=Config.n_classes, ignore_mismatched_sizes=True)



pytorch_model.bin:  72%|#######1  | 1.02G/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at siebert/sentiment-roberta-large-english and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 1024]) in the checkpoint and torch.Size([5, 1024]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def tokenize_function(example):
    return tokenizer(str(example), truncation=True)

tokenized_datasets = train_data["Phrase"].map(tokenize_function)
tokenized_datasets_test = test_data["Phrase"].map(tokenize_function)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
sent_list = train_data["Sentiment"].to_list()
for i in range(len(tokenized_datasets)):
  tokenized_datasets[i]['label'] = sent_list[i]
print(tokenized_datasets[1])

{'input_ids': [0, 250, 651, 9, 11363, 1115, 4216, 16987, 5, 2329, 1580, 14, 99, 16, 205, 13, 5, 29910, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 2}


In [17]:
train_loader = DataLoader(tokenized_datasets,shuffle=True,batch_size=Config.batch_size,collate_fn=data_collator)
test_loader = DataLoader(tokenized_datasets_test,batch_size=Config.batch_size,collate_fn = data_collator)

In [24]:
optimizer = AdamW(model.parameters(),lr=Config.learning_rate)
model.to(Config.device)

num_train_steps = Config.n_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_train_steps
)
progress_bar = tqdm(range(num_train_steps))
model.train()
for epochs in range(Config.n_epochs):
  for batch in train_loader:
    batch = {k: v.to(Config.device) for k,v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)





  0%|          | 0/7317 [00:00<?, ?it/s]

Exception ignored in: <function tqdm.__del__ at 0x7e400b692a70>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/usr/local/lib/python3.10/dist-packages/tqdm/notebook.py", line 282, in close
    self.disp(bar_style='success', check_delay=False)
AttributeError: 'tqdm' object has no attribute 'disp'


KeyboardInterrupt: 

In [26]:
model.eval()
test_predictions = []
for batch in test_loader:
  batch = {k: v.to(Config.device) for k,v in batch.items()}
  with torch.no_grad():
    outputs = model(**batch)
  logits = outputs.logits
  predictions = torch.argmax(logits,dim=-1)
  test_predictions.extend(predictions)

In [31]:
test_predictions = [i.item() for i in test_predictions]
test_id = test_data["PhraseId"]

In [33]:
submission = pd.DataFrame(list(zip(test_id,test_predictions)),columns=['PhraseId','Sentiment'])

In [34]:
submission.to_csv("submission.csv",index=False)