In [156]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [157]:
# Load the train / test CSVs, discarding rows that have no tweet text.
train_data = pd.read_csv('train.csv').dropna(subset=['text'])
test_data = pd.read_csv('test.csv').dropna(subset=['text'])

# Split each frame into its model input (text) and its target (sentiment).
x_train, y_train = train_data['text'], train_data['sentiment']
x_test, y_test = test_data['text'], test_data['sentiment']

In [158]:
# Preview the raw tweet text column of the training frame.
display(x_train)

0                      I`d have responded, if I were going
1            Sooo SAD I will miss you here in San Diego!!!
2                                my boss is bullying me...
3                           what interview! leave me alone
4         Sons of ****, why couldn`t they put them on t...
                               ...                        
27476     wish we could come see u on Denver  husband l...
27477     I`ve wondered about rake to.  The client has ...
27478     Yay good for both of you. Enjoy the break - y...
27479                           But it was worth it  ****.
27480       All this flirting going on - The ATG smiles...
Name: text, Length: 27480, dtype: object

In [159]:
# Preview the sentiment label column of the training frame.
display(y_train)

0         neutral
1        negative
2        negative
3        negative
4        negative
           ...   
27476    negative
27477    negative
27478    positive
27479    positive
27480     neutral
Name: sentiment, Length: 27480, dtype: object

In [160]:
# preprocess data by masking urls / @-mentions and lowercasing all characters
def preprocess(data):
    """Normalize an iterable of tweet strings.

    Each tweet is split on whitespace. A token that starts with 'http' is
    replaced by 'http'; a token that starts with '@' and is longer than one
    character is replaced by '@user'. The tokens are re-joined with single
    spaces and the joined string is lowercased.

    Note that the prefix checks run before lowercasing, so they are
    case-sensitive (e.g. 'HTTP://...' is lowercased but not masked).

    Returns:
        A list with one cleaned string per input tweet, in input order.
    """
    def _mask(token):
        # Collapse links and user handles to fixed placeholder tokens.
        if token.startswith('http'):
            return 'http'
        if token.startswith('@') and len(token) > 1:
            return '@user'
        return token

    return [" ".join(map(_mask, tweet.split())).lower() for tweet in data]

# Rebind x_train / x_test from pandas Series to plain Python lists of cleaned
# strings (preprocess returns a list).
x_train = preprocess(x_train)
x_test = preprocess(x_test)
# NOTE(review): this displays the entire list of training tweets; consider
# showing only a slice (e.g. x_train[:10]) to keep the notebook output small.
display(x_train)
print(len(x_train))

['i`d have responded, if i were going',
 'sooo sad i will miss you here in san diego!!!',
 'my boss is bullying me...',
 'what interview! leave me alone',
 'sons of ****, why couldn`t they put them on the releases we already bought',
 'http - some shameless plugging for the best rangers forum on earth',
 '2am feedings for the baby are fun when he is all smiles and coos',
 'soooo high',
 'both of you',
 'journey!? wow... u just became cooler. hehe... (is that possible!?)',
 'as much as i love to be hopeful, i reckon the chances are minimal =p i`m never gonna get my cake and stuff',
 'i really really like the song love story by taylor swift',
 'my sharpie is running dangerously low on ink',
 'i want to go to music tonight but i lost my voice.',
 'test test from the lg env2',
 'uh oh, i am sunburned',
 's`ok, trying to plot alternatives as we speak *sigh*',
 'i`ve been sick for the past few days and thus, my hair looks wierd. if i didnt have a hat on it would look... http',
 'is back home

27480


In [161]:
# Tokenizer selection. The commented-out lines below are alternative checkpoints
# (T5, BERT uncased/cased, plain RoBERTa) — presumably earlier experiments.
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaModel, RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

# model_checkpoint = 't5-small'
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# model_checkpoint = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(model_checkpoint)

# model_checkpoint = 'bert-base-cased'
# tokenizer = BertTokenizer.from_pretrained(model_checkpoint)

# model_checkpoint = 'roberta-base'
# tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

# Active choice. NOTE(review): the same checkpoint name is repeated as a string
# literal where the models are built further down; reusing `model_checkpoint`
# there would keep tokenizer and model in sync.
model_checkpoint = 'roberta-base-openai-detector'
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

In [162]:
# Tokenize all tweets up front and return PyTorch tensors. padding=True pads each
# call's sequences to the longest one in that call; truncation caps them at
# max_length=128 tokens.
x_train_tokenized = tokenizer(x_train, padding=True, truncation=True, return_tensors='pt', max_length=128)
x_test_tokenized = tokenizer(x_test, padding=True, truncation=True, return_tensors='pt', max_length=128)

# Integer class ids for the three sentiment labels. The dict lookup raises
# KeyError for any label that is not one of these three strings.
y_train_mapped = {'neutral': 0, 'negative': 1, 'positive': 2}
y_train_encoded = [y_train_mapped[sentiment] for sentiment in y_train]

In [163]:
# Class balance of the training labels.
train_data['sentiment'].value_counts()

neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [164]:
# Sanity check: tokenize the first raw training tweet (taken from the DataFrame,
# i.e. before preprocess() was applied) and decode the ids back to text to see
# which special tokens the tokenizer adds.
encoded_input = tokenizer(train_data["text"][0])
print(encoded_input)
decoded_input = tokenizer.decode(encoded_input["input_ids"])
print(decoded_input)

{'input_ids': [0, 38, 12905, 417, 33, 2334, 6, 114, 38, 58, 164, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
<s> I`d have responded, if I were going</s>


In [165]:
class MyDataset(Dataset):
    """Map-style dataset over pre-tokenized inputs and their labels.

    The three containers are stored exactly as given and indexed in parallel.
    The dataset length is taken from ``input_ids``.
    """

    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids, self.attention_mask, self.labels = input_ids, attention_mask, labels

    def __len__(self):
        # input_ids is the reference container for the sample count.
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Pull the same position out of each parallel container; the dict keys
        # are the attribute names, in the order the training loop reads them.
        fields = ('input_ids', 'attention_mask', 'labels')
        return {name: getattr(self, name)[idx] for name in fields}

# Wrap the tokenized training tensors and the integer labels in a MyDataset so
# that each item is a dict with 'input_ids', 'attention_mask' and 'labels'.
train_dataset = MyDataset(
    input_ids=x_train_tokenized['input_ids'],
    attention_mask=x_train_tokenized['attention_mask'],
    labels=torch.tensor(y_train_encoded)
)

# Shuffled mini-batches of 16 examples for training.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [166]:
# Use the GPU when CUDA is available to PyTorch, otherwise fall back to the CPU.
print(torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

True
cuda


In [167]:
# model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
# model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
# Load the checkpoint with a 3-label classification head; ignore_mismatched_sizes=True
# lets the head be re-created when its shape differs from the one in the checkpoint.
# NOTE(review): `model` is rebound to MySentimentModel in a later cell before the
# optimizer is built, so this instance appears unused — confirm whether this load
# can be dropped.
model = RobertaForSequenceClassification.from_pretrained('roberta-base-openai-detector', num_labels=3, ignore_mismatched_sizes=True)


Some weights of the model checkpoint at roberta-base-openai-detector were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base-openai-detector and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiat

In [168]:
class MySentimentModel(nn.Module):
    """RoBERTa encoder followed by a two-layer classification head.

    The encoder's pooled output goes through dropout, a hidden linear layer
    with ReLU, and a final linear layer that emits one raw logit per class
    (no softmax is applied here).

    Args:
        num_classes: Number of output logits.
        pretrained_model_name: Checkpoint identifier passed to
            ``RobertaModel.from_pretrained``.
        hidden_dim: Width of the hidden layer of the head. Defaults to 2048,
            the value that used to be hard-coded.
        dropout: Dropout probability applied to the pooled output. Defaults
            to 0.1, the value that used to be hard-coded.
    """

    def __init__(self, num_classes, pretrained_model_name='roberta-base-openai-detector',
                 hidden_dim=2048, dropout=0.1):
        super().__init__()
        # Attribute names and creation order are kept stable so that
        # state_dict keys and weight initialization stay the same.
        self.roberta = RobertaModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(self.roberta.config.hidden_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The output of the final linear layer (one logit per class).
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Use the encoder's pooled representation as the sentence embedding.
        pooled_output = self.dropout(outputs.pooler_output)
        hidden = F.relu(self.fc1(pooled_output))
        return self.fc2(hidden)

In [169]:
# One output logit per sentiment label (neutral, negative, positive).
num_classes = 3
model = MySentimentModel(num_classes)

In [170]:
from torch.optim.lr_scheduler import StepLR
import torch.optim as optim

# Move the model's parameters to the selected device before building the optimizer.
model.to(device)

# Fine-tuning hyperparameters.
lr=3e-5
weight_decay=0.2

# AdamW over every parameter of the model (encoder and classification head alike).
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
# With step_size=1 and gamma=0.5 the learning rate is halved on every scheduler.step().
scheduler = StepLR(optimizer, step_size=1, gamma=0.5)
# CrossEntropyLoss consumes raw logits and integer class indices.
criterion = torch.nn.CrossEntropyLoss()

In [171]:
num_epochs = 3

for epoch in range(num_epochs):
    # Put the model in training mode (dropout active) for this pass over the data.
    model.train()
    total_steps = len(train_loader)
    total_loss = 0

    for step, batch in enumerate(train_loader):
        # Transfer every tensor of the mini-batch to the compute device.
        inputs, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear stale gradients, run the forward pass and score it.
        optimizer.zero_grad()
        logits = model(inputs, attention_mask=attention_mask)
        loss = criterion(logits, labels)

        # Back-propagate and apply the parameter update.
        loss.backward()
        optimizer.step()

        # Accumulate the scalar loss for the per-epoch average; the value held
        # by `loss` is not modified by the backward pass or the update.
        total_loss += loss.item()

        # Progress line every 50 optimizer steps.
        if (step + 1) % 50 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{step + 1}/{total_steps}], Loss: {loss.item():.4f}')

    average_loss = total_loss / total_steps
    print(f'Epoch [{epoch + 1}/{num_epochs}], Average Loss: {average_loss:.4f}')
    # Advance the learning-rate schedule once per epoch.
    scheduler.step()


Epoch [1/3], Step [50/1718], Loss: 0.9837
Epoch [1/3], Step [100/1718], Loss: 0.8362
Epoch [1/3], Step [150/1718], Loss: 0.5793
Epoch [1/3], Step [200/1718], Loss: 0.4500
Epoch [1/3], Step [250/1718], Loss: 0.4259
Epoch [1/3], Step [300/1718], Loss: 0.7012
Epoch [1/3], Step [350/1718], Loss: 0.6352
Epoch [1/3], Step [400/1718], Loss: 0.7092
Epoch [1/3], Step [450/1718], Loss: 0.6087
Epoch [1/3], Step [500/1718], Loss: 0.7034
Epoch [1/3], Step [550/1718], Loss: 0.7685
Epoch [1/3], Step [600/1718], Loss: 0.3922
Epoch [1/3], Step [650/1718], Loss: 0.7169
Epoch [1/3], Step [700/1718], Loss: 0.7736
Epoch [1/3], Step [750/1718], Loss: 0.6114
Epoch [1/3], Step [800/1718], Loss: 0.4564
Epoch [1/3], Step [850/1718], Loss: 0.4765
Epoch [1/3], Step [900/1718], Loss: 0.4903
Epoch [1/3], Step [950/1718], Loss: 0.5498
Epoch [1/3], Step [1000/1718], Loss: 0.5680
Epoch [1/3], Step [1050/1718], Loss: 0.4458
Epoch [1/3], Step [1100/1718], Loss: 0.4462
Epoch [1/3], Step [1150/1718], Loss: 0.1596
Epoch [1

In [172]:
import numpy as np
from sklearn.metrics import accuracy_score

# Evaluate on the test data after training.
# Inference runs in fixed-size mini-batches: sending every test tweet through
# RoBERTa in a single forward pass needs activation memory proportional to the
# whole test set and can exhaust GPU memory.
eval_batch_size = 64

model.eval()  # disable dropout so predictions are deterministic

# All tweets are padded to the same length here, so slicing the tensors into
# batches below feeds each example the same input as a single full pass would.
x_test_tokenized = tokenizer(x_test, padding=True, truncation=True, return_tensors='pt', max_length=128)
num_test_examples = x_test_tokenized['input_ids'].size(0)

predictions = []
with torch.no_grad():
    for start in range(0, num_test_examples, eval_batch_size):
        end = start + eval_batch_size
        # move only the current slice of test data to the device
        inputs_test = x_test_tokenized['input_ids'][start:end].to(device)
        attention_mask_test = x_test_tokenized['attention_mask'][start:end].to(device)

        outputs = model(inputs_test, attention_mask=attention_mask_test)
        # Predicted class = index of the largest logit for each example.
        predictions.extend(torch.argmax(outputs, dim=1).tolist())

# calculate accuracy (label ids use the same mapping as the training labels)
true_labels = y_test.map({'neutral': 0, 'negative': 1, 'positive': 2})
predicted_labels = np.array(predictions)
accuracy = accuracy_score(true_labels, predicted_labels)

print(f'Accuracy: {accuracy * 100:.2f}%')
# print("True Labels:", true_labels.tolist())
# print("Predicted Labels:", predicted_labels.tolist())

Accuracy: 80.39%
