# RoBERTa for Sentiment Analysis

# Imports and installs

In [1]:
import nltk
import string
import re
import time
import numpy as np
import pandas as pd

import json
from tqdm.notebook import tqdm
from uuid import uuid4


from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split


In [10]:

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing.sequence import pad_sequences

## PyTorch Transformer
from pytorch_transformers import RobertaModel, AutoTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

In [3]:
## Check if Cuda is Available
print(torch.cuda.is_available())

True


# Einlesen

Apple Sentiment

In [2]:
data = pd.read_csv("data/sentiment/datasets_652925_1154930_apple-twitter-sentiment-texts.csv")

data.sentiment = data.sentiment.apply(lambda x: x + 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


## US Airline Sentiment

In [3]:
data = pd.read_csv("data/sentiment/Tweets.csv")

data = data[['text', 'airline_sentiment']]
data.rename(columns={"airline_sentiment" : "sentiment"}, inplace=True)

thisdict =	{
  "negative": 0,
  "neutral": 1,
  "positive": 2
}
data.sentiment = data.sentiment.apply(lambda x: thisdict[x])


## T4SA

In [3]:
tweets = pd.read_csv("data/sentiment/raw_tweets_text.csv")
sentiments = pd.read_csv("data/sentiment/t4sa_text_sentiment.csv",delimiter = "\t")

tweets.set_index(tweets.id, inplace=True)
sentiments.set_index(sentiments.TWID, inplace=True)
data=tweets.join(sentiments)
data.dropna(inplace=True)
data.drop(columns=['id', 'TWID'], inplace=True)
data["sentiment"] = data[['NEU', 'NEG', 'POS']].idxmax(axis=1)

data = data[['text', 'sentiment']]

thisdict =	{
  "NEG": 0,
  "NEU": 1,
  "POS": 2
}
data.sentiment = data.sentiment.apply(lambda x: thisdict[x])


General Text Cleaning

In [4]:
data.text = data.text.str.lower()

data.text = data.text.apply(lambda x:re.sub(r'http\S+', '', x))

tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
data.text = data.text.apply(lambda x: tokenizer.tokenize(x))

data.text = data.text.apply(lambda x: ' '.join(x))

data.text = data.text.map(lambda x : x.translate(str.maketrans('', '', string.punctuation)))

data.text = data.text.str.replace("[0-9]", " ")

data.text = data.text.str.strip(string.whitespace)

df_train, df_test = train_test_split(data, test_size=0.33, random_state=42)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [5]:
df_train

Unnamed: 0,text,sentiment
0,for all who served and fought for our country ...,2
1,rt florida teen only fourth person in last ...,1
2,wireless digital lcd color baby monitor camera...,1
3,found another one and i hope this isnt coming ...,0
4,happy thanksgiving im thankful you all cant draw,2
...,...,...
790566,rt ok but can we talk about how kelsi from hi...,1
790567,rt and this time with link thanks writing...,2
790568,living for music focus on europe news news ...,1
790569,world of final fantasy cinematic anime openin...,1


# RoBERTa Configuration
Only for non simple Training

Tokenization

In [6]:
config = RobertaConfig.from_pretrained('roberta-base')
# Set number of output labels
config.num_labels = 3

In [7]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base', padding='longest')
model = RobertaForSequenceClassification(config)

In [24]:
def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    ## Zero-pad sequence lenght
    if zero_pad:
        while len(tokens) < max_seq_length:
            tokens.append(tokenizer.pad_token)
            
    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [12]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        text = self.data.text[index]
        label = self.data.sentiment[index]
        X, _  = prepare_features(text)
        y = label
        #y = label_to_ix[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [25]:
training_set = Intents(df_train)
testing_set = Intents(df_test)

In [26]:
training_set.__getitem__(0)[0]

tensor([[   0,   13,   70,   54, 1665,    8, 4951,   13,   84,  247, 1437,   52,
         3392,   47, 1437, 6641, 1182,  594,  366, 8649, 2463, 4823, 1208,    2]])

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()

Create model

In [28]:
# Parameters
params = {'batch_size': 1,
          'shuffle': True,
          'drop_last': False,
          'num_workers': 4}

In [29]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [30]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [22]:
## Test Forward Pass
inp = training_set.__getitem__(1002)[0].cuda()
output = model.forward(inp)[0]
print(output)
print(inp)


print(torch.max(output, 1))

tensor([[-0.2494, -0.0248,  0.5536]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[    0,   910,    90,  1437,   122,    15,   740, 15688,  7843, 33465,
          3141,  1437, 22437,  1576,    15,  1923,  4361,  1437,  1183,    24,
           697,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     2]], device='cuda:0')
torch.return_types.max(
values=tensor([0.5536], device='cuda:0', grad_fn=<MaxBackward0>),
indices=tensor([2], device='cuda:0'))


# Training (non simple)

In [31]:
max_epochs = 2
model = model.train()
for epoch in tqdm(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%1000 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

EPOCH -- 0
Iteration: 0. Loss: 1.234860897064209. Accuracy: 15.318218939561257%
Iteration: 1000. Loss: 0.3728683590888977. Accuracy: 53.362986856230066%
Iteration: 2000. Loss: 1.6397678852081299. Accuracy: 36.77507665915056%
Iteration: 3000. Loss: 0.7626887559890747. Accuracy: 57.21315096074333%
Iteration: 4000. Loss: 1.0447624921798706. Accuracy: 65.49670506900608%



KeyboardInterrupt: 

Modell abspeichern

In [None]:
torch.save(model.state_dict(), 'data/roberta_state_dict_05092020.pth')

Laden

In [None]:
model_path = 'data/roberta_state_dict_05092020.pth'

In [None]:
model.load_state_dict(torch.load(model_path, map_location=device))

## Evaluation

In [None]:
def get_sentiment(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    input_msg = input_msg.cuda()
  output = model(input_msg)[0]
  _, pred_label = torch.max(output.data, 1)
  prediction=pred_label
  return prediction

In [None]:
df_test.insert(2, "predsentiment", df_test.text.map(lambda x: get_sentiment(x).item()), True)


In [None]:
df_test.predsentiment.unique()

array([1, 0, 2])

In [None]:
print(metrics.f1_score(df_test.sentiment, df_test.predsentiment, average = None,))
print(metrics.accuracy_score(df_test.sentiment, df_test.predsentiment))

# Using SimpleTransformers

In [9]:
from simpletransformers.classification import ClassificationModel

model = ClassificationModel('roberta', 'roberta-base', num_labels=3, args={
    'learning_rate':3e-5,
    'num_train_epochs': 1,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'process_count': 10,
    'train_batch_size': 48,
    'eval_batch_size': 16,
    'max_seq_length': 512,
    'n_gpu' : 16,
    'fp16': False
})

model.train_model(df_train)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=790571.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=16471.0, style=ProgressStyle(descri…



Running loss: 1.113239



Running loss: 0.171036



Running loss: 0.006994



## Evaluation

In [10]:
import numpy as np
_, model_outputs_test, _ = model.eval_model(df_test)

preds_test = np.argmax(model_outputs_test, axis=1)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=389386.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=24337.0, style=ProgressStyle(des…




In [11]:
from sklearn.metrics import f1_score, accuracy_score


print(f1_score(df_test.sentiment, preds_test, average=None))
print(accuracy_score(df_test.sentiment, preds_test))

[0.96378614 0.98706558 0.98137075]
0.9817225067157012


In [12]:
data

Unnamed: 0,text,sentiment
0,what said,1
1,plus youve added commercials to the experience...,2
2,i didnt today must mean i need to take anothe...,1
3,its really aggressive to blast obnoxious ente...,0
4,and its a really big bad thing about it,0
...,...,...
14635,thank you we got on a different flight to chicago,2
14636,leaving over minutes late flight no warnin...,0
14637,please bring american airlines to blackberry,1
14638,you have my money you change my flight and d...,0
