# RoBERTa for Sentiment Analysis

# Imports and installs

Imports for SimpleTransformers

In [2]:
import os
# Higher versions have problems with CUDA
!pip install transformers==2.11.0
!pip install simpletransformers==0.41.1
!git clone https://github.com/NVIDIA/apex
os.chdir('apex')
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
os.chdir('..')

Collecting transformers==2.11.0
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 7.1MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 27.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 42.9MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K 

Imports for regular transformers

In [None]:
!pip install pytorch-transformers

Collecting pytorch-transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[K     |████████████████████████████████| 176 kB 191 kB/s eta 0:00:01
Installing collected packages: pytorch-transformers
Successfully installed pytorch-transformers-1.2.0


In [1]:
import nltk
import string
import re
import time
import numpy as np
import pandas as pd

import json
from tqdm.notebook import tqdm
from uuid import uuid4


from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split


In [None]:

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

## PyTorch Transformer
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

In [None]:
## Check if Cuda is Available
print(torch.cuda.is_available())

True


# Einlesen

Apple Sentiment

In [2]:
data = pd.read_csv("data/sentiment/datasets_652925_1154930_apple-twitter-sentiment-texts.csv")

data.sentiment = data.sentiment.apply(lambda x: x + 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


## US Airline Sentiment

In [4]:
data = pd.read_csv("data/sentiment/Tweets.csv")

data = data[['text', 'airline_sentiment']]
data.rename({'airline_sentiment' : 'sentiment'}, inplace=True)

thisdict =	{
  "negative": 0,
  "neutral": 1,
  "positive": 2
}
data.sentiment = data.sentiment.apply(lambda x: thisdict[x])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


## T4SA

In [None]:
tweets = pd.read_csv("data/sentiment/raw_tweets_text.csv")
sentiments = pd.read_csv("data/sentiment/t4sa_text_sentiment.csv",delimiter = "\t")

tweets.set_index(tweets.id, inplace=True)
sentiments.set_index(sentiments.TWID, inplace=True)
data=tweets.join(sentiments)
data.dropna(inplace=True)
data.drop(columns=['id', 'TWID'], inplace=True)
data["sentiment"] = data[['NEU', 'NEG', 'POS']].idxmax(axis=1)

data = data[['text', 'sentiment']]

thisdict =	{
  "NEG": 0,
  "NEU": 1,
  "POS": 2
}
data.sentiment = data.sentiment.apply(lambda x: thisdict[x])


General Text Cleaning

In [None]:
data.text = data.text.str.lower()

data.text = data.text.apply(lambda x:re.sub(r'http\S+', '', x))

tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
data.text = data.text.apply(lambda x: tokenizer.tokenize(x))

data.text = data.text.apply(lambda x: ' '.join(x))

data.text = data.text.map(lambda x : x.translate(str.maketrans('', '', string.punctuation)))

data.text = data.text.str.replace("[0-9]", " ")

data.text = data.text.str.strip(string.whitespace)

df_train, df_test = train_test_split(data, test_size=0.33, random_state=42)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
df_train

0

# RoBERTa Configuration
Only for non simple Training

Tokenization

In [None]:
config = RobertaConfig.from_pretrained('roberta-base')
# Set number of output labels
config.num_labels = 3
config

100%|██████████| 481/481 [00:00<00:00, 289075.83B/s]


{
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 1,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification(config)

100%|██████████| 898823/898823 [00:01<00:00, 821255.61B/s]
100%|██████████| 456318/456318 [00:00<00:00, 500351.33B/s]


In [None]:
def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [None]:
msg = "My dog is cute!"
prepare_features(msg)

(tensor([[    0,  1308,  2335,    16, 11962,   328,     2]]),
 [1, 1, 1, 1, 1, 1, 1])

In [None]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        text = self.data.text[index]
        label = self.data.sentiment[index]
        X, _  = prepare_features(text)
        y = label
        #y = label_to_ix[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [None]:
#train_size = 0.8
#train_dataset=dataset.sample(frac=train_size,random_state=200).reset_index(drop=True)
#test_dataset=dataset.drop(train_dataset.index).reset_index(drop=True)

In [None]:
training_set = Intents(df_train)
testing_set = Intents(df_test)

In [None]:
training_set.__getitem__(2)[0].shape

AttributeError: 'DataFrame' object has no attribute 'airline_sentiment'

In [None]:
model(training_set.__getitem__(0)[0])

Create model

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()

In [None]:
# Parameters
params = {'batch_size': 1,
          'shuffle': True,
          'drop_last': False,
          'num_workers': 1}

In [None]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [None]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [None]:
## Test Forward Pass
inp = training_set.__getitem__(1002)[0].cuda()
output = model.forward(inp)[0]
print(output)
print(inp)


print(torch.max(output, 1))

# Training (non simple)

In [None]:
max_epochs = 5
model = model.train()
for epoch in tqdm(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%1000 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

EPOCH -- 0
Iteration: 0. Loss: 1.0941996574401855. Accuracy: 31.226765799256505%
Iteration: 100. Loss: 0.9253912568092346. Accuracy: 50.74349442379182%
Iteration: 200. Loss: 5.842207908630371. Accuracy: 40.14869888475837%
Iteration: 300. Loss: 4.406252861022949. Accuracy: 17.286245353159853%
Iteration: 400. Loss: 12.58192253112793. Accuracy: 50.74349442379182%
Iteration: 500. Loss: 11.721990585327148. Accuracy: 7.992565055762082%
Iteration: 600. Loss: 2.538684368133545. Accuracy: 47.026022304832715%
Iteration: 700. Loss: 1.0911545753479004. Accuracy: 10.037174721189592%
Iteration: 800. Loss: 0.00029659271240234375. Accuracy: 50.74349442379182%
Iteration: 900. Loss: 3.524245023727417. Accuracy: 49.62825278810409%
Iteration: 1000. Loss: 0.36021649837493896. Accuracy: 46.09665427509294%
EPOCH -- 1
Iteration: 0. Loss: 0.1635899543762207. Accuracy: 48.698884758364315%
Iteration: 100. Loss: 0.0005340576171875. Accuracy: 41.82156133828996%
Iteration: 200. Loss: 3.6338376998901367. Accuracy: 5

Modell abspeichern

In [None]:
torch.save(model.state_dict(), '/content/drive/My Drive/Colab Notebooks/data/roberta_state_dict_05092020.pth')

Laden

In [None]:
model_path = '/content/drive/My Drive/Colab Notebooks/data/roberta_state_dict_05092020.pth'

In [None]:
model.load_state_dict(torch.load(model_path, map_location=device))

## Evaluation

In [None]:
def get_sentiment(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    input_msg = input_msg.cuda()
  output = model(input_msg)[0]
  _, pred_label = torch.max(output.data, 1)
  prediction=pred_label
  return prediction

In [None]:
df_test.insert(2, "predsentiment", df_test.text.map(lambda x: get_sentiment(x).item()), True)


In [None]:
df_test.predsentiment.unique()

array([1, 0, 2])

In [None]:
print(metrics.f1_score(df_test.sentiment, df_test.predsentiment, average = None,))
print(metrics.accuracy_score(df_test.sentiment, df_test.predsentiment))

[0.58421053 0.         0.        ]
0.41263940520446096


# Using SimpleTransformers

In [None]:
from simpletransformers.classification import ClassificationModel

model = ClassificationModel('roberta', 'roberta-base', num_labels=3, args={
    'learning_rate':3e-5,
    'num_train_epochs': 10,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'process_count': 10,
    'train_batch_size': 4,
    'eval_batch_size': 4,
    'max_seq_length': 512,
    'fp16': True
})

model.train_model(df_train)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, max=9808.0), HTML(value='')))


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=2452.0, style=ProgressStyle(descrip…

Running loss: 1.172263



Running loss: 0.884008



Running loss: 1.069536Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Running loss: 1.325565Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Running loss: 0.628085Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Running loss: 1.003115Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Running loss: 0.492915Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2048.0
Running loss: 0.854714Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1024.0
Running loss: 0.018200

## Evaluation

In [4]:
import numpy as np
_, model_outputs_test, _ = model.eval_model(df_test)

preds_test = np.argmax(model_outputs_test, axis=1)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=538.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=135.0, style=ProgressStyle(descr…




In [6]:
from sklearn.metrics import f1_score, accuracy_score


print(f1_score(df_test.sentiment, preds_test, average=None))
print(accuracy_score(df_test.sentiment, preds_test))

[0.90789474 0.89552239 0.71428571]
0.8866171003717472
