# This notebook is an example of using the XLM-RoBERTa model to classify sentiment from text

In [None]:
# !pip install "tensorflow == 2.8.0"
!pip install "torch == 1.10.2"

In [None]:
# base packages for this task
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
!pip install pandas
!pip install spacy_langdetect
!pip install spacy
!python3 -m spacy download en

In [None]:
# Kaggle page of the COVID-19 tweet dataset, kept as a pointer to the data source.
# NOTE(review): `url` is not read by any cell shown here; the CSV is loaded from a
# local copy under data/ -- confirm the file has been downloaded there first.
url = 'https://www.kaggle.com/datatattle/covid-19-nlp-text-classification?select=Corona_NLP_train.csv'

## -------------
## Load dataset

In [None]:
import pandas as pd

# Read the local training CSV; the file is decoded as ISO-8859-1 rather than
# pandas' default UTF-8.
df = pd.read_csv(
    'data/Corona_NLP_train.csv',
    encoding='ISO-8859-1',
)

# Preview the first rows
df.head()

## ---------------
## Language detection

In [None]:
# language detection 
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

def get_lang_detector(nlp, name):
    """Factory callback that builds the language-detection pipeline component.

    Both arguments are accepted because the factory is invoked with them,
    but neither is used: a fresh ``LanguageDetector`` is returned every time.
    """
    detector = LanguageDetector()
    return detector

# Load the en_core_web_sm pipeline that the detector component gets attached to.
nlp = spacy.load("en_core_web_sm")
# Register get_lang_detector as the factory behind the component name
# "language_detector"; this must happen before the component is added by name.
Language.factory("language_detector", func=get_lang_detector)
# Append the detector as the last step of the pipeline.
nlp.add_pipe('language_detector', last=True)

def detect_lan(text):
    """Return the language code detected for ``text``.

    The text is run through the module-level ``nlp`` pipeline and the
    'language' entry of the detector's result (``doc._.language``) is returned.
    """
    return nlp(text)._.language['language']

# Detected language of every tweet (the column is called 'nation' but holds a language code)
df['nation'] = df['OriginalTweet'].apply(detect_lan)

In [None]:
# number of tweets whose detected language is English
df.groupby('nation')['UserName'].count()['en']

In [None]:
# total number of non english samples
# Count tweets per detected language once (the original re-ran the groupby for
# every language), then add up every bucket except 'en' and the 'UNKNOWN' label.
lang_counts = df.groupby('nation')['UserName'].count()
sum(n for lang, n in lang_counts.items() if lang not in ('en', 'UNKNOWN'))

In [None]:
# Class balance check: size of 'Extremely Negative' relative to 'Positive'
sentiment_counts = df.groupby('Sentiment')['UserName'].count()
sentiment_counts['Extremely Negative'] / sentiment_counts['Positive']

## ------------------
## Text preprocessing 

In [None]:
# what a raw tweet looks like (fourth row)
df['OriginalTweet'].iloc[3]

In [None]:
# text processing 
import re
import string
def clean_text(text):
    """Normalise a raw tweet before tokenization.

    Steps, in order: lower-case the text, drop http/https links, strip ASCII
    punctuation, turn line breaks and tabs into spaces, remove any remaining
    symbol that is not a word character, space or period, and finally drop
    every word that contains a digit.

    Args:
        text: the raw tweet as a string.

    Returns:
        The cleaned string. Runs of spaces left behind by removed words are
        not collapsed.
    """
    text = text.lower()  # to lower case
    # Links must go before punctuation is stripped, otherwise "://" and "."
    # disappear and the URL degrades into a junk word. Covers http and https.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation
    # Replace line breaks / tabs with a space so the words on either side of
    # the break stay separate instead of being glued together.
    text = re.sub(r'[^\S ]+', ' ', text)
    text = re.sub(r'[^ \w\.]', '', text)  # remove leftover non-word symbols
    text = re.sub(r'\w*\d\w*', '', text)  # remove words containing numbers

    return text

In [None]:
# Cleaned copy of every tweet, stored next to the original text
df['Text'] = df['OriginalTweet'].apply(clean_text)

In [None]:
# Inspect the cleaned tweets
df['Text']

## --------------------
## Load tokenizer

In [None]:
!pip install "transformers ==4.16.2"
!pip install "sentencepiece==0.1.96"

In [None]:
# load tokens

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

# Register domain-specific words as extra tokens in the tokenizer's vocabulary
new_words = ['covid', 'coronavirus']
tokenizer.add_tokens(new_words)

# Show how each of the added words is tokenized now
for word in new_words:
    print(tokenizer.tokenize(word))

## --------------------
## Tokenized sentence

In [None]:
# tokenize the text feature 
# Encode every cleaned tweet without padding or truncation so the natural
# sequence lengths can be inspected.
tokenized_feature_raw = tokenizer.batch_encode_plus(
    df.Text.values.tolist(),
    # include the tokenizer's special tokens (<s> ... </s> for XLM-R)
    add_special_tokens=True,
)

# collect tokenized sentence length
token_sentence_length = list(map(len, tokenized_feature_raw['input_ids']))
print('max: ', max(token_sentence_length))
print('min: ', min(token_sentence_length))

# plot the distribution of tokenized lengths
import matplotlib.pyplot as plt

# Explicit Figure/Axes interface: the figure carries its own title, and the
# cell no longer echoes the return value of plt.yticks as text output.
fig, ax = plt.subplots(figsize=(20, 8))
ax.hist(token_sentence_length, rwidth=0.9)
ax.set_title('Distribution of Tokenized Sentence Length', fontsize=18)
ax.set_xlabel('Tokenized Sentence Length', fontsize=18)
ax.set_ylabel('# of Samples', fontsize=18)
ax.tick_params(axis='both', labelsize=14)
plt.show()

In [None]:
# Model input (cleaned text) and the label to predict (sentiment), as plain lists
features = df['Text'].tolist()
target = df['Sentiment'].tolist()

In [None]:
# Encode the features as fixed-length tensors for the model
MAX_LEN = 128

encode_options = dict(
    # include the tokenizer's special tokens (<s> ... </s> for XLM-R)
    add_special_tokens=True,
    # pad shorter sentences up to MAX_LEN tokens
    padding='max_length',
    # cut longer sentences down to MAX_LEN tokens
    truncation=True,
    max_length=MAX_LEN,
    # also return the mask that separates real tokens from padding
    return_attention_mask=True,
    # return PyTorch tensors
    return_tensors='pt',
)
tokenized_feature = tokenizer.batch_encode_plus(features, **encode_options)

In [None]:
# Sanity check: display the encoded batch (its 'input_ids' and 'attention_mask'
# entries are what the train/validation split consumes).
tokenized_feature

## --------------------
## Train Test split and Dataloader

In [None]:
!pip install sklearn

In [None]:
# convert label into numeric 
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the sentiment labels, then map each label to its integer id
le = LabelEncoder().fit(target)
target_num = le.transform(target)

In [None]:
# Use 80% for training and 20% for validation
from sklearn.model_selection import train_test_split
# Split ids, labels and masks together so that rows stay aligned; stratify on
# the sentiment so both parts keep the same class proportions.
(
    train_inputs, validation_inputs,
    train_labels, validation_labels,
    train_masks, validation_masks,
) = train_test_split(
    tokenized_feature['input_ids'],
    target_num,
    tokenized_feature['attention_mask'],
    test_size=0.2,
    stratify=target,
    random_state=2018,
)

In [None]:
batch_size = 16

# Datasets: each item is (input ids, attention mask, label)
train_data = TensorDataset(train_inputs, train_masks, torch.tensor(train_labels))
validation_data = TensorDataset(validation_inputs, validation_masks, torch.tensor(validation_labels))

# Training batches are drawn in random order, validation batches in fixed order
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

# DataLoaders for the training set and the validation set
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

## --------------------
## Model

In [None]:
# BertForSequenceClassification
from transformers import XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# One output class per distinct sentiment label
num_classes = len(set(target))

# XLM-RoBERTa with a sequence-classification head
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=num_classes,
    output_attentions=False,     # do not return attention weights
    output_hidden_states=False,  # do not return all hidden states
)

# # tell pytorch to run this model on GPU
# model.cuda()

# Words were added to the tokenizer, so resize the embedding matrix to the
# tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Number of training epochs
epochs = 4

# The scheduler is stepped once per batch, so the schedule has to span
# (batches per epoch) * (number of epochs) steps.
total_steps = epochs * len(train_dataloader)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Learning rate scheduler: linear schedule with no warm-up steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

## --------------------
## Train

In [None]:
# Training
import datetime
import time


def format_time(elapsed):
    """Render a duration as an ``h:mm:ss`` string.

    The training loop calls this helper for its progress and per-epoch
    timing messages, but the notebook never defined it, so a fresh run
    failed with NameError.

    Args:
        elapsed: duration in seconds (int or float).

    Returns:
        The duration rounded to the nearest second, formatted by
        ``datetime.timedelta`` (e.g. ``'0:01:05'``).
    """
    return str(datetime.timedelta(seconds=int(round(elapsed))))


# Store the average loss after each epoch
loss_values = []

# For each epoch...
for epoch_i in range(epochs):

    print('Training on epoch: ', epoch_i)

    # set start time
    t0 = time.time()

    # reset total loss
    total_loss = 0

    # put the model in training mode
    model.train()

    # loop through the batches served by train_dataloader
    for step, batch in enumerate(train_dataloader):

        # Progress update every 16 batches (the very first batch is skipped)
        if step % 16 == 0 and step != 0:
            # Time elapsed since the start of this epoch, as formatted by format_time
            elapsed = format_time(time.time() - t0)

            # Report progress
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch; the order follows the TensorDataset:
        # input ids, attention mask, labels
        b_input_ids = batch[0]
        b_input_mask = batch[1]
        b_labels = batch[2]

        # clear any previously calculated gradients
        model.zero_grad()

        # Forward pass; because labels are supplied, the loss comes back with the outputs
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # get loss (first element of the outputs)
        loss = outputs[0]

        # accumulate the loss as a plain Python number
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # clip the norm of the gradients to 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters
        optimizer.step()

        # Update learning rate
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    # Message typo fixed: "epcoh" -> "epoch"
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))