In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
import preprocessor as p

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
encoded_label_dict = {"n" : 0, "y" : 1}
def encode_label(x):
    return encoded_label_dict.get(x,-1)

In [5]:
fp = "../data/REVISION DATASET_b.xlsx"
df = pd.read_excel(fp,sheet_name="data")

In [6]:
df.head()

Unnamed: 0,fullname,is_retweet,likes,replies,retweets,text,expresses a pain point,timestamp,timestamp_epochs,tweet_id,tweet_url,user_id,username,BRAND,Type of Pain,Subjectivity,Second category,Third category,Fourth category,Dataset
0,Emmanuel Olabode,0,1,1,1,@ifemeetstech your pr team can help bridge the...,y,42176,1434841837,612397168788406016,/olabodeEO/status/612397168788406272,1955234486,olabodeEO,gap,Operational issues,,,,,Original
1,Alviniecððâ¨,0,0,0,0,mcdonalds really bein missing uhp people food ...,y,2014-09-01 23:25:17,1409613917,506583600599662592,/ohhamazing/status/506583600599662592,2723652417,ohhamazing,mcdonalds,Product feature or quality,,,,,Original
2,Bobby,0,0,0,0,"if they thought that little of him, why was he...",y,43720,1568242832,1171921495309910016,/Bobbythegreat/status/1171921495309914112,33740752,Bobbythegreat,gap,Product feature or quality,,,,,Original
3,Elgen Bodenstien,0,0,0,0,when towns have locally owned business capita...,y,2019-09-05 23:34:22,1567726462,1169755684122086912,/bodenstien/status/1169755684122087424,1167585352464424960,bodenstien,walmart,Company's image,,,,,Original
4,Robyn,0,3,1,1,@arma_vancouver health information in records ...,y,2019-01-23 22:43:08,1548283388,1088205518886190976,/RobynCBird/status/1088205518886191104,2206030555,RobynCBird,fitbit,Company's image,,,,,Original


In [7]:
df = df[["text","expresses a pain point"]]

In [8]:
df["target"] = df["expresses a pain point"].apply(lambda x: encode_label(x))

In [9]:
def preprocess(txt):
    return p.clean(txt)

In [10]:
df["clean_text"] = df["text"].apply(lambda x: preprocess(x))

In [11]:
df.head()

Unnamed: 0,text,expresses a pain point,target,clean_text
0,@ifemeetstech your pr team can help bridge the...,y,1,your pr team can help bridge the communication...
1,mcdonalds really bein missing uhp people food ...,y,1,mcdonalds really bein missing uhp people food ...
2,"if they thought that little of him, why was he...",y,1,"if they thought that little of him, why was he..."
3,when towns have locally owned business capita...,y,1,when towns have locally owned business capital...
4,@arma_vancouver health information in records ...,y,1,health information in records means vulnerabil...


In [12]:
df.target.value_counts()

0    3348
1    1252
Name: target, dtype: int64

In [13]:
model_name = "roberta-base"
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 1e-05

In [14]:
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [15]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.clean_text[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.target[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [16]:
train, valid_test = train_test_split(df, test_size=0.2, shuffle=True, stratify=None, random_state=2021)
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, stratify=None, random_state=2021)

In [17]:
# Creating the dataset and dataloader
train_dataset = train.reset_index(drop=True)
valid_dataset = valid.reset_index(drop=True)
test_dataset = test.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALID Dataset: {}".format(valid_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
validating_set = Triage(valid_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (4600, 4)
TRAIN Dataset: (3680, 4)
VALID Dataset: (460, 4)
TEST Dataset: (460, 4)


In [18]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
validating_loader = DataLoader(validating_set, **valid_params)
testing_loader = DataLoader(testing_set, **test_params)

In [19]:
model = RobertaForSequenceClassification.from_pretrained(model_name)
model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [20]:
# Creating the optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [21]:
# Function to calcuate the accuracy of the model
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [22]:
# Defining the training function on the 80% of the dataset for tuning the roberta model
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        
        optimizer.zero_grad()
        outputs = model(ids, attention_mask=mask, labels=targets)
        loss = outputs.loss
        logits = outputs.logits
        tr_loss += loss
        big_val, big_idx = torch.max(logits, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _!=0 and _%100==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 100 steps: {loss_step}")
            print(f"Training Accuracy per 100 steps: {accu_step}")

        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [23]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0
    n_wrong = 0
    total = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, attention_mask=mask, labels=targets)
            loss = outputs.loss
            logits = outputs.logits
            tr_loss += loss
            big_val, big_idx = torch.max(logits, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _!=0 and _%100==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu


In [24]:
tokenizer.pad_token_id

1

In [25]:
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 100 steps: 0.5815014839172363
Training Accuracy per 100 steps: 73.14356435643565
Training Loss per 100 steps: 0.541479229927063
Training Accuracy per 100 steps: 75.24875621890547
Training Loss per 100 steps: 0.5160611271858215
Training Accuracy per 100 steps: 75.53986710963456
Training Loss per 100 steps: 0.49439898133277893
Training Accuracy per 100 steps: 76.74563591022444
The Total Accuracy for Epoch 0: 77.77173913043478
Training Loss Epoch: 0.48072177171707153
Training Accuracy Epoch: 77.77173913043478


In [26]:
acc = valid(model, validating_loader)
print("Accuracy on validation data = %0.2f%%" % acc)



Validation Loss Epoch: 0.3472824692726135
Validation Accuracy Epoch: 84.34782608695652
Accuracy on validation data = 84.35%


In [27]:
acc = valid(model, testing_loader)
print("Accuracy on Test data = %0.2f%%" % acc)



Validation Loss Epoch: 0.36179542541503906
Validation Accuracy Epoch: 84.78260869565217
Accuracy on Test data = 84.78%


In [54]:
# Save the model
output_model_file = '../models/ft-roberta-ep1.pt'

model_to_save = model
torch.save(model_to_save, output_model_file)

print('All files saved')

All files saved


#### Inference

In [10]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import torch

In [11]:
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [12]:
model = torch.load('../../data/classification/models/ft-roberta-amazonreviews.pt')

In [28]:
def predict(query, model, tokenizer, device="cuda"):
    tokens = tokenizer.encode(query)
    all_tokens = len(tokens)
    tokens = tokens[:tokenizer.model_max_length - 2]
    used_tokens = len(tokens)
    tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
    mask = torch.ones_like(tokens)

    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)

    fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real

In [38]:
query = """This is bad.
"""
predict(query,model,tokenizer)

0.02814316563308239

In [48]:
preds, preds_probas = [],[]
for i, row in test_dataset.iterrows():
    query = row["clean_text"]
    pred = predict(query,model,tokenizer)
    preds_probas.append(pred)
    if pred >= 0.5:
        preds.append(1)
    else:
        preds.append(0)

In [49]:
from sklearn.metrics import confusion_matrix
y_true = test_dataset.target.values
y_pred = preds
confusion_matrix(y_true,y_pred)

array([[304,  28],
       [ 45,  83]])

In [50]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
acc = accuracy_score(y_true,y_pred)
precision = precision_score(y_true,y_pred)
recall = recall_score(y_true,y_pred)

In [51]:
print(f"Accuracy: {acc*100}; Precision:{precision*100}; Recall:{recall*100}")

Accuracy: 84.1304347826087; Precision:74.77477477477478; Recall:64.84375


In [52]:
print(classification_report(y_true, y_pred, target_names=["n","y"]))

              precision    recall  f1-score   support

           n       0.87      0.92      0.89       332
           y       0.75      0.65      0.69       128

    accuracy                           0.84       460
   macro avg       0.81      0.78      0.79       460
weighted avg       0.84      0.84      0.84       460

