<a href="https://colab.research.google.com/github/koleshjr/Swahili_News_Nlp/blob/main/Vaccinate_Training_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## TO VACCINATE OR NOT VACCINATE

In [1]:
# Mount Google Drive (Colab only); the CSV files are read from /content/gdrive/MyDrive/ below.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# !pip install transformers 


In [3]:
import numpy as np 
import pandas as pd 
import os, random, sys, time, re, copy, string

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as D
from torch.nn.utils.rnn import pad_sequence

# K-Fold spliter
from sklearn.model_selection import StratifiedKFold, KFold

import warnings
warnings.filterwarnings('ignore')

# Transformers library
from transformers import *

In [4]:
# Directory that holds the competition CSV files.
DATA_PATH = '/content/gdrive/MyDrive/vaccinate/'

# Pretrained checkpoint instantiated by the inference cell.
# The previous value ('../input/roberta-transformers-pytorch/roberta-base') is a
# Kaggle dataset path; the tokenizer and training cells load 'roberta-base', so
# use the same identifier here to keep the architecture consistent.
MODEL_PATH = 'roberta-base'

# Prefix of the per-fold checkpoint files ('<MODEL_NAME>_fold_XX.pt').
MODEL_NAME = 'optimus_prime'
VOCAB_PATH = MODEL_PATH

N_FOLDS = 5
EPOCHES = 5
BATCH_SIZE = 24
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MAX_SEQUENCE_LENGTH = 112
LR = 2e-5

# error log: from here on everything written to sys.stderr goes to err.txt
sys.stderr = open('err.txt', 'w')

In [5]:
# Seed every random number generator the notebook relies on.
SEED = 7117
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
for _seed_torch in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    _seed_torch(SEED)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

### Loading the Dataset

In [6]:
# Read the training and test tweets from the data directory.
# `path` reuses DATA_PATH from the config cell instead of repeating the literal,
# so the location only has to be changed in one place.
path = DATA_PATH
train = pd.read_csv(path + 'Train.csv')
test = pd.read_csv(path + 'Test.csv')

display(train.head(), train.shape, test.head(), test.shape)

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


(10001, 4)

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


(5177, 2)

In [7]:
# Label distribution of the raw training data (before any rows are dropped).
train['label'].value_counts()

 0.000000    4908
 1.000000    4053
-1.000000    1038
 0.666667       1
Name: label, dtype: int64

Drop incomplete rows. This also removes the single row labelled 0.666667, which is most likely an outlier.

In [8]:
# Drop rows with missing values.
train = train.dropna()
# Keep only the three valid labels explicitly. dropna() removes the 0.666667 row
# only because that row happens to contain a missing value as well.
# reset_index gives a gap-free 0..n-1 index, so positional indices (e.g. from
# the K-fold splitter) and index labels refer to the same rows.
train = train[train['label'].isin([-1, 0, 1])].reset_index(drop=True)
train['label'].value_counts()

 0.0    4908
 1.0    4053
-1.0    1038
Name: label, dtype: int64

In [9]:
# Count the remaining missing values per column.
train.isnull().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

In [10]:
# Distinct label values left in the training set.
train['label'].unique()

array([ 0.,  1., -1.])

In [11]:
# Distribution of the `agreement` column.
train['agreement'].value_counts()

1.000000    5866
0.666667    3894
0.333333     239
Name: agreement, dtype: int64

So we treat this as a regression problem: the model is trained to predict the label as a single continuous value.


### Text Cleaning

In [12]:
## Helpers that remove URLs, emojis, HTML tags/entities and punctuation
def remove_URL(text):
    """Return ``text`` with http(s):// links and www.-prefixed addresses removed."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    The previous character class contained the range U+24C2..U+1F251, which
    spans CJK, Hangul and most of the Basic Multilingual Plane, so ordinary
    non-emoji text was deleted too. That range is replaced by the specific
    emoji-related blocks listed below.
    """
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U0001F900-\U0001F9FF'  # supplemental symbols & pictographs
        u'\U00002600-\U000027BF'  # miscellaneous symbols + dingbats
        u'\U0001F200-\U0001F251'  # enclosed ideographic supplement
        u'\U000024C2'             # circled latin capital letter M
        u'\U0000FE0F'             # variation selector-16 (emoji presentation)
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    """Return ``text`` without ``<...>`` tags and ``&name;`` / ``&#N;`` / ``&#xH;`` entities."""
    tag_or_entity = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.compile(tag_or_entity).sub('', text)


def remove_punct(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed."""
    return ''.join(ch for ch in text if ch not in string.punctuation)

# Clean the training tweets. Punctuation is stripped last because the URL and
# HTML patterns rely on characters ('/', '<', '>', '&', ';') that it removes.
train['safe_text'] = (
    train['safe_text']
    .apply(remove_URL)
    .apply(remove_emoji)
    .apply(remove_html)
    .apply(remove_punct)
)

In [13]:
# Preview the test tweets (not cleaned yet at this point).
test.head()

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [14]:
# Column dtypes and non-null counts of the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5177 entries, 0 to 5176
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   5177 non-null   object
 1   safe_text  5176 non-null   object
dtypes: object(2)
memory usage: 81.0+ KB


In [15]:
# The test set has one missing tweet. Replace it with an empty string before the
# string cast; astype('str') alone would turn NaN into the literal text 'nan'.
# Then apply the same cleaning steps, in the same order, as for the training text.
test['safe_text'] = (
    test['safe_text']
    .fillna('')
    .astype('str')
    .apply(remove_URL)
    .apply(remove_emoji)
    .apply(remove_html)
    .apply(remove_punct)
)

In [16]:
# Submission template; the inference cell writes the predictions into it.
subm = pd.read_csv(f'{path}SampleSubmission.csv')

# Fold-creation reference: https://www.kaggle.com/abhishek/step-1-create-folds
# (the target-binning step from that kernel is left disabled below)
# df_size = train.shape[0]
# num_bins = int(np.floor(1 + np.log2(df_size)))
# # bin targets
# y = pd.cut(train["label"], bins=num_bins, labels=False)

# Shuffled, seeded stratified splitter shared by all folds.
cv = StratifiedKFold(N_FOLDS, shuffle=True, random_state=SEED)

In [17]:
# Tokenizer that matches the 'roberta-base' checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# NOTE: the module-level `AutoModel.from_pretrained('roberta-base', num_labels=1)`
# that used to be loaded here was removed. It was never used (training and
# inference build their own models) and only consumed memory.


def _encode(text):
    """Tokenize one tweet, truncating it to MAX_SEQUENCE_LENGTH tokens."""
    return tokenizer(text, truncation=True, max_length=MAX_SEQUENCE_LENGTH)


# Tokenize once up front; each cell of `token` holds the encoding
# (input_ids, attention_mask) of the corresponding tweet.
train['token'] = train.safe_text.apply(_encode)
test['token'] = test.safe_text.apply(_encode)



In [18]:
# Preview the cleaned test frame with the new `token` column.
test.head()

Unnamed: 0,tweet_id,safe_text,token
0,00BHHHP1,4 a vaccine given 2 healthy peeps FDA thin...,"[input_ids, attention_mask]"
1,00UNMD0E,Students starting school without whooping coug...,"[input_ids, attention_mask]"
2,01AXPTJF,Im kinda over every ep of being ripped from t...,"[input_ids, attention_mask]"
3,01HOEQJW,How many innocent children die for lack of vac...,"[input_ids, attention_mask]"
4,01JUKMAO,CDC eyeing bird flu vaccine for humans though ...,"[input_ids, attention_mask]"


In [19]:
class LitDataset(D.Dataset):
    """Dataset pairing pre-tokenized texts with a per-sample value.

    Args:
        token: sequence (e.g. a pandas Series) of tokenizer encodings exposing
            ``input_ids`` and ``attention_mask`` attributes.
        target: sequence of the same length; the label for training data or an
            identifier (such as the tweet id) for test data.

    Items are looked up by *position*. Plain ``series[idx]`` is label-based, so
    on a Series whose index has gaps (for example after ``dropna()``) the
    positional indices produced by ``Subset`` / ``DataLoader`` would raise
    ``KeyError`` or fetch the wrong row.
    """

    def __init__(self, token, target):
        self.token = token
        self.target = target

    def __len__(self):
        return len(self.token)

    @staticmethod
    def _at(seq, idx):
        """Return the element at position ``idx`` (pandas objects via ``.iloc``)."""
        return seq.iloc[idx] if hasattr(seq, 'iloc') else seq[idx]

    def __getitem__(self, idx):
        encoding = self._at(self.token, idx)
        return (torch.tensor(encoding.input_ids),
                torch.tensor(encoding.attention_mask),
                self._at(self.target, idx))
    
def collate_fn(batch):
    """Collate ``(input_ids, attention_mask, target)`` samples into padded batch tensors.

    Returns:
        ``(ids, attns, targets)`` on ``DEVICE``; ``ids`` and ``attns`` are padded
        to the longest sequence in the batch, ``targets`` is a float tensor.
    """
    ids, attns, targets = zip(*batch)
    ids = pad_sequence(ids, batch_first=True, padding_value=tokenizer.pad_token_id).to(DEVICE)
    # Padded positions must be 0 in the attention mask so the model ignores them.
    # Padding the mask with pad_token_id is wrong: RoBERTa's pad id is 1, which
    # would mark the padding as real tokens.
    attns = pad_sequence(attns, batch_first=True, padding_value=0).to(DEVICE)
    targets = torch.tensor(targets).float().to(DEVICE)
    return ids, attns, targets

def collate_fn_test(batch):
    """Collate ``(input_ids, attention_mask, sample_id)`` samples for inference.

    Returns:
        ``(idxs, ids, attns)``: the tuple of sample ids, followed by the padded
        token ids and attention mask on ``DEVICE``.
    """
    ids, attns, idxs = zip(*batch)
    ids = pad_sequence(ids, batch_first=True, padding_value=tokenizer.pad_token_id).to(DEVICE)
    # Mask padding with 0, not pad_token_id (1 for RoBERTa), so padded
    # positions are not attended to.
    attns = pad_sequence(attns, batch_first=True, padding_value=0).to(DEVICE)
    return idxs, ids, attns


In [20]:
# Training samples carry the label; test samples carry the tweet id so that
# predictions can be matched back to their rows.
ds = LitDataset(token=train.token, target=train.label)
test_ds = LitDataset(token=test.token, target=test.tweet_id)

# Unshuffled loader over the test set.
tloader = D.DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn_test,
    num_workers=0,
)

In [21]:
### Layout of the per-epoch results table
header = r'''
            Train         Validation
Epoch |  MSE  |  RMSE |  MSE  |  RMSE | Time, m
'''
# One row: epoch number, four metric columns, elapsed minutes.
raw_line = '\u2502'.join(['{:6d}'] + ['{:7.3f}'] * 4 + ['{:6.2f}'])

In [22]:
@torch.no_grad()
def validation_fn(model, loader, loss_fn):
    """Return the mean loss of ``model`` over every sample in ``loader``.

    Args:
        model: called as ``model(ids, attention_mask=mask)``; its output must
            expose ``.logits``.
        loader: iterable of ``(ids, attention_mask, target)`` batches.
        loss_fn: loss that returns the batch *mean* (e.g. ``torch.nn.MSELoss()``).

    Returns:
        float: sample-weighted mean loss, or NaN for an empty loader.

    Each batch loss is weighted by its batch size. A plain average of the
    per-batch means over-weights the (usually smaller) last batch. The model
    is left in eval mode.
    """
    model.eval()
    total_loss, n_samples = 0.0, 0
    for texts, attns, target in loader:
        outputs = model(texts, attention_mask=attns)
        loss = loss_fn(outputs.logits.squeeze(-1), target)
        batch_size = len(target)
        total_loss += loss.item() * batch_size
        n_samples += batch_size
    return total_loss / n_samples if n_samples else float('nan')

def oof_train(ds, cv, y, epochs = EPOCHES):
    """Fine-tune one RoBERTa regression model per CV fold and checkpoint it.

    Args:
        ds: dataset whose items are ``(input_ids, attention_mask, target)``.
        cv: splitter exposing ``split(X, y)`` (e.g. ``StratifiedKFold``).
        y: per-sample labels handed to ``cv.split``.
        epochs: number of training epochs per fold.

    Returns:
        list of float: best validation RMSE of each fold, in fold order.

    Side effects:
        Prints a per-epoch metrics table and writes
        ``{MODEL_NAME}_fold_{fold:02}.pt`` with the keys ``'model'`` (weights
        after the last epoch), ``'best_model'`` (weights of the epoch with the
        lowest validation RMSE) and ``'best_metric'``.
    """
    loss_fn = torch.nn.MSELoss()
    fold_scores = []

    for fold, (train_idx, valid_idx) in enumerate(cv.split(range(len(ds)), y)):

        train_ds = D.Subset(ds, train_idx)
        loader = D.DataLoader(train_ds, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_fn, num_workers=0)

        valid_ds = D.Subset(ds, valid_idx)
        vloader = D.DataLoader(valid_ds, batch_size=BATCH_SIZE,
                               shuffle=False, collate_fn=collate_fn, num_workers=0)

        # A sequence-classification head with num_labels=1 gives one regression
        # output per sample in `.logits`. The bare AutoModel used before has no
        # head and no `.logits`, and its weights could not be loaded by the
        # inference cell, which builds AutoModelForSequenceClassification.
        model = AutoModelForSequenceClassification.from_pretrained(
            'roberta-base', num_labels=1).to(DEVICE)

        optimizer = optim.AdamW(model.parameters(), LR,
                                betas=(0.9, 0.999), weight_decay=1e-1)
        # Constant learning rate after 35 warm-up steps.
        scheduler = get_constant_schedule_with_warmup(optimizer, 35)
        print(header)

        # init state; deepcopy so the snapshot does not alias the live weights
        best_metric = np.inf
        best_model = copy.deepcopy(model.state_dict())

        for epoch in range(1, epochs+1):
            start_time = time.time()
            tloss = []
            model.train()

            for texts, attns, target in loader:
                optimizer.zero_grad()
                outputs = model(texts, attention_mask=attns)
                loss = loss_fn(outputs.logits.squeeze(-1), target)
                tloss.append(loss.item())
                loss.backward()
                optimizer.step()
                scheduler.step()
            # Release the last batch's graph before validation.
            loss = outputs = None

            tloss = np.array(tloss).mean()
            vloss = validation_fn(model, vloader, loss_fn)
            tmetric = tloss**.5
            vmetric = vloss**.5
            elapsed_min = (time.time() - start_time) / 60
            print(raw_line.format(epoch, tloss, tmetric, vloss, vmetric, elapsed_min))

            if best_metric > vmetric:
                best_metric = vmetric
                best_model = copy.deepcopy(model.state_dict())

        # Save final state to the checkpoint
        filename = f'{MODEL_NAME}_fold_{fold:02}.pt'
        checkpoint = {
            'model' : model.state_dict(),
            'best_model' : best_model,
            # plain float so the checkpoint does not embed a NumPy scalar
            'best_metric' : float(best_metric),
        }
        torch.save(checkpoint, filename)
        fold_scores.append(float(best_metric))

        # Drop everything that holds GPU memory before the next fold.
        del model, optimizer, scheduler, checkpoint, best_model
        del vloader, loader, train_ds, valid_ds
        torch.cuda.empty_cache()

    return fold_scores



In [28]:

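# Training is disabled here; uncomment the two lines below to (re)train.
# The inference cell expects the per-fold checkpoint files written by oof_train.
# y = train['label']
# oof_train(ds, cv, y, epochs = EPOCHES)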




### Inference


In [None]:
# Build the architecture once; its weights are replaced by each fold's checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
                  MODEL_PATH, num_labels=1).to(DEVICE)

test_ids = []      # tweet ids in loader order (the same for every fold)
test_preds = None  # running mean of the per-fold predictions

for fold in range(N_FOLDS):

    filename = f'{MODEL_NAME}_fold_{fold:02}.pt'
    # map_location lets checkpoints saved on GPU load on a CPU-only runtime.
    checkpoint = torch.load(filename, map_location=DEVICE)
    # Use the weights of the best validation epoch, not the last-epoch ones.
    model.load_state_dict(checkpoint['best_model'])
    model.eval()
    del checkpoint

    # Get prediction for test set
    ids, preds = [], []
    with torch.no_grad():
        for batch_ids, texts, attn in tloader:
            outputs = model(texts, attention_mask=attn)
            ids += batch_ids
            preds.append(outputs.logits.squeeze(-1).cpu().numpy())

    preds = np.concatenate(preds) / N_FOLDS
    test_ids = ids
    test_preds = preds if test_preds is None else test_preds + preds

# Save prediction of test set.
# `ids` are tweet ids, so the template must be indexed by tweet id for the
# label-based assignment below; on a default RangeIndex it raises KeyError.
# NOTE(review): assumes SampleSubmission.csv stores the ids in a 'tweet_id'
# column (as Test.csv does) and that 'target' is the expected prediction
# column name -- confirm against the competition's submission format.
if 'tweet_id' in subm.columns:
    subm = subm.set_index('tweet_id')
subm.loc[test_ids, 'target'] = test_preds

# Save to the file (the tweet-id index is written as the first column)
subm.to_csv('submission.csv')

In [None]:
# clean saves
# Deletes the per-fold checkpoint files (*.pt) and the err.txt log that the
# config cell opened for sys.stderr.
!rm -r *.pt
!rm err.txt