In [2]:
import torch
import pandas as pd
from tqdm.notebook import tqdm
import os
import sys
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
import numpy as np
from sklearn.metrics import f1_score
import random
import torch.nn.utils.prune
import re
import os



In [3]:
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [4]:
def process_tweet(tweet):

    tweet2 = re.sub(r'^RT[\s]+','', tweet)
   
    #remove hyperlinks
    tweet2 = re.sub(r'https?://[^\s\n\r]+', '', tweet2)
    
    #remove hashtag by removing the hast #sign from the word
    tweet2 = re.sub(r'#','',tweet2)
    
    
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    
    #convert into lower case
    tweets_clean = tweet.lower()

    return tweets_clean

In [6]:
data = pd.read_csv("../data/processed/sentiment_label_data_sample.csv")

In [7]:
#cleaning the tweets 
data["tweets"] = data["text"].map(lambda x : process_tweet(x))

In [8]:
data["tweets"].sample(10)

54750    holy shit. just saw the surgeon who did my las...
73756            thoughts are with the af crew &amp;  pax 
93620    i just got a sweet little add on for fire fox....
79969    @momspective that smartfood better be good   o...
74747    is bored in work! and its raining again! team ...
35591    melanie, i cant explain in words how excited i...
59                        @leslie_knope u will beat oprah 
36260    it's just one of those days where i want my fa...
33577                                i have a stiff neck  
17863    wow, a high of 64 today. i love this weather i...
Name: tweets, dtype: object

In [9]:
#splitting the dataset into training and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data.index.values, data.sentiment.values, test_size = 0.333, random_state=0)

In [10]:
data["data_type"] = ['not_set']*data.shape[0]

In [11]:
data.loc[X_train, 'data_type'] = 'train'
data.loc[X_test, 'data_type'] = 'test'

In [12]:
data.sample(10)

Unnamed: 0.1,Unnamed: 0,sentiment,text,tweets,data_type
65308,1087359,1,Hello all ... (I won't do this often) ... Just...,hello all ... (i won't do this often) ... just...,test
17388,1411890,1,gotta work... off for now.,gotta work... off for now.,train
32442,941765,1,"Oh, shoot. And I don't have a ride to or from ...","oh, shoot. and i don't have a ride to or from ...",train
28526,1364753,1,"Getting Prepared for the next 3 weeks, School ...","getting prepared for the next 3 weeks, school ...",train
28704,658450,0,Tears x,tears x,test
83822,1000349,1,@helga_hansen http://twitpic.com/5of2w - Oh my...,@helga_hansen http://twitpic.com/5of2w - oh my...,train
7279,708697,0,so bored! I is housebound for the weekend x,so bored! i is housebound for the weekend x,test
13611,37726,0,@MsHollyOlly3 hahah i have journalism,@mshollyolly3 hahah i have journalism,train
70579,920192,1,"Happy Mothers Day to all Mothers in America, o...","happy mothers day to all mothers in america, o...",train
63222,1058361,1,LOOOOK LOOK!!!! 50 folllowers!!!! yayyy!!!! th...,looook look!!!! 50 folllowers!!!! yayyy!!!! th...,train


In [13]:
# Encoding the data

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
encoded_data_train = tokenizer.batch_encode_plus(
    data[data.data_type=='train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    return_tensors='pt'
                            
)

encoded_data_test = tokenizer.batch_encode_plus(
    data[data.data_type=='test'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    return_tensors='pt'
                            
)

In [15]:
label_array = data["sentiment"].unique()
label_dict = dict(enumerate(label_array.flatten(), 1))
label_dict

{1: 0, 2: 1}

In [16]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(data[data.data_type=='train'].sentiment.values)

# test dataset
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(data[data.data_type=='test'].sentiment.values)

In [17]:
# Dividing datasets into train and test
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_test = TensorDataset(input_ids_test,attention_masks_test, labels_test)

In [18]:
#Setting up BERT pretrained model
from transformers import BertForSequenceClassification

In [19]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', 
    num_labels = len(label_dict),
    output_attentions=False,
    output_hidden_states=False
    
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [21]:
batch_size = 4

dataloader_train = DataLoader(
   dataset_train, 
   sampler=RandomSampler(dataset_train),
   batch_size=batch_size)

dataloader_test = DataLoader(
   dataset_test, 
   sampler=RandomSampler(dataset_test),
   batch_size=batch_size)

In [22]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [23]:
#setting hyper parameters
optimizer = torch.optim.AdamW(
 model.parameters(),
    lr=1e-5,
    eps=1e-8   
)

In [24]:
epochs = 10
scheduler = get_linear_schedule_with_warmup(
 optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train)*epochs)

In [25]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

In [26]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [27]:
def accuracy(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    preds_flat = np.argmax(preds, axis=1).flatten()
    lables_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = pred_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

## Creating our Training Loop

In [28]:
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cpu


In [30]:
def evaluate(dataloader_test):
    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_test:
        
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]
                 }
        with torch.no_grad():
            outputs = model(**inputs)
            
            loss = outputs[0]
            logits = outputs[1]
            loss_val_total += loss.item()
            
            logits = logits.detach().cpu().numpy()
            label_ids = inputs['labels'].cpu().numpy()
            predictions.append(logits)
            true_vals.append(label_ids)
        
        loss_val_avg = loss_val_total/len(dataloader_test)
        
        predictions = np.concatenate(predictions, axis=0)
        true_vals = np.concatenate(true_vals, axis=0)
        
        return loss_val_avg, predictions, true_vals
                              

In [None]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train,
                        desc='epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)
    for batch in progress_bar:
        
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {
            'input_ids' : batch[0],
            'attention_mask' : batch[1],
            'labels' : batch[2]
        }
        outputs = model(**inputs)
        
        optimizer.zero_grad()
        loss= outputs[0]
        loss_train_total += loss.item()
        loss.backward()
        
        
        optimizer.step()
        
                                   
    print('training_loss : ',(loss.item()/len(batch)))    
    torch.save(model.state_dict(), f'BERT_ft_epoch{epoch}.model')    
    tqdm.write('\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    
    val_loss, predictions, true_vals = evaluate(dataloader_test)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')
  

  0%|          | 0/10 [00:00<?, ?it/s]

epoch 1:   0%|          | 0/16675 [00:00<?, ?it/s]