<a href="https://colab.research.google.com/github/jmlDC/MediaBias-Thesis22-23/blob/Modeling/Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Modeling

In [1]:
from google.colab import drive
# Attach Google Drive to the Colab runtime so the project files are reachable.
drive.mount('/content/gdrive')

# Root folder of the project on Drive; later cells build their file paths from it.
# NOTE(review): this name shadows Python's built-in `dir()`. It is kept because
# later cells reference it.
dir = "/content/gdrive/MyDrive/THESIS-MS/Git-Thesis22-23/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load only the two columns used below: the frame code (the label) and the
# annotated text, then make sure the frame code is stored as an integer.
mfc_csv_path = f'{dir}Official/MFC_prepared.csv'
df = pd.read_csv(
    mfc_csv_path,
    usecols=["code_frames", "annotations"],
    header=0,
).astype({"code_frames": int})

# Try 1: Fine-tuning BERT for frame classification

In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

In [6]:
# https://github.com/susanli2016/NLP-with-Python/blob/master/Text_Classification_With_BERT.ipynb

In [7]:
# Show the loaded frame as the cell output (the saved output abbreviates the
# middle rows with "...").
df

Unnamed: 0,code_frames,annotations
0,10,Immigrants without HOPE need help entering col...
1,5,"But in the eyes of the law, he is an illegal i..."
2,15,"Reaction to Tancredo, Lamm as predicted"
3,13,"That, said the congressman, is what always hap..."
4,1,"$50,000 per entry"
...,...,...
46794,11,Smoking is becoming a social taboo
46795,5,Nor does it aid lawyers seeking novel ways to...
46796,15,'Ashes to Ashes'
46797,15,SMOKE SCREEN IS SEEN BEHIND THE SMOKELESS


In [27]:
# Stratified 85/15 split. The row index is split (not the text itself) so the
# rows of `df` can be tagged with their split afterwards; stratifying on the
# frame code keeps the class proportions the same in both parts.
frame_codes = df.code_frames.values
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    frame_codes,
    test_size=0.15,
    random_state=42,
    stratify=frame_codes,
)

In [28]:
# Start every row with a placeholder, then overwrite it with the split the row
# was assigned to.
df['data_type'] = 'not_set'
for split_name, split_index in (('train', X_train), ('val', X_val)):
    df.loc[split_index, 'data_type'] = split_name

# Row counts per frame code and split, to check the stratification visually.
df.groupby(['code_frames', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,annotations
code_frames,data_type,Unnamed: 2_level_1
1,train,2836
1,val,500
2,train,216
2,val,38
3,train,1263
3,val,223
4,train,1009
4,val,178
5,train,8236
5,val,1454


In [29]:
# Number of output classes requested from the classifier.
# NOTE(review): the frame codes shown in the outputs above start at 1 and go up
# to at least 15, and they are passed to the model unshifted, so 16 presumably
# leaves index 0 as an unused class - TODO confirm the largest code in the data.
numBer_labels = 16

In [30]:
# Tokenizer for the same `bert-base-cased` checkpoint the model is loaded from.
# `num_labels` is a model-configuration option and has no meaning for a
# tokenizer, so it is not passed here. `do_lower_case=False` keeps the input
# casing, as required by the cased vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                          do_lower_case=False)

In [None]:
def _encode_annotations(split):
    """Tokenize the annotations of one split ('train' or 'val').

    Every text is padded or truncated to 128 tokens; the result holds PyTorch
    tensors, including the attention mask.
    """
    split_texts = df.loc[df.data_type == split, 'annotations'].values
    return tokenizer.batch_encode_plus(
        split_texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=128,
        truncation=True,
        return_tensors='pt',
    )


def _split_labels(split):
    """Return the frame codes of one split as a tensor, in the row order of `df`."""
    return torch.tensor(df.loc[df.data_type == split, 'code_frames'].values)


encoded_data_train = _encode_annotations('train')
encoded_data_val = _encode_annotations('val')

# Training split: token ids, attention masks and labels.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = _split_labels('train')

# Validation split: same three tensors.
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = _split_labels('val')

In [None]:
# Bundle ids, masks and labels per split. The tensor order used here is the
# order in which the training and evaluation loops index each batch.
dataset_train = TensorDataset(
    input_ids_train,
    attention_masks_train,
    labels_train,
)
dataset_val = TensorDataset(
    input_ids_val,
    attention_masks_val,
    labels_val,
)

In [None]:
# Sanity check: number of examples in the training and validation datasets.
len(dataset_train), len(dataset_val)

In [None]:
# Sequence classifier loaded from the `bert-base-cased` checkpoint with
# `numBer_labels` output classes. Attention maps and hidden states are not
# requested; the loops below only read the loss and the logits.
classifier_options = {
    "num_labels": numBer_labels,
    "output_attentions": False,
    "output_hidden_states": False,
}
model = BertForSequenceClassification.from_pretrained("bert-base-cased", **classifier_options)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

# Training batches are drawn in random order; validation batches keep the
# order of the dataset.
train_sampler = RandomSampler(dataset_train)
val_sampler = SequentialSampler(dataset_val)

dataloader_train = DataLoader(dataset_train, batch_size=batch_size, sampler=train_sampler)
dataloader_validation = DataLoader(dataset_val, batch_size=batch_size, sampler=val_sampler)


In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of full passes over the training loader.
epochs = 10

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) * (epochs) steps, with no warm-up phase.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)


In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Weighted F1 score of the arg-max predictions against the true labels.

    Args:
        preds: array with one row of class scores per example; the predicted
            class is the index of the largest score in each row.
        labels: array of true class ids.

    Returns:
        The value of `f1_score(..., average='weighted')`.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for each class present in `labels`, how many of its examples were predicted correctly.

    Args:
        preds: array with one row of class scores per example; the predicted
            class is the index of the largest score in each row.
        labels: array of true class ids.

    Prints two lines per class ("Class: <id>" and "Accuracy: <correct>/<total>")
    followed by a blank line. Returns nothing.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for label in np.unique(actual):
        belongs_to_class = actual == label
        n_total = int(np.sum(belongs_to_class))
        n_correct = int(np.sum(predicted[belongs_to_class] == label))
        print(f'Class: {label}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [None]:
import random
import numpy as np

# Seed every random number generator used here (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value.
# NOTE(review): the model and the data loaders are created in earlier cells,
# i.e. before this seeding runs.
seed_val = 17
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model there.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

print(device)

In [None]:
def evaluate(dataloader_val):
    """Run the global `model` over `dataloader_val` without gradient tracking.

    The model is switched to evaluation mode first. Each batch is indexed as
    (input_ids, attention_mask, labels) and moved to the global `device`.

    Args:
        dataloader_val: iterable of batches that also supports `len()`.

    Returns:
        A tuple `(loss_val_avg, predictions, true_vals)`: the sum of the
        per-batch losses divided by the number of batches, the logits of all
        batches concatenated along axis 0, and the labels concatenated the
        same way (both as NumPy arrays).
    """
    model.eval()

    batch_losses = []
    logits_per_batch = []
    labels_per_batch = []

    for batch in dataloader_val:
        batch = tuple(tensor.to(device) for tensor in batch)
        input_ids, attention_mask, labels = batch[0], batch[1], batch[2]

        # No gradients are needed for scoring.
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

        # The first two entries of the model output are read as loss and logits.
        loss, logits = outputs[0], outputs[1]
        batch_losses.append(loss.item())
        logits_per_batch.append(logits.detach().cpu().numpy())
        labels_per_batch.append(labels.cpu().numpy())

    loss_val_avg = sum(batch_losses) / len(dataloader_val)
    predictions = np.concatenate(logits_per_batch, axis=0)
    true_vals = np.concatenate(labels_per_batch, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
# Fine-tuning loop: one pass over the training loader per epoch, then a
# checkpoint is written and the validation set is scored.
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0
    
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)

    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        # Batches are (input_ids, attention_mask, labels), matching the TensorDataset order.
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       
       
        outputs = model(**inputs)
        
        # With labels supplied, the first output entry is the loss for this batch.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        # Report the batch loss as returned by the model. It must not be divided
        # by len(batch): `batch` is the tuple of three tensors, so that length
        # is 3, not the number of examples.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})
         
        
    # Keep one checkpoint per epoch so any epoch can be reloaded later.
    torch.save(model.state_dict(), f'{dir}data_volume/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

In [None]:
# Inspect the shapes of the training tensors. They are already torch tensors,
# so they are copied with `detach().clone()`; wrapping an existing tensor in
# `torch.tensor(...)` emits a warning recommending exactly that.
input_ids = input_ids_train.detach().clone()  # token ids of the training split
print(input_ids.size())

attention_mask = attention_masks_train.detach().clone()  # attention masks of the training split
print(attention_mask.shape)

labels = labels_train.detach().clone()  # frame-code labels of the training split
print(labels.shape)


In [None]:
# Build a fresh classifier with the same configuration as the one that was
# trained, so a saved state dict can be loaded into it.
model = BertForSequenceClassification.from_pretrained("bert-base-cased",
                                                      num_labels=numBer_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=False
                                                    )

# Trailing ';' keeps the notebook from printing the whole module as the cell output.
model.to(device);

In [None]:
# Restore the weights saved after epoch 1. The file is read with its tensors
# mapped to the CPU and then copied into the model.
checkpoint_path = f'{dir}data_volume/finetuned_BERT_epoch_1.model'
checkpoint_state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint_state)


In [None]:
# Score the validation loader with the current model; the average loss
# (first return value) is discarded.
# NOTE(review): in IPython `_` normally refers to the previous cell output;
# assigning to it here rebinds that name.
_, predictions, true_vals = evaluate(dataloader_validation)


In [None]:
# Print the number of correct predictions per class for the validation results.
accuracy_per_class(predictions, true_vals)


# Other references

In [None]:
# https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
# https://github.com/marcellusruben/medium-resources/blob/main/Text_Classification_BERT/bert_medium.ipynb

In [None]:
# https://kyawkhaung.medium.com/multi-label-text-classification-with-bert-using-pytorch-47011a7313b9

In [None]:
# https://github.com/kyawkhaung/researcharticles/blob/main/classification/1%20Titles%20Only.ipynb

In [None]:
# https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613
# https://github.com/susanli2016/NLP-with-Python/blob/master/Text_Classification_With_BERT.ipynb


In [None]:
# https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb#scrollTo=2abfwdn-g135

In [None]:
# https://medium.com/@samia.khalid/bert-explained-a-complete-guide-with-theory-and-tutorial-3ac9ebc8fa7c

In [None]:
# https://stackoverflow.com/questions/68807958/pytorch-nn-crossentropyloss-indexerror-target-2-is-out-of-bounds
# Error discussed in the link above: nn.CrossEntropyLoss raising "IndexError: Target 2 is out of bounds"