<a href="https://colab.research.google.com/github/kate-markina2709/NIRS/blob/master/NIRS_BERT_2linlay_5ep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 8.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 69.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil
import sys
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split

In [None]:
# Locations of the training and held-out test CSV files, relative to the
# notebook's working directory.
train_path = "train_new_data.csv"
path_to_test = "test_new_data.csv"

# Comma is read_csv's default delimiter, so it is not spelled out.
from_train = pd.read_csv(train_path)
from_test = pd.read_csv(path_to_test)

In [None]:
# Class balance check: number of training rows per source subreddit.
# NOTE: 'subreddit' is not among the columns kept by the column-selection cell
# further down, so this cell only works before that cell has been run.
from_train['subreddit'].value_counts()

psbattle_artwork         13207
mildlyinteresting         6874
photoshopbattles          4495
pareidolia                3762
fakehistoryporn           2884
nottheonion               2576
upliftingnews             1931
fakealbumcovers           1746
misleadingthumbnails      1242
propagandaposters         1114
subredditsimulator         898
confusing_perspective      637
savedyouaclick             624
theonion                   624
neutralnews                605
usnews                     474
usanews                    470
pic                        463
satire                     313
waterfordwhispersnews       30
subsimulatorgpt2            21
Name: subreddit, dtype: int64

In [None]:
# Narrow the training frame to what the model consumes: the title text, the
# 'mark' class id, and one indicator column per subreddit.
from_train = from_train[
    ['clean_title', 'mark']
    + [
        'mildlyinteresting', 'photoshopbattles', 'psbattle_artwork',
        'pareidolia', 'nottheonion', 'fakealbumcovers',
        'confusing_perspective', 'usnews', 'fakehistoryporn',
        'propagandaposters', 'misleadingthumbnails', 'upliftingnews',
        'savedyouaclick', 'usanews', 'theonion', 'pic',
        'subredditsimulator', 'neutralnews', 'satire',
        'waterfordwhispersnews', 'subsimulatorgpt2',
    ]
]

In [None]:
# Apply the same column selection to the test frame: title text, 'mark' class
# id, then one indicator column per subreddit.
from_test = from_test[
    ['clean_title', 'mark']
    + [
        'mildlyinteresting', 'photoshopbattles', 'psbattle_artwork',
        'pareidolia', 'nottheonion', 'fakealbumcovers',
        'confusing_perspective', 'usnews', 'fakehistoryporn',
        'propagandaposters', 'misleadingthumbnails', 'upliftingnews',
        'savedyouaclick', 'usanews', 'theonion', 'pic',
        'subredditsimulator', 'neutralnews', 'satire',
        'waterfordwhispersnews', 'subsimulatorgpt2',
    ]
]

In [None]:
# Label columns, in the order used to build each sample's target vector.
target_list = [
    'mildlyinteresting',
    'photoshopbattles',
    'psbattle_artwork',
    'pareidolia',
    'nottheonion',
    'fakealbumcovers',
    'confusing_perspective',
    'usnews',
    'fakehistoryporn',
    'propagandaposters',
    'misleadingthumbnails',
    'upliftingnews',
    'savedyouaclick',
    'usanews',
    'theonion',
    'pic',
    'subredditsimulator',
    'neutralnews',
    'satire',
    'waterfordwhispersnews',
    'subsimulatorgpt2',
]

In [None]:
# hyperparameters
MAX_VAL_LEN = 128        # token length every title is padded/truncated to
BATCH_SIZE_T = 32        # training batch size
BATCH_SIZE_V = 32        # validation batch size
TEST_BATCH_SIZE = 32     # test batch size
EPOCHS = 5               # passes over the training partition
LEARNING_RATE = 1e-05    # Adam step size

# Seed the random number generators so weight initialisation, dropout and batch
# shuffling start from the same state on every Restart & Run All. The value
# matches the random_state used for the train/validation split.
# NOTE: some GPU kernels are non-deterministic, so runs may still differ slightly.
SEED = 100
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint, the
# same checkpoint name the model class loads. The captured output below shows
# its files being downloaded.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class DataCustom(torch.utils.data.Dataset):
    """Dataset that tokenizes titles and pairs them with their label vectors.

    Args:
        df: DataFrame holding a 'clean_title' text column and one column per
            label.
        bert_tokenizer: tokenizer object exposing ``encode_plus``.
        max_val_len: length every title is padded or truncated to.
        target_cols: names of the label columns. When omitted, the
            module-level ``target_list`` is used.
    """

    def __init__(self, df, bert_tokenizer, max_val_len, target_cols=None):
        self.tokenizer = bert_tokenizer
        self.df = df
        self.title = df['clean_title']
        if target_cols is None:
            target_cols = target_list
        self.targets = self.df[list(target_cols)].values
        self.max_len = max_val_len

    def __len__(self):
        """Return the number of samples (one per title)."""
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded title and float target vector at position ``index``.

        Returns:
            dict with 1-D tensors 'input_ids', 'attention_mask',
            'token_type_ids' and a float tensor 'targets'.
        """
        # Positional lookup: ``self.title[index]`` is a label lookup and only
        # matches positions when the frame carries a fresh 0..n-1 index.
        title = str(self.title.iloc[index])
        # Collapse every run of whitespace into a single space.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title, None, add_special_tokens=True, max_length=self.max_len,
            padding='max_length', return_token_type_ids=True, truncation=True,
            return_attention_mask=True, return_tensors='pt')

        # The tokenizer output is flattened so each field is 1-D per sample.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)}

In [None]:
# Stratified 80/20 split of the row labels, stratified on the 'mark' class id.
X_tr, X_val, Y_tr, Y_val = train_test_split(
    from_train.index.values,
    from_train['mark'].values,
    test_size=0.2,
    random_state=100,
    stratify=from_train['mark'].values,
)

# Tag every row with the partition it landed in. Validation rows carry the
# tag 'test'.
from_train['type_of_data'] = 'not_set'
from_train.loc[X_val, 'type_of_data'] = 'test'
from_train.loc[X_tr, 'type_of_data'] = 'train'

# Row counts per class and partition, shown as the cell output.
from_train.groupby(['mark', 'type_of_data']).count()

In [None]:
# Materialise the three partitions; reset_index gives each one a fresh
# 0..n-1 index. Rows tagged 'test' inside from_train form the validation set.
data_train = from_train.loc[from_train['type_of_data'] == 'train'].reset_index(drop=True)
data_val = from_train.loc[from_train['type_of_data'] == 'test'].reset_index(drop=True)
data_test = from_test.reset_index(drop=True)

In [None]:
# Preview the first rows of the training partition.
data_train.head()

Unnamed: 0,clean_title,mark,mildlyinteresting,photoshopbattles,psbattle_artwork,pareidolia,nottheonion,fakealbumcovers,confusing_perspective,usnews,...,savedyouaclick,usanews,theonion,pic,subredditsimulator,neutralnews,satire,waterfordwhispersnews,subsimulatorgpt2,type_of_data
0,golden gate dust art,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,train
1,girl blowing out a candle,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,train
2,two different types of people sharing the road...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,train
3,happy door plate and his severely handicapped ...,3,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,train
4,httpsiimgurcomowbspjpg,2,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,train


In [None]:
# Preview the first rows of the validation partition.
data_val.head()

Unnamed: 0,clean_title,mark,mildlyinteresting,photoshopbattles,psbattle_artwork,pareidolia,nottheonion,fakealbumcovers,confusing_perspective,usnews,...,savedyouaclick,usanews,theonion,pic,subredditsimulator,neutralnews,satire,waterfordwhispersnews,subsimulatorgpt2,type_of_data
0,please dont eat me,2,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,test
1,i saw others showing their finds i figured id ...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,test
2,a cheeto after weeks in the california sun,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,test
3,gon freecss long hair cosplay,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,test
4,this oddly straight banana,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,test


In [None]:
# Preview the first rows of the test frame.
data_test.head()

Unnamed: 0,clean_title,mark,mildlyinteresting,photoshopbattles,psbattle_artwork,pareidolia,nottheonion,fakealbumcovers,confusing_perspective,usnews,...,upliftingnews,savedyouaclick,usanews,theonion,pic,subredditsimulator,neutralnews,satire,waterfordwhispersnews,subsimulatorgpt2
0,my walgreens offbrand mucinex was engraved wit...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,this concerned sink with a tiny hat,3,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,hackers leak emails from uae ambassador to us,17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,puppy taking in the view,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,i found a face in my sheet music too,3,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Wrap each partition in a DataCustom dataset sharing one tokenizer and one
# maximum sequence length.
dataset_train, dataset_val, dataset_test = (
    DataCustom(frame, bert_tokenizer, MAX_VAL_LEN)
    for frame in (data_train, data_val, data_test)
)

In [None]:
# Only the training loader shuffles; validation and test batches are drawn in
# dataset order. num_workers=0 keeps loading in the main process.
train_data_loader = torch.utils.data.DataLoader(
    dataset=dataset_train,
    batch_size=BATCH_SIZE_T,
    shuffle=True,
    num_workers=0,
)

val_data_loader = torch.utils.data.DataLoader(
    dataset=dataset_val,
    batch_size=BATCH_SIZE_V,
    shuffle=False,
    num_workers=0,
)

test_data_loader = torch.utils.data.DataLoader(
    dataset=dataset_test,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device(type='cuda')

In [None]:
def checkp_load(check_path, model_bert, adam_optimizer):
    """Restore model and optimizer state from a saved checkpoint file.

    Args:
        check_path: path of a checkpoint dict with the keys 'state_dict',
            'optimizer', 'epoch' and 'valid_loss_min'.
        model_bert: model whose parameters are overwritten in place.
        adam_optimizer: optimizer whose state is overwritten in place.

    Returns:
        Tuple ``(model_bert, adam_optimizer, epoch, valid_loss_min)`` where
        ``epoch`` is the stored epoch value and ``valid_loss_min`` is a float.
    """
    # Deserialize onto the CPU so a checkpoint written on a GPU machine can be
    # opened where CUDA is unavailable; load_state_dict then copies the values
    # into the existing parameters.
    checkpoint = torch.load(check_path, map_location='cpu')
    model_bert.load_state_dict(checkpoint['state_dict'])
    adam_optimizer.load_state_dict(checkpoint['optimizer'])
    # train() stores the loss as a plain Python float, which has no .item();
    # float() accepts that as well as a 0-d tensor or NumPy scalar.
    valid_loss_min = float(checkpoint['valid_loss_min'])
    return model_bert, adam_optimizer, checkpoint['epoch'], valid_loss_min

def checkp_save(state, bast_val_bool, check_path, path_to_best_m):
    """Write a checkpoint and, when flagged, also keep it as the best model.

    The checkpoint is always written to ``check_path``, whether or not it is
    the best one; it is additionally copied to ``path_to_best_m`` only when
    ``bast_val_bool`` is true.

    Args:
        state: object to serialize with ``torch.save``.
        bast_val_bool: whether this checkpoint should be marked as the best.
        check_path: destination of the regular checkpoint file.
        path_to_best_m: destination of the "best" copy.
    """
    torch.save(state, check_path)
    if not bast_val_bool:
        return
    shutil.copyfile(check_path, path_to_best_m)

In [None]:
class BERTClass(torch.nn.Module):
    """Pretrained BERT encoder followed by a two-layer classification head.

    The head applies dropout -> Linear -> ReLU -> dropout -> Linear to the
    encoder's pooled output and returns raw logits (no sigmoid or softmax).

    Args:
        hidden_dim: width of the intermediate linear layer.
        num_labels: number of output logits.
        dropout_p: dropout probability applied before each linear layer.
    """

    def __init__(self, hidden_dim=300, num_labels=21, dropout_p=0.1):
        super().__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(dropout_p)
        # Take the encoder width from its config rather than hard-coding 768.
        encoder_dim = self.bert_model.config.hidden_size
        self.linear = torch.nn.Linear(encoder_dim, hidden_dim)
        self.linear1 = torch.nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return the logits for a batch of encoded titles.

        Args:
            input_ids: token id tensor passed to the encoder.
            attn_mask: attention mask passed as ``attention_mask``.
            token_type_ids: segment id tensor passed to the encoder.
        """
        encoded = self.bert_model(input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids)
        # First head layer: dropout, projection to hidden_dim, ReLU.
        pooled = self.dropout(encoded.pooler_output)
        hidden = F.relu(self.linear(pooled))
        # Second head layer: dropout (same module reused), projection to logits.
        return self.linear1(self.dropout(hidden))

# Build the classifier and move it to the selected device. Because .to(device)
# is the cell's last expression, the module structure is displayed below.
model_bert = BERTClass()
model_bert.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
def loss_func(outputs, targets):
    """Return the mean binary cross-entropy between raw logits and targets.

    Args:
        outputs: raw (pre-sigmoid) model outputs.
        targets: float tensor of the same shape as ``outputs``.
    """
    return F.binary_cross_entropy_with_logits(outputs, targets)

# Adam over every model parameter, so the encoder is trained along with the head.
adam_optimizer = torch.optim.Adam(model_bert.parameters(), lr=LEARNING_RATE)

In [None]:
# Module-level buffers that train() appends to during validation: target rows
# and the matching sigmoid outputs.
tar_val, out_val = [], []

In [None]:
def _run_batch(model_bert, data):
    """Move one loader batch to ``device`` and return ``(logits, targets)``."""
    ids = data['input_ids'].to(device, dtype=torch.long)
    mask = data['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
    targets = data['targets'].to(device, dtype=torch.float)
    return model_bert(ids, mask, token_type_ids), targets


def train(ep_count, tr_loader, v_loader, model_bert,
          adam_optimizer, checkpoint_path, best_model_path):
    """Train the model, validating and checkpointing after every epoch.

    After each epoch a checkpoint is written to ``checkpoint_path``. When the
    average validation loss is no higher than the best seen so far, the
    checkpoint is also copied to ``best_model_path``. The module-level lists
    ``tar_val`` and ``out_val`` are refilled every epoch with that epoch's
    validation targets and sigmoid outputs.

    Args:
        ep_count: number of epochs to run.
        tr_loader: loader yielding training batches.
        v_loader: loader yielding validation batches.
        model_bert: model to train (updated in place).
        adam_optimizer: optimizer stepping the model's parameters.
        checkpoint_path: file written after every epoch.
        best_model_path: file receiving the best checkpoint so far.

    Returns:
        The same ``model_bert`` object, after training.
    """
    # np.inf is the supported spelling; the np.Inf alias was removed in NumPy 2.0.
    valid_loss_min = np.inf
    for epoch in range(1, ep_count + 1):
        train_loss = 0
        valid_loss = 0

        model_bert.train()
        print('Epoch {}: Training Start'.format(epoch))
        for data in tr_loader:
            adam_optimizer.zero_grad()
            outputs, targets = _run_batch(model_bert, data)
            loss = loss_func(outputs, targets)
            loss.backward()
            adam_optimizer.step()
            train_loss = train_loss + loss.item()

        print('Epoch {}: Validation Start'.format(epoch))
        model_bert.eval()

        # Empty the shared buffers in place so they describe only this epoch
        # instead of accumulating every epoch's predictions.
        tar_val.clear()
        out_val.clear()
        with torch.no_grad():
            for data in v_loader:
                outputs, targets = _run_batch(model_bert, data)
                valid_loss = valid_loss + loss_func(outputs, targets).item()
                tar_val.extend(targets.cpu().numpy().tolist())
                out_val.extend(torch.sigmoid(outputs).cpu().numpy().tolist())

        # Turn the summed batch losses into per-batch averages.
        train_loss = train_loss / len(tr_loader)
        valid_loss = valid_loss / len(v_loader)
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch, train_loss, valid_loss))

        is_best = valid_loss <= valid_loss_min
        if is_best:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min, valid_loss))
            valid_loss_min = valid_loss

        # 'epoch' holds the next epoch number; 'valid_loss_min' is the best
        # validation loss up to and including this epoch.
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss_min,
            'state_dict': model_bert.state_dict(),
            'optimizer': adam_optimizer.state_dict()}
        # A single call writes the regular checkpoint and, when is_best is
        # true, the best-model copy as well.
        checkp_save(checkpoint, is_best, checkpoint_path, best_model_path)

        print('Epoch {}  Done\n'.format(epoch))
    return model_bert

In [None]:
# File the per-epoch checkpoint is written to, and file that receives the copy
# of the checkpoint with the best validation loss.
checkp_path = "val_ckpt"
path_to_best_model = "model_best.pt"

In [None]:
# Fine-tune for EPOCHS epochs. train() returns the model object it was given,
# so trained_model and model_bert refer to the same module.
trained_model = train(EPOCHS, train_data_loader, val_data_loader, model_bert, adam_optimizer, checkp_path, path_to_best_model)

In [None]:
# Evaluate on the test loader. The val_* names are kept from the original cell
# even though they hold test-set targets, outputs and loss.
# NOTE: this scores the weights currently in memory (the state after the last
# epoch); the best checkpoint is not loaded here.
val_tar, val_out = [], []
val_loss = 0
model_bert.eval()
with torch.no_grad():
    for data in test_data_loader:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)
        outputs = model_bert(ids, mask, token_type_ids)

        loss = loss_func(outputs, targets)
        val_loss = val_loss + loss.item()
        val_tar.extend(targets.cpu().numpy().tolist())
        val_out.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
# Average of the per-batch losses.
val_loss = val_loss / len(test_data_loader)
print('Test loss: {:.6f}'.format(val_loss))

In [None]:
# Try the model on one concrete example from the test frame.
example = data_test['clean_title'].iloc[10]
print(example)
encodings = bert_tokenizer.encode_plus(
    example,
    None,
    add_special_tokens=True,
    max_length=MAX_VAL_LEN,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt')
model_bert.eval()
with torch.no_grad():
    input_ids = encodings['input_ids'].to(device, dtype=torch.long)
    attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
    output = model_bert(input_ids, attention_mask, token_type_ids)
    # One row of per-label sigmoid scores for the single example.
    _output = torch.sigmoid(output).cpu().numpy().tolist()
    print(_output)
    # Map the highest-scoring position back to its label through target_list,
    # the list that fixes the target order, rather than through from_train's
    # column positions, which shift when columns are added or reordered.
    # Indexing row 0 first lets argmax return a scalar instead of a
    # one-element array.
    print(target_list[int(np.argmax(_output[0]))])