In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
import torch 
import transformers

import transformers
import torch.nn as nn
from torch.utils.data import TensorDataset
from transformers import BertTokenizer,BertModel
import torch.nn.functional as F


from torch.utils.data import DataLoader,Dataset
from torch.nn.utils.rnn import pack_padded_sequence
from torch.optim import AdamW
from torch.optim.lr_scheduler import ExponentialLR

In [3]:
from sklearn import model_selection
from sklearn import metrics
from sklearn import preprocessing

In [4]:
# from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import DebertaTokenizer

In [5]:
import torch

In [6]:
train_df = pd.read_csv(f"../data/fe_data/fe_train.csv")
test_df = pd.read_csv(f"../data/fe_data/fe_test.csv")

In [7]:
submission_df = pd.read_csv(f"../data/stumbleupon/sampleSubmission.csv", sep=',')

In [8]:
train_df["total_text"] = train_df["total_text"].str.lower()
test_df["total_text"] = test_df["total_text"].str.lower()


text_features = ["total_text"]
xtrain = train_df[text_features + ["label"]]

xtest = test_df[text_features]

In [9]:
import math
print(torch.backends.mps.is_available())

print(torch.backends.mps.is_built())

True
True


In [10]:
config = {  "model_name" :"bert-base-uncased",
            "TOKENIZER"  :BertTokenizer.from_pretrained("bert-base-uncased", 
                                                        do_lower_case=True,),
            "MAX_LEN" : 512, #128, #64
            "TRAIN_BATCH_SIZE" : 2,  #6
            "EPOCHS" : 2,
            "DEVICE" : "mps",
            "MODEL_PATH":"model.pth"
          }

In [11]:
class BERTDataset:
    def __init__(self, text, target):
        self.text = text
        self.target = target
        self.tokenizer = config["TOKENIZER"]
        self.max_len = config["MAX_LEN"]

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
#             pad_to_max_length=True,
            truncation=True,
            padding='max_length'
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        if("token_type_ids" in inputs.keys()):
            token_type_ids = inputs["token_type_ids"]
            return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }
        else:
            return {
                "ids": torch.tensor(ids, dtype=torch.long),
                "mask": torch.tensor(mask, dtype=torch.long),
                "targets": torch.tensor(self.target[item], dtype=torch.float),
            }

class BERT_Test_Dataset:
    def __init__(self, text):
        self.text = text
        self.tokenizer = config["TOKENIZER"]
        self.max_len = config["MAX_LEN"]

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
#             pad_to_max_length=True,
            truncation=True,
            padding='max_length'
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        if("token_type_ids" in inputs.keys()):
            token_type_ids = inputs["token_type_ids"]
            return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        }
        else:
            return {
                "ids": torch.tensor(ids, dtype=torch.long),
                "mask": torch.tensor(mask, dtype=torch.long),
            }

In [12]:
from transformers import BertModel
class TextClassification(nn.Module):

    def __init__(self):
        super(TextClassification, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(pretrained_model_name_or_path =config["model_name"], return_dict=False, )
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        _, o2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        bo = self.bert_drop(o2)
        output = self.out(bo)
        return output

In [13]:
import torch
import torch.nn as nn
from tqdm.notebook import tqdm


def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))


def train_fn(data_loader, model, optimizer, device, scheduler, epoch,fold):
    model.train()
    loss_train_total = 0
    
    progress_bar = tqdm(enumerate(data_loader), 
                        total=len(data_loader),
                        desc='OOF {:1d} Epoch {:1d}'.format(int(fold) , epoch), 
                        leave=False, 
                        disable=False)    

    for bi, d in  progress_bar:
        ids = d["ids"]
        mask = d["mask"]
        targets = d["targets"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        if("token_type_ids" in d.keys()):
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        else:
            outputs = model(ids=ids, mask=mask)

        loss = loss_fn(outputs, targets)
        loss_train_total +=loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.5f}'.format(loss.item()/len(targets))})
    loss_train_avg = loss_train_total/len(data_loader)
    return loss_train_avg 


def eval_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    loss_total = 0
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
        
            
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)
            if("token_type_ids" in d.keys()):
                token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
                outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            else:
                outputs = model(ids=ids, mask=mask)
            
                loss_total = loss_total + loss_fn(outputs, targets).item()
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            loss_total = loss_total / len(data_loader)
    return loss_total,fin_outputs, fin_targets

def eval_test(data_loader, model, device):
    model.eval()
    fin_outputs = []
    progress_bar = tqdm(enumerate(data_loader), 
                        total=len(data_loader),
                        desc='Generating Test Output'.format(epoch), 
                        leave=False, 
                        disable=False)
    with torch.no_grad():
        for bi, d in progress_bar:
            ids = d["ids"]
            mask = d["mask"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            if("token_type_ids" in d.keys()):
                token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
                outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            else:
                outputs = model(ids=ids, mask=mask)
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs

In [14]:
from sklearn.model_selection import train_test_split

def get_data_loaders(x_train,x_valid):
#     x_train , x_valid = train_test_split(train, test_size=0.1,random_state=2020)
    train_dataset = BERTDataset(text=x_train.total_text.values, target=x_train.label.values)
    train_loader = torch.utils.data.DataLoader(train_dataset,batch_size = config["TRAIN_BATCH_SIZE"],shuffle=True)
    
    valid_dataset = BERTDataset(text=x_valid.total_text.values, target=x_valid.label.values)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,batch_size = config["TRAIN_BATCH_SIZE"],shuffle=True)
    
    return train_loader , valid_loader

def get_test_data_loaders(x_test):
#     x_train , x_valid = train_test_split(train, test_size=0.1,random_state=2020)
    test_dataset = BERT_Test_Dataset(text=x_test.total_text.values)
    test_loader = torch.utils.data.DataLoader(test_dataset,batch_size = config["TRAIN_BATCH_SIZE"],shuffle=False)
    
    return test_loader 

In [15]:
device = torch.device(config["DEVICE"])
num_folds = 2
kf = model_selection.StratifiedKFold(n_splits=num_folds)

In [16]:


# fill the new kfold column
for f, (t_, v_) in enumerate(kf.split(X=xtrain, y=xtrain["label"])):
    xtrain.loc[v_, 'kfold'] = f

val_auc = 0
y_test_pred = []
for fold in xtrain.kfold.unique():

    model = TextClassification()
    model = nn.DataParallel(model)
    model.to(device)
    x_train = xtrain.loc[xtrain.kfold != fold,:].reset_index(drop=True)
    x_valid = xtrain.loc[xtrain.kfold == fold,:].reset_index(drop=True)

    train_data_loader , valid_data_loader = get_data_loaders(x_train,x_valid)

    num_train_steps = int(len(x_train) / config["TRAIN_BATCH_SIZE"] * config["EPOCHS"])
    optimizer = AdamW(model.parameters(), lr=2e-5)

    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )


    
    for epoch in tqdm(range(config["EPOCHS"])):

        loss_train_avg = train_fn(train_data_loader, model, optimizer, device, scheduler,epoch,fold)
        tqdm.write(f'\nEpoch {epoch}')
        tqdm.write(f'Training loss: {loss_train_avg}')
        val_loss,outputs, targets = eval_fn(valid_data_loader, model, device)
        auc = metrics.roc_auc_score(targets, outputs)     
        tqdm.write(f'Validation Loss: {val_loss}')
        tqdm.write(f'AUC : {auc}')        

    val_auc = val_auc + auc
    test_data_loader = get_test_data_loaders(xtest)
    outputs = eval_test(test_data_loader, model, device)
    y_test_pred.append(outputs)
    tqdm.write(f"OOF -- {fold} ROC AUC Score = {auc}") 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xtrain.loc[v_, 'kfold'] = f


  0%|          | 0/2 [00:00<?, ?it/s]

OOF 0 Epoch 0:   0%|          | 0/1849 [00:00<?, ?it/s]


Epoch 0
Training loss: 0.4735103784609544


  0%|          | 0/1849 [00:00<?, ?it/s]

Validation Loss: 0.0
AUC : 0.8710771572415408


OOF 0 Epoch 1:   0%|          | 0/1849 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.38381016951431096


  0%|          | 0/1849 [00:00<?, ?it/s]

Validation Loss: 0.0
AUC : 0.872728017796511


Generating Test Output:   0%|          | 0/1586 [00:00<?, ?it/s]

OOF -- 0.0 ROC AUC Score = 0.872728017796511


  0%|          | 0/2 [00:00<?, ?it/s]

OOF 1 Epoch 0:   0%|          | 0/1849 [00:00<?, ?it/s]


Epoch 0
Training loss: 0.49307335186054607


  0%|          | 0/1849 [00:00<?, ?it/s]

Validation Loss: 0.0
AUC : 0.8811976094903444


OOF 1 Epoch 1:   0%|          | 0/1849 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.40189161261049006


  0%|          | 0/1849 [00:00<?, ?it/s]

Validation Loss: 0.0
AUC : 0.8850722594392975


Generating Test Output:   0%|          | 0/1586 [00:00<?, ?it/s]

OOF -- 1.0 ROC AUC Score = 0.8850722594392975


In [17]:
val_auc_avg = val_auc / 4
print(f"Total Val AUC -- {val_auc_avg}")

Total Val AUC -- 0.43945006930895214


In [18]:
model

DataParallel(
  (module): TextClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)


In [19]:
FILE = 'model_v1.pt'
torch.save(model, FILE)

In [20]:
#load model
#FILE = 'model_v1.pt'
#model = torch.load(FILE)

In [21]:
submission_df["label"] = 0
bert_pred = 0
for pred in y_test_pred:
    bert_pred += np.array(pred).flatten()


In [22]:
submission_df["label"] = bert_pred
submission_df.to_csv(f"../data/submission/submission_bert.csv", index = False)