In [None]:
from pathlib import Path
import os
import pandas as pd
import re
import numpy as np

In [None]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

In [None]:
from transformers import BertTokenizerFast, BertModel, AdamW, get_linear_schedule_with_warmup

In [None]:
from project_path import project_path
# NOTE(review): presumably puts the project root on sys.path so that the
# `src.*` imports in a later cell resolve — TODO confirm against project_path.
project_path(1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, auc, precision_recall_fscore_support

In [None]:
# import from src
import src.models.models as models
import src.models.data_loader as data_loader
import src.models.training as training

In [None]:
import pickle

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# seeds: fix the NumPy and PyTorch global RNGs. RANDOM_SEED is also passed as
# random_state to the train_test_split calls further down.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

In [None]:
# The project root is the parent of the current working directory;
# raw input files are read from <project root>/data/raw.
proj_dir = Path.cwd().parents[0]
data_raw = proj_dir / 'data' / 'raw'

In [None]:
# Read the two raw CSV files from the raw-data directory.
data_fake, data_true = (
    pd.read_csv(data_raw / csv_name) for csv_name in ('Fake.csv', 'True.csv')
)

In [None]:
# Binary label column: 1 for rows from Fake.csv, 0 for rows from True.csv.
for frame, label in ((data_fake, 1), (data_true, 0)):
    frame['target'] = label

In [None]:
# Stack both classes into one frame (original row indices are kept as-is).
data = pd.concat([data_fake, data_true])

# Keep a stratified 10% sample; the other 90% is discarded.
data_sample = train_test_split(
    data,
    train_size=0.1,
    shuffle=True,
    stratify=data['target'],
    random_state=RANDOM_SEED,
)[0]

# Rename the `text` column to `content`.
# NOTE(review): presumably the column name the data loader expects — TODO confirm.
data_sample = data_sample.rename(columns={'text': 'content'})

#### Train / validation / test split

In [None]:
# 80% of the sample becomes the training set; the remaining 20% is kept in
# df_test for now and is divided again in the following cell.
df_train, df_test = train_test_split(
    data_sample,
    train_size=0.8,
    shuffle=True,
    stratify=data_sample['target'],
    random_state=RANDOM_SEED,
)

In [None]:
# Halve the held-out rows into validation and test sets, stratified on target.
# NOTE: df_test is overwritten here, so re-running only this cell splits the
# already-halved test set again; re-run the previous cell first.
df_val, df_test = train_test_split(
    df_test,
    train_size=0.5,
    shuffle=True,
    stratify=df_test['target'],
    random_state=RANDOM_SEED,
)

In [None]:
# Sanity check: (rows, columns) of the train, validation and test frames.
print(*(split_frame.shape for split_frame in (df_train, df_val, df_test)))

In [None]:
# Class balance of the training set, as fractions of its rows.
print(df_train['target'].value_counts(normalize=True))

#### Train model

In [None]:
# Location of the pretrained BERT checkpoint used for the tokenizer and model.
# It defaults to the original local directory; set the PRE_TRAINED_MODEL_NAME
# environment variable to run the notebook on another machine without editing
# this cell.
PRE_TRAINED_MODEL_NAME = os.environ.get(
    'PRE_TRAINED_MODEL_NAME',
    "D:/Users/Nicholas/Projects/BERT_pretrained/biobert-base-cased-v1.1",
)
tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# max_len is three times the 512 chunksize passed to the data loaders below;
# batch_size is shared by all three loaders.
max_len, batch_size = 512 * 3, 32

In [None]:
# Arguments that are identical for the train, validation and test loaders.
shared_loader_args = dict(
    tokenizer=tokenizer,
    max_len=max_len,
    batch_size=batch_size,
    chunksize=512,
    sampler=None,
)

# Training batches are shuffled and the last incomplete batch is dropped.
train_data_loader = data_loader.create_data_loader(
    df=df_train, shuffle=True, drop_last=True, **shared_loader_args)

# Evaluation loaders keep the row order and keep every row.
val_data_loader = data_loader.create_data_loader(
    df=df_val, shuffle=False, drop_last=False, **shared_loader_args)

test_data_loader = data_loader.create_data_loader(
    df=df_test, shuffle=False, drop_last=False, **shared_loader_args)

In [None]:
# Constructor arguments for models.HIBERT; reused later to rebuild the same
# architecture before loading the saved weights.
params_dict = dict(
    PRE_TRAINED_MODEL_NAME=PRE_TRAINED_MODEL_NAME,
    n_classes=2,
    add_linear=[512, 256],
    attn_bias=False,
    freeze_layer_count=8,
)

In [None]:
# Build the HIBERT model from the hyper-parameters above.
# NOTE(review): the model is not moved to `device` here; presumably
# training.train_model does that with the `device` it receives — TODO confirm.
model = models.HIBERT(**params_dict)

In [None]:
# List every parameter with its trainable flag to check which ones are frozen.
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.requires_grad)

In [None]:
epochs = 1

# One scheduler step per training batch, over all epochs.
total_steps = epochs * len(train_data_loader)

optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
    correct_bias=False,
)

# Linear decay of the learning rate over total_steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)

In [None]:
%%time
# Run training; the returned `history` is pickled and plotted in later cells
# (it is indexed there by 'train_loss', 'val_loss', 'train_f1', 'val_f1',
# 'train_auc' and 'val_auc').
# NOTE(review): the prediction section loads 'best_model_state.bin' and
# 'best_threshold.pkl'; presumably train_model writes them — TODO confirm.
history = training.train_model(epochs, model, train_data_loader, val_data_loader, loss_fn, optimizer, device, scheduler)

In [None]:
# Persist the training history so the plots can be redrawn without retraining.
# The context manager makes sure the file is flushed and closed; the original
# left the handle returned by open() unclosed.
with open('training_history.pkl', 'wb') as history_file:
    pickle.dump(history, history_file)

#### Plot training history

In [None]:
# Reload the saved training history. pickle.load takes the open file as its
# first argument; the original passed `history` there, which raises TypeError
# (and NameError on a fresh kernel, where `history` does not exist yet).
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open('training_history.pkl', 'rb') as history_file:
    history = pickle.load(history_file)

In [None]:
# Training vs. validation loss, one point per entry stored in `history`.
# The x-axis is taken from the length of each series instead of the `epochs`
# global, so a history reloaded from disk still plots after `epochs` changes.
# The marker keeps a single-epoch run visible (a one-point line draws nothing).
for split in ('train', 'val'):
    series = history[split + '_loss']
    plt.plot(range(len(series)), series, marker='o', label=split + ' loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend();

In [None]:
# Training vs. validation F1, one point per entry stored in `history`.
# The x-axis is taken from the length of each series instead of the `epochs`
# global, so a history reloaded from disk still plots after `epochs` changes.
# The marker keeps a single-epoch run visible (a one-point line draws nothing).
for split in ('train', 'val'):
    series = history[split + '_f1']
    plt.plot(range(len(series)), series, marker='o', label=split + ' f1')
plt.xlabel('epoch')
plt.ylabel('f1')
plt.legend();

In [None]:
# Training vs. validation AUC, one point per entry stored in `history`.
# The x-axis is taken from the length of each series instead of the `epochs`
# global, so a history reloaded from disk still plots after `epochs` changes.
# The marker keeps a single-epoch run visible (a one-point line draws nothing).
for split in ('train', 'val'):
    series = history[split + '_auc']
    plt.plot(range(len(series)), series, marker='o', label=split + ' auc')
plt.xlabel('epoch')
plt.ylabel('auc')
plt.legend();

#### Make predictions on the test set

In [None]:
# Rebuild the architecture with the same hyper-parameters and restore the
# saved weights.
# NOTE(review): 'best_model_state.bin' and 'best_threshold.pkl' are presumably
# written during training — TODO confirm in src.models.training.
model_trained = models.HIBERT(**params_dict)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model_trained.load_state_dict(
    torch.load('best_model_state.bin', map_location=device))

# pickle.load takes the open file as its first argument; the original passed
# `history` there, which raises TypeError. The context manager also closes the
# file. Only unpickle files you created yourself.
with open('best_threshold.pkl', 'rb') as threshold_file:
    best_threshold = pickle.load(threshold_file)

In [None]:
# Run the restored model over the held-out test loader, passing the loaded
# decision threshold through to pred_model.
pred_kwargs = dict(
    model=model_trained,
    data_loader=test_data_loader,
    device=device,
    best_threshold=best_threshold,
)
predictions = training.pred_model(**pred_kwargs)