# Mount google drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!ls drive

In [None]:
cd /content/drive/MyDrive/Colab Notebooks/recommendation_systems

In [None]:
! ls

# Install transformers

https://huggingface.co/transformers/installation.html

In [None]:
! pip install transformers
# ! pip install sentencepiece
# ! pip install Torchtext==0.04

# Import modules

In [None]:
import os
import re
import time
import random
from glob import glob
import zipfile
import pickle
from pprint import pprint
import csv

import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
from sklearn import metrics
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from wordcloud import WordCloud

import torch
from torch import nn
import torch.nn.functional as F
import torchtext
from transformers import AutoModel, AutoTokenizer
import torch.optim as optim

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', None)

## set random seed

In [None]:
def set_random_seed(seeds):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and, when CUDA
    is available, every CUDA device), exports ``PYTHONHASHSEED`` and asks
    PyTorch to use deterministic algorithms only.

    Args:
        seeds: Integer seed applied to all generators.

    Returns:
        None
    """
    random.seed(seeds)
    # Exported for child processes; the running interpreter fixed its own
    # hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seeds)
    # With deterministic algorithms enabled, cuBLAS on CUDA >= 10.2 raises a
    # RuntimeError unless a fixed workspace configuration is provided.
    # setdefault keeps any value the user already exported.
    os.environ.setdefault('CUBLAS_WORKSPACE_CONFIG', ':4096:8')
    np.random.seed(seeds)
    torch.manual_seed(seeds)
    torch.use_deterministic_algorithms(True)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seeds)
    return
set_random_seed(999)    

# Load and process data

https://msnews.github.io/

## load data

In [None]:
# The MINDlarge_train split is used for training; the MINDlarge_dev
# (validation) split stands in for the test set.
data_path_train = "/content/drive/MyDrive/Colab Notebooks/recommendation_systems/mind_dataset/MINDlarge_train/"
data_path_test = "/content/drive/MyDrive/Colab Notebooks/recommendation_systems/mind_dataset/MINDlarge_dev/"

In [None]:
# Input CSV for the training split (the "_all" variant is kept for reference).
# filename_train_all = data_path_train + "news_click_df_all.csv"
filename_train_all = data_path_train + "news_click_df_select.csv"
# Output path: select_data() later writes the down-sampled text/label CSV here,
# and the torchtext / csv readers further down load it from this path.
filename_train = data_path_train + "news_click_df.csv"
data_df_train = pd.read_csv(filename_train_all)
# Show the class balance of both candidate label columns.
print (data_df_train["click_prob_flag"].value_counts())
print (data_df_train["category_flag"].value_counts())
data_df_train.head(2)

In [None]:
# Input CSV for the evaluation split (the "_all" variant is kept for reference).
# filename_test_all = data_path_test + "news_click_df_all.csv"
filename_test_all = data_path_test + "news_click_df_select.csv"
# Output path: select_data() later writes the down-sampled text/label CSV here.
filename_test = data_path_test + "news_click_df.csv"
data_df_test = pd.read_csv(filename_test_all)
# Show the class balance of both candidate label columns.
print (data_df_test["click_prob_flag"].value_counts())
print (data_df_test["category_flag"].value_counts())

## select data

In [None]:
# Model input is the news title only; appending the abstract is currently disabled.
data_df_train["text"] = data_df_train["title"] #+ ". " + data_df_train["abstract"]
data_df_test["text"] = data_df_test["title"] #+ ". " + data_df_test["abstract"]
# Target column is click_prob_flag (category_flag is the alternative noted inline).
data_df_train["label"] = data_df_train["click_prob_flag"] # category_flag # click_prob_flag
data_df_test["label"] = data_df_test["click_prob_flag"]

# Columns written to the CSV files that are read back for training/evaluation.
select_col = ["text","label"]
# Number of distinct labels in the training data; DistilBERTClassifier uses it
# as the size of its output layer.
num_classes = len(data_df_train["label"].unique())

def select_data(data_df_train, select_col, filename_train):
    """Keep the trailing tenth of a frame, save it as CSV and return it.

    Despite the parameter names, the notebook calls this for both the train
    and the test frame.

    Args:
        data_df_train: Source DataFrame containing every column in
            ``select_col``.
        select_col: List of column names to keep; must include ``"label"``
            because its value counts are printed.
        filename_train: Destination CSV path (written without the index).

    Returns:
        DataFrame holding the selected columns of the last ``len / 10`` rows
        (rounded down).
    """
    n_keep = int(len(data_df_train) / 10)
    subset = data_df_train[select_col]
    subset = subset.tail(n_keep)
    subset.to_csv(filename_train, index=False)
    label_counts = subset["label"].value_counts()
    print (label_counts)
    return subset

# Write the down-sampled text/label CSVs for both splits (filename_train and
# filename_test) and keep the resulting frames for inspection.
select_data_df_train = select_data(data_df_train, select_col, filename_train)
select_data_df_test = select_data(data_df_test, select_col, filename_test)
select_data_df_train.head()

# DistilBERT

document: https://huggingface.co/transformers/model_doc/distilbert.html

pretrained model: https://huggingface.co/transformers/pretrained_models.html

japanese: https://github.com/BandaiNamcoResearchInc/DistilBERT-base-jp/blob/main/docs/GUIDE.md

## Tokenization

In [None]:
# #### English
# Checkpoint name shared by the tokenizer and the inspection model below.
select_pretrained_model = "distilbert-base-uncased"
# Tokenizer used by text_tokenizer() when the torchtext datasets are built.
tokenizer = AutoTokenizer.from_pretrained(select_pretrained_model)
# Loaded for inspection (printed below); DistilBERTClassifier loads its own
# copy of the encoder rather than reusing this object.
distilbert_model = AutoModel.from_pretrained(select_pretrained_model)

# #### Japanese
# tokenizer_jap = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
# distilbert_jap_model = AutoModel.from_pretrained("bandainamco-mirai/distilbert-base-japanese")

In [None]:
tokenizer.tokenize("I have a new GPU!")

In [None]:
print(distilbert_model)
# print(distilbert_jap_model)

## Process train/test data

Torchtext is used for processing data.

Mainly, three steps: 
1. create Field object
2. create dataset
3. separate batches

migration tutorial: 
https://colab.research.google.com/github/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb#scrollTo=ez2lT2QO0sNj

https://github.com/mmg10/pytorch_text_new/blob/main/LSTM_text.ipynb

In [None]:
# tokenizer.encode(data_df_train["title"].values[0], return_tensors='pt')[0]

In [None]:
# train_encodings = tokenizer(data_df_train["title"].values.tolist(), truncation=True, padding=True)
# train_encodings

## Old format using `torchtext.legacy`

In [None]:
# create Field object
def text_tokenizer(text):
    """Encode one text into a 1-D tensor of DistilBERT token ids.

    Relies on the module-level ``tokenizer`` being the Hugging Face
    tokenizer; a later cell rebinds that name to the custom ``Tokenizer``
    class, so the datasets must be built before that cell runs.

    Args:
        text: Raw input string.

    Returns:
        1-D tensor of token ids, including the special tokens added by
        ``encode``.
    """
    # truncation=True caps the sequence at the tokenizer's maximum model
    # length, so an over-long text cannot overflow the position embeddings.
    return tokenizer.encode(text, truncation=True, return_tensors='pt')[0]

# TEXT: token ids come straight from text_tokenizer (use_vocab=False), so no
# torchtext vocabulary is built. include_lengths=True is why batches expose the
# id tensor as batch.text[0] in the training/prediction code.
# NOTE(review): pad/unk/eos are all id 0 — presumably chosen to match the
# [PAD] id of distilbert-base-uncased; confirm against tokenizer.pad_token_id.
TEXT = torchtext.legacy.data.Field(sequential=True, tokenize=text_tokenizer, use_vocab=False, lower=False,
                            include_lengths=True, batch_first=True, pad_token=0, unk_token=0, eos_token=0)
# LABEL: a single value per example, used as-is (no vocabulary).
LABEL = torchtext.legacy.data.Field(sequential=False, use_vocab=False)

In [None]:
# Read the text/label CSV files written by select_data() and build one
# torchtext dataset per split. The header row is skipped; columns map to the
# TEXT and LABEL fields in file order.
train_dataset = torchtext.legacy.data.TabularDataset(path=filename_train, format='csv', skip_header=True,
                            fields=[('text', TEXT), ('label', LABEL)])
test_dataset = torchtext.legacy.data.TabularDataset(path=filename_test, format='csv', skip_header=True,
                            fields=[('text', TEXT), ('label', LABEL)])

# Sanity check: print the first example only.
for train in train_dataset:
    print (train.text, train.label)
    break

In [None]:
# Separate the datasets into batches.
BATCH_SIZE = 256 #32
# One iterator per split, same batch size for both; repeat=False and sort=False
# are passed explicitly.
train_iter, test_iter = torchtext.legacy.data.Iterator.splits((train_dataset, test_dataset), batch_sizes=(BATCH_SIZE, BATCH_SIZE), repeat=False, sort=False)
# Number of batches per epoch.
print (len(train_iter))

# Sanity check: print the first batch only.
for train in train_iter:
    print (train.text, train.label)
    break

## Newer torchtext version (without `legacy`)

In [None]:
print(torch.__version__)
print(torchtext.__version__)

In [None]:
import collections
# from torchtext.experimental.functional import sequential_transforms, vocab_func, totensor
# from torchtext.experimental.datasets.text_classification import TextClassificationDataset

def get_data_from_csv(filename_train):
    """Read a CSV file and return its data rows as tuples of strings.

    The first row is treated as a header and skipped.

    Args:
        filename_train: Path of the CSV file to read.

    Returns:
        List with one tuple of column strings per data row; an empty list
        when the file is empty or contains only the header.
    """
    # newline='' lets the csv module handle newlines embedded in quoted
    # fields itself; utf-8 matches the default encoding of DataFrame.to_csv,
    # which wrote the file.
    with open(filename_train, mode='r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(csv_reader, None)
        return [tuple(line) for line in csv_reader]

class Tokenizer:
    """Wrap a torchtext tokenizer with optional lowercasing and truncation."""

    def __init__(self, tokenize_fn = 'basic_english', lower = True, max_length = None):
        """Store the options and resolve the underlying tokenizer.

        Args:
            tokenize_fn: Tokenizer spec handed to
                ``torchtext.data.utils.get_tokenizer``.
            lower: When truthy, every token is lowercased.
            max_length: Maximum number of tokens kept; ``None`` disables
                truncation.
        """
        self.tokenize_fn = torchtext.data.utils.get_tokenizer(tokenize_fn)
        self.max_length = max_length
        self.lower = lower

    def tokenize(self, s):
        """Split ``s`` into tokens, then apply lowercasing and truncation.

        Args:
            s: Text to tokenize.

        Returns:
            The tokens produced by the wrapped tokenizer, lowercased and cut
            to ``max_length`` as configured.
        """
        pieces = self.tokenize_fn(s)

        if self.lower:
            pieces = list(piece.lower() for piece in pieces)

        limit = self.max_length
        return pieces if limit is None else pieces[:limit]

def build_vocab_from_data(data, tokenizer, **vocab_kwarg):
    """Count token frequencies over ``data`` and build a torchtext vocabulary.

    Args:
        data: Iterable of ``(text, label)`` pairs; only the text is used.
        tokenizer: Object exposing ``tokenize(text)`` that returns an iterable
            of tokens.
        **vocab_kwarg: Extra keyword arguments forwarded to
            ``torchtext.vocab.Vocab``.

    Returns:
        The ``torchtext.vocab.Vocab`` built from the token counts.
    """
    frequencies = collections.Counter()

    for row in data:
        text, _label = row
        frequencies.update(tokenizer.tokenize(text))

    return torchtext.vocab.Vocab(frequencies, **vocab_kwarg)

In [None]:
def data_to_dataset(data, tokenizer, vocab):
    """Turn ``(text, label)`` rows into a torchtext text-classification dataset.

    Args:
        data: Iterable of ``(text, label)`` pairs, e.g. the output of
            ``get_data_from_csv`` (labels are then strings).
        tokenizer: Object exposing ``tokenize(text)``.
        vocab: Vocabulary used to map tokens to integer ids.

    Returns:
        A ``TextClassificationDataset`` whose text transform tokenizes, maps
        tokens to ids and converts to a long tensor, and whose label transform
        maps ``'1'``/``'0'`` to ``1``/``0`` before converting to a long tensor.
    """
    # The module-level imports of these helpers are commented out in the
    # notebook, so the names were undefined here. Importing them locally keeps
    # the function self-contained.
    # NOTE(review): the experimental namespace only exists in some torchtext
    # releases; confirm it is available in the installed version.
    from torchtext.experimental.functional import sequential_transforms, vocab_func, totensor
    from torchtext.experimental.datasets.text_classification import TextClassificationDataset

    # Materialise the rows; this also fails early if a row is not a pair.
    data = [(text, label) for (text, label) in data]

    text_transform = sequential_transforms(tokenizer.tokenize,
                                                  vocab_func(vocab),
                                                  totensor(dtype=torch.long)
                                          )
    # Labels read from CSV are strings; any other value is passed through
    # unchanged.
    label_transform = sequential_transforms(lambda x: 1 if x =='1' else (0 if x =='0' else x),
                                                  totensor(dtype=torch.long)
                                          )

    transforms = (text_transform, label_transform)

    dataset = TextClassificationDataset(data, vocab, transforms)

    return dataset

In [None]:
# get data
train_data = get_data_from_csv(filename_train)
print(train_data[:5])

In [None]:
# # tokenizer
# NOTE(review): this rebinds the module-level name `tokenizer`, which until now
# held the Hugging Face tokenizer used by text_tokenizer(). Re-running the
# legacy dataset cells after this cell would call the wrong object.
max_length=2500
tokenizer = Tokenizer(max_length=max_length)

# build vocab
# max_size is currently unused: the call that passes it is commented out.
max_size = 25000
# vocab = build_vocab_from_data(train_data, tokenizer, max_size = max_size)
vocab = build_vocab_from_data(train_data, tokenizer)
vocab

In [None]:
# convert data to dataset
train_dataset = data_to_dataset(train_data, tokenizer, vocab)

In [None]:
train_dataset

## Set up model

In [None]:
# Creating the customised model, by adding a dense layer on top of distilbert to get the final output for the model. 
class DistilBERTClassifier(torch.nn.Module):
    """DistilBERT encoder followed by a single linear classification head.

    The hidden state of the first token of each sequence is fed to a linear
    layer with ``num_classes`` outputs (``num_classes`` is read from module
    scope), and log-probabilities are returned, so the model pairs with
    ``nn.NLLLoss``.
    """

    def __init__(self):
        super(DistilBERTClassifier, self).__init__()
        self.distil_bert = AutoModel.from_pretrained("distilbert-base-uncased")
        # self.pre_classifier = torch.nn.Linear(768, 768)
        # self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, num_classes)

        # Weight initialisation. The bias starts at zero: the previous
        # normal_(bias, 0) call only set the mean and therefore drew the bias
        # from N(0, 1), far larger than the 0.02 std used for the weights.
        nn.init.normal_(self.classifier.weight, std=0.02)
        nn.init.zeros_(self.classifier.bias)

    def forward(self, input_ids, attention_mask=None):
        """Return per-class log-probabilities for a batch of token ids.

        Args:
            input_ids: Long tensor of token ids, one row per sequence.
            attention_mask: Optional mask, 1 for real tokens and 0 for
                padding. When omitted it is derived from the encoder's
                ``pad_token_id`` so padded positions are not attended to.

        Returns:
            Tensor of log-probabilities with one row per input sequence and
            one column per class.
        """
        if attention_mask is None:
            pad_id = self.distil_bert.config.pad_token_id
            if pad_id is not None:
                attention_mask = (input_ids != pad_id).long()
        output_1 = self.distil_bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        # Use the first position of every sequence as the pooled representation.
        pooler = hidden_state[:, 0]
        # pooler = self.pre_classifier(pooler)
        # pooler = torch.nn.Tanh()(pooler)
        # pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return F.log_softmax(output, dim=1) # output # F.log_softmax(output)

distil_classifier = DistilBERTClassifier()

## fine tuning

In [None]:
# Freeze every parameter of the model first.
distil_classifier.requires_grad_(False)

# Unfreeze the last transformer layer of the encoder
# (.transformer.layer[-1] for DistilBERT, .encoder.layer[-1] for BERT-base).
last_encoder_layer = distil_classifier.distil_bert.transformer.layer[-1]
last_encoder_layer.requires_grad_(True)

# Unfreeze the classification head.
classification_head = distil_classifier.classifier
classification_head.requires_grad_(True)

# Two parameter groups: a small learning rate for the pretrained layer and a
# larger one for the freshly initialised head.
optimizer = optim.Adam([
    {'params': last_encoder_layer.parameters(), 'lr': 5e-5},
    {'params': classification_head.parameters(), 'lr': 1e-4},
])

# The model outputs log-probabilities, hence negative log-likelihood loss.
loss_function = nn.NLLLoss()

## training

In [None]:
# Set up GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Pass model to GPU
distil_classifier.to(device)
# Summed batch loss of every epoch, kept for later inspection/plotting.
losses = []

# from_pretrained() returns the encoder in evaluation mode, so switch the whole
# model to training mode explicitly; otherwise dropout stays disabled while
# fine-tuning.
distil_classifier.train()

start = time.time()
# Set up epoch
for epoch in range(50):
    all_loss = 0
    for batch in train_iter:
        distil_classifier.zero_grad()
        # batch.text is a (token ids, lengths) pair; only the ids are needed.
        input_ids = batch.text[0].to(device)
        label_ids = batch.label.to(device)
        out = distil_classifier(input_ids)
        batch_loss = loss_function(out, label_ids)
        batch_loss.backward()
        optimizer.step()
        all_loss += batch_loss.item()
    losses.append(all_loss)
    print("epoch: ", epoch, "\t" , "loss: ", all_loss)

# Leave the model in evaluation mode so the prediction cells run without dropout.
distil_classifier.eval()

end = time.time()
print ("time : ", end - start)

## predict

In [None]:
def plot_confusion_matrix_heatmap(true_labels, predicted_labels, title):
    """Print summary metrics and draw the confusion matrix as a heatmap.

    Args:
        true_labels: Sequence of ground-truth labels.
        predicted_labels: Sequence of predicted labels, aligned with
            ``true_labels``.
        title: Title of the heatmap.

    Returns:
        None. Metrics are printed and the heatmap is drawn on the current
        matplotlib figure.
    """
    # Weighted-average metrics over all classes.
    print('Accuracy:', metrics.accuracy_score(true_labels, predicted_labels))
    print('Precision:', metrics.precision_score(true_labels, predicted_labels, average='weighted'))
    print('Recall:', metrics.recall_score(true_labels, predicted_labels, average='weighted'))
    print('F1 Score:', metrics.f1_score(true_labels, predicted_labels,average='weighted'))

    # Confusion matrix. Sort the labels so the axis order is stable, and
    # include labels that only occur in the predictions so no sample is
    # silently dropped from the matrix.
    labels = sorted(set(true_labels) | set(predicted_labels))
    cm = confusion_matrix(true_labels, predicted_labels, labels=labels)
    cm_labeled = pd.DataFrame(cm, columns=labels, index=labels)
    ax = sns.heatmap(cm_labeled, annot=True, cmap='Greens', fmt='g')
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set_xlabel('predicted label')
    ax.set_ylabel('true label')
    plt.title(title)
    return

def plot_roc_auc(true_labels, predicted_labels, title_name):
    """Print and plot the ROC curve and AUC for binary predictions.

    Args:
        true_labels: Sequence of ground-truth binary labels.
        predicted_labels: Sequence of predictions aligned with
            ``true_labels``. NOTE(review): the caller passes hard class labels;
            class scores would give a more informative curve.
        title_name: Suffix appended to the plot title.

    Returns:
        None. The figure is shown and then closed.
    """
    true_labels = np.array(true_labels)
    predicted_labels = np.array(predicted_labels)
    fpr, tpr, _ = roc_curve(true_labels,  predicted_labels)
    print ('roc_curve-fpr:', fpr)
    print ('roc_curve-tpr:', tpr)
    auc = roc_auc_score(true_labels, predicted_labels, average=None)
    print ('roc_auc_score-auc:', auc)
    fig = plt.figure(figsize=(6,4))
    plt.plot(fpr,tpr,label="auc="+str(auc))
    plt.legend(loc=4)
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    title_name = "roc and auc " + str(title_name)
    # Was plt.titile(...), which raised AttributeError, so the plot never rendered.
    plt.title(title_name)
    plt.show()
    plt.close(fig)
    return

def predict_texts_distilbert(batch_iter, title_name=None):
    """Run the classifier over an iterator and report evaluation metrics.

    Uses the module-level ``distil_classifier`` and ``device``.

    Args:
        batch_iter: Iterator yielding batches with ``text`` (token ids first)
            and ``label`` attributes.
        title_name: Label used in the plot titles (e.g. "train" or "test").

    Returns:
        None. A classification report is printed and the confusion matrix and
        ROC curve are plotted.
    """
    answer = []
    prediction = []
    # Make sure dropout is disabled regardless of what ran before.
    distil_classifier.eval()
    with torch.no_grad():
        for batch in batch_iter:

            text_tensor = batch.text[0].to(device)

            score = distil_classifier(text_tensor)
            pred = score.argmax(dim=1)

            prediction.extend(pred.cpu().tolist())
            answer.extend(batch.label.cpu().tolist())

    # print classification report; scikit-learn expects (y_true, y_pred), and
    # the previous swapped order exchanged precision and recall.
    print(classification_report(answer, prediction))
    print("predicted label: ", set(prediction))

    # plot confusion matrix
    plot_confusion_matrix_heatmap(answer, prediction, "confusion matrix {}".format(title_name))
    # The ROC plot is best effort (it fails, for example, when only one class
    # is present), but report the reason instead of hiding it.
    try:
        plot_roc_auc(answer, prediction, title_name)
    except Exception as err:
        print("ROC/AUC plot skipped:", err)
    return

In [None]:
predict_texts_distilbert(test_iter, "test")  

In [None]:
predict_texts_distilbert(train_iter, "train")  

# Knowledge distillation

# Unmount and flush google drive

In [None]:
# from google.colab import drive
# drive.flush_and_unmount()