Improving a Small Model with the Help of a Large Model.

In [1]:
import numpy as np
import pandas as pd
import re
import os
import tensorflow as tf
import plotly.express as px
import matplotlib.pyplot as plt

from datasets import load_metric, Dataset, DatasetDict
from torch.utils.data import DataLoader

import ml_collections
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer,TFAutoModel, get_linear_schedule_with_warmup, set_seed

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


  from .autonotebook import tqdm as notebook_tqdm


### Load Datasets

In [3]:
# Load the labelled financial-sentence dataset, then display how many rows fall in each
# value of the "Sentiment" column.
financial_news = pd.read_csv("data/fiqa_phrasebank.csv")
financial_news["Sentiment"].value_counts()

neutral     3130
positive    1852
negative     860
Name: Sentiment, dtype: int64

In [4]:
# Encode the string sentiment labels as integers.
dicto = {'positive': 1, 'neutral': 0 , 'negative': -1}
# Series.map turns every value that is missing from the mapping into NaN, so re-running
# this cell on the already-encoded column would wipe all labels. Only map while the
# column still holds the raw (non-numeric) labels, which keeps the cell idempotent.
if not pd.api.types.is_numeric_dtype(financial_news["Sentiment"]):
    financial_news["Sentiment"] = financial_news["Sentiment"].map(dicto)
financial_news.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,1
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",-1
2,"For the last quarter of 2010 , Componenta 's n...",1
3,According to the Finnish-Russian Chamber of Co...,0
4,The Swedish buyout firm has sold its remaining...,0


In [7]:
# Row labels of every sentence whose encoded sentiment equals 1.
financial_news.loc[financial_news["Sentiment"].eq(1)].index

Int64Index([   0,    2,    5,    8,   10,   12,   15,   16,   21,   25,
            ...
            5810, 5814, 5815, 5816, 5818, 5819, 5824, 5825, 5836, 5841],
           dtype='int64', length=1852)

In [8]:
from datetime import datetime

# Capture the current local date/time and express it as a POSIX timestamp
# (float seconds), then print it.
now = datetime.now()
timestamp = now.timestamp()
print("timestamp =", timestamp)

timestamp = 1658884281.666415


In [11]:
# Spot-check the encoded sentiment of the row labelled 1.
financial_news.at[1, "Sentiment"]

-1

In [2]:
# Load the English news headline table, print its (rows, columns) shape and preview the
# first rows.
english_news = pd.read_csv("data/english_news.csv")
print(english_news.shape)
english_news.head()

(1997901, 7)


Unnamed: 0,title,published_date,source,section,language,published_date_clean,year
0,Crossing the border for greater opportunities,1970-01-01T08:00:00+08:00,South China Morning Post,business,english,1970-01-01,1970
1,Getting rid of bad blood,2008-09-12T16:00:00+00:00,The National,world,english,2008-09-12,2008
2,Tram a better alternative for Penang,2009-01-16T12:03:58+08:00,The Edge Markets,business,english,2009-01-16,2009
3,The new face of Australian wealth,2009-01-20T18:30:00+08:00,The Edge Markets,business,english,2009-01-20,2009
4,Fraud leads to cut in dental insurance,2009-02-14T16:00:00+00:00,The National,world,english,2009-02-14,2009


#### Configurations

In [37]:
def clean_dataset(text):
    """Normalise one piece of raw text.

    Steps, applied in order:
      1. lower-case the text,
      2. remove URLs (``http://``, ``https://`` and ``www.`` forms),
      3. remove HTML-like ``<...>`` tags,
      4. replace tokens that start with the character U+0089 by a single space,
      5. remove every word that contains a digit,
      6. remove all remaining punctuation.

    Whitespace is not collapsed, so removed tokens can leave double or
    leading/trailing spaces behind.

    Params:
        text (str): the raw text. Any non-string value (for example ``None`` or the
            float NaN that pandas uses for a missing cell) is treated as empty.

    Returns:
        str : the cleaned text, or ``""`` for non-string input.
    """
    # Missing cells arrive as None/NaN and have no .lower(); treat them as empty text
    # instead of raising AttributeError halfway through a map over a column.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\x89\S+', ' ', text)  # Removes string starting from \x89
    # Raw string: the earlier non-raw '\w*\d\w*' relied on invalid escape sequences.
    text = re.sub(r'\w*\d\w*', '', text)  # Removes words containing numbers
    text = re.sub(r'[^\w\s]', '', text)   # Removes punctuation
    return text

class config:
    """Settings for the TensorFlow BERT experiment in this notebook."""
    PATH = "../input/nlp-getting-started/"  # not referenced by the cells visible here
    MAX_LEN = 36             # tokenizer max_length and the Keras input width
    LOWER_CASE = True        # forwarded to the tokenizer as do_lower_case
    RANDOM_STATE = 12        # seed passed to train_test_split
    TEST_SIZE = 0.2          # fraction of rows held out as the test set
    VALIDATION_SIZE = 0.1
    NUM_LABELS = 1           # number of units in the model's output layer
    BATCH_SIZE = 128
    LEARNING_RATE = 5e-5     # Adam learning rate
    EPOCHS = 10              # number of epochs passed to fit()
    WEIGTH_DECAY = 0.01      # misspelled name kept because existing cells read it
    WEIGHT_DECAY = WEIGTH_DECAY  # correctly spelled alias for new code
    DEVICE = "cuda"


In [44]:
def create_dataset(dataframe, dictionary=False, test_size=None, random_state=None, stratify_col=None):
    """Split a pandas DataFrame into a training part and a test part.

    Params:
        dataframe (Pandas DataFrame) : the rows to split.
        dictionary (bool) : when True, wrap both parts in a ``DatasetDict``;
            when False (default), return the two DataFrames directly.
        test_size (float | None) : fraction of rows placed in the test part.
            ``None`` (default) falls back to ``config.TEST_SIZE``.
        random_state (int | None) : seed for the shuffle. ``None`` (default)
            falls back to ``config.RANDOM_STATE``.
        stratify_col (str | None) : name of a column whose class proportions should
            be preserved in both parts. ``None`` (default) gives a plain random split.

    Returns:
        DatasetDict with the keys "train" and "test" when ``dictionary`` is True,
        otherwise the tuple ``(training_df, test_df)``. No validation part is created.

    """
    training_df, test_df = train_test_split(
        dataframe,
        test_size=config.TEST_SIZE if test_size is None else test_size,
        random_state=config.RANDOM_STATE if random_state is None else random_state,
        stratify=None if stratify_col is None else dataframe[stratify_col],
    )
    if not dictionary:
        return training_df, test_df

    return DatasetDict(
        {
            "train": Dataset.from_pandas(training_df),
            "test": Dataset.from_pandas(test_df),
        }
    )

# Default call: plain DataFrames, split according to the values in `config`.
train, test = create_dataset(financial_news)

In [45]:
MODEL_1 = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_1 , do_lower_case = config.LOWER_CASE , max_length = config.MAX_LEN )


def encode_sentences(sentences):
    """Tokenize a list of sentences into fixed-width TensorFlow tensors.

    Params:
        sentences (list of str) : the raw sentences to encode.

    Returns:
        The tokenizer output holding "input_ids" and "attention_mask", each padded or
        truncated to exactly ``config.MAX_LEN`` tokens per sentence.
    """
    return tokenizer(
        text = sentences,
        add_special_tokens = True,
        max_length = config.MAX_LEN,
        truncation = True,
        # "max_length" (rather than True, which pads only to the longest sentence in
        # the call) guarantees the width matches the model's (MAX_LEN,) input layers.
        padding = "max_length",
        return_tensors = "tf",
        return_token_type_ids = False,
        return_attention_mask = True,
        verbose = True,
    )


# Encode both splits with identical settings.
x_train = encode_sentences(train["Sentence"].to_list())
x_test = encode_sentences(test["Sentence"].to_list())

### Model Building

In [46]:
# Load the pre-trained encoder named by MODEL_1 as a TensorFlow model.
bert_based_uncased = TFAutoModel.from_pretrained(MODEL_1)
# Two integer inputs of width config.MAX_LEN; the layer names match the dict keys that
# are passed to model_1.fit.
input_ids = tf.keras.layers.Input(shape = (config.MAX_LEN,) , dtype = tf.int32 , name = "input_ids")
input_mask = tf.keras.layers.Input(shape = (config.MAX_LEN,) , dtype = tf.int32 , name = "attention_mask")
# Element 1 of the encoder output is used as the sentence representation
# (presumably the pooled [CLS] vector -- TODO confirm against the Transformers docs).
embeddings = bert_based_uncased(input_ids , attention_mask = input_mask)[1]
# Classification head: dropout -> 128-unit ReLU -> dropout -> 32-unit ReLU -> output layer.
x = tf.keras.layers.Dropout(0.3)(embeddings)
x = tf.keras.layers.Dense(128 , activation = "relu")(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(32 , activation = "relu")(x)
# NOTE(review): the output layer has config.NUM_LABELS sigmoid units (1 in the visible
# config) while the Sentiment column is encoded as three classes (-1, 0, 1); verify
# that this head and the loss used at compile time match the task.
output = tf.keras.layers.Dense(config.NUM_LABELS , activation = "sigmoid")(x)

# Assemble the end-to-end Keras model from the two inputs to the sigmoid output.
model_1 = tf.keras.Model(inputs = [input_ids , input_mask] , outputs = output)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


### Callback

In [47]:
# Path prefix under which the best weights are written (and later reloaded).
checkpoint_filepath_bert_base_uncased  = "./weights/bert_base_uncased_weights"
# os.path.isdir returns a bool, never None, so the previous `... is None` guard could
# never create anything. Create the directory that will hold the checkpoint files;
# exist_ok=True makes the cell safe to re-run.
os.makedirs(os.path.dirname(checkpoint_filepath_bert_base_uncased), exist_ok=True)
# Keep only the weights of the epoch with the best validation accuracy.
checkpoint_callback_bert_base_uncased = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath_bert_base_uncased,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='auto',
    save_best_only=True)

### Compile

In [48]:
# The output layer of model_1 already applies a sigmoid, so the loss must receive
# probabilities (from_logits=False); from_logits=True would apply the sigmoid twice.
# `learning_rate` replaces the deprecated `lr` keyword.
# NOTE(review): `decay` on Keras optimizers is, as far as I can tell, a learning-rate
# decay and not L2 weight decay, although it is fed from config.WEIGTH_DECAY -- confirm
# which one is intended.
# NOTE(review): binary cross-entropy expects 0/1 targets, but the Sentiment column is
# encoded as -1/0/1 -- verify the label encoding before trusting the metrics.
model_1.compile(
    loss = tf.keras.losses.BinaryCrossentropy(from_logits = False),
    optimizer = tf.keras.optimizers.Adam(
        learning_rate = config.LEARNING_RATE,
        epsilon = 1e-8,
        decay = config.WEIGTH_DECAY,
        clipnorm = 1.0,
    ),
    metrics = ["accuracy"],
)

  super(Adam, self).__init__(name, **kwargs)


### Training

In [49]:
# Train model_1 on the tokenized training sentences; 20% of them are held out by Keras
# for validation, and the checkpoint callback runs during training.
# NOTE(review): batch size (256) and validation split (0.2) are literals here rather
# than config.BATCH_SIZE / config.VALIDATION_SIZE.
train_inputs = {
    "input_ids": x_train["input_ids"],
    "attention_mask": x_train["attention_mask"],
}
bert_based_uncased_history = model_1.fit(
    x = train_inputs,
    y = train["Sentiment"],
    epochs = config.EPOCHS,
    validation_split = 0.2,
    batch_size = 256,
    callbacks = [checkpoint_callback_bert_base_uncased],
)

Epoch 1/10


  return dispatch_target(*args, **kwargs)


In [None]:
# Reload the weights stored at the checkpoint path into model_1.
model_1.load_weights(checkpoint_filepath_bert_base_uncased)

In [None]:
# Put the per-epoch metrics recorded by fit() into a DataFrame with one column per
# metric, ready for plotting.
bert_based_uncased_hist_df = pd.DataFrame(bert_based_uncased_history.history , columns = ['loss', 'accuracy', 'val_loss', 'val_accuracy'])

In [None]:
# Line chart of training vs. validation accuracy, one point per row of the history frame.
fig = px.line(
    bert_based_uncased_hist_df,
    y=["accuracy", "val_accuracy"],
    title="Accuracy",
)
fig.update_xaxes(title="Epochs")
fig.update_yaxes(title="Accuracy")

# Centred title near the top of the figure; this replaces the title given to px.line.
centered_title = {
    'text': "Bert Base uncased Accuracy",
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig.update_layout(showlegend=True, title=centered_title)
fig.show()

In [None]:
def model_config():
    """Return the experiment settings as an immutable ``FrozenConfigDict``.

    Returns:
        ml_collections.FrozenConfigDict : paths, split sizes, batch sizes, optimiser
        settings and model checkpoint name for the transformer experiment.
    """
    return ml_collections.FrozenConfigDict(
        {
            # Paths and model family
            "data_path": "data/fiqa_phrasebank.csv",
            "model_path": "/kaggle/working/bert_model.h5",
            "model_type": "transformer",
            # Split sizes and batch sizes
            "test_size": 0.1,
            "validation_size": 0.2,
            "train_batch_size": 32,
            "eval_batch_size": 32,
            # Optimisation
            "epochs": 5,
            "adam_epsilon": 1e-8,
            "lr": 3e-5,
            "num_warmup_steps": 10,
            # Tokenisation, seeding and model choice
            "max_length": 128,
            "random_seed": 42,
            "num_labels": 3,
            "model_checkpoint": "roberta-base",
        }
    )

# Module-level settings object built once from model_config().
cfg = model_config()

def clean_text(df,field):
    """Clean one text column of ``df`` in place and return the same DataFrame.

    The substitutions run in order: URLs and any leftover "http" become a space,
    "@" becomes "at", hashtags become a space, every character outside the allowed
    set (ASCII letters, a few punctuation marks, underscore, newline) becomes a
    space, and finally the text is lower-cased.

    Params:
        df (Pandas DataFrame) : frame holding the column; it is modified in place.
        field (str) : name of the string column to clean.

    Returns:
        Pandas DataFrame : the same ``df`` object, with ``df[field]`` cleaned.
    """
    substitutions = (
        (r"http\S+", " "),  # Removes Websites
        (r"http", " "),     # Removes any leftover "http"
        (r"@", "at"),
        (r"#[A-Za-z0-9_]+", " "),
        (r"[^A-Za-z(),!?@\'\"_\n]", " "),
    )
    cleaned = df[field]
    for pattern, replacement in substitutions:
        # regex=True must be explicit: since pandas 2.0 the default is a literal
        # match, which would leave these patterns without any effect.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    df[field] = cleaned.str.lower()
    return df

def preprocess_csv(csv_file: str) -> pd.DataFrame:
    """Read a sentiment CSV, attach integer labels, drop repeated sentences and clean the text.

    Params:
        csv_file (str) : path of a CSV file with "Sentence" and "Sentiment" columns.

    Returns:
        Pandas DataFrame : the loaded rows with an integer "labels" column (encoded
        from "Sentiment"), only the first occurrence of each sentence, and the
        "Sentence" column passed through ``clean_text``.
    """
    frame = pd.read_csv(csv_file)

    # Integer-encode the sentiment before any rows are removed.
    encoder = LabelEncoder()
    frame["label_enc"] = encoder.fit_transform(frame["Sentiment"])

    # Apply the two column renames one after the other, in the original order.
    for old_name, new_name in (("label", "label_desc"), ("label_enc", "labels")):
        frame.rename(columns={old_name: new_name}, inplace=True)

    # Keep only the first row for each distinct sentence.
    frame.drop_duplicates(subset=['Sentence'], keep='first', inplace=True)

    return clean_text(frame, "Sentence")




In [10]:
# Take the first 100 news rows as a small sample and clean their titles.
small_df = english_news.iloc[:100]
headlines = [clean_dataset(title) for title in small_df["title"].to_list()]

# NOTE(review): "bert-base-uncased" appears to carry no fine-tuned sentiment head -- the
# recorded load log reports unused checkpoint weights and the sample prediction is
# LABEL_0 at roughly 0.54 -- so treat these scores as untrained; confirm the intended model.
# `headlines` is not passed to the classifier in this cell.
classifier = pipeline("sentiment-analysis", model="bert-base-uncased")
classifier("I love my day")

Downloading: 100%|██████████| 420M/420M [03:21<00:00, 2.18MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenc

[{'label': 'LABEL_0', 'score': 0.5358694195747375}]

In [None]:
# Second sentiment-analysis pipeline, this time backed by the "distilbert-base-uncased" checkpoint.
distilbert = pipeline("sentiment-analysis", model = "distilbert-base-uncased")
