In [1]:
# 구글 번역 패키지
!pip install -q googletrans==4.0.0-rc1

In [2]:
import gc 
import os
import random
import transformers
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K

from googletrans import Translator
from pathlib import Path
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, TFAutoModel

print(f"TensorFlow version: {tf.__version__}")
print(f"Transformers version: {transformers.__version__}")
# 노트북 상 버전은 2.2.0,, 3.0.2

warnings.filterwarnings("ignore")

TensorFlow version: 2.11.0
Transformers version: 3.0.2


# Configuration

- configuration class is setup to define as many levers required for experiments as possible. Following things are changable by changing only one line of code.
    - different huggingface models with Tensorflow
    - different hyperparameter spaces for models
    - different seeds, splits, accelerators
    - different learning rate schedulers (WIP)

In [13]:
class Configuration():
    
    def __init__(
        self,
        model_name,
        translation=True,
        max_length=64,
        padding=True,
        batch_size=128,
        epochs=5,
        learning_rate=1e-5,
        metrics=["sparse_categorical_accuracy"],
        verbose=1,
        train_splits=5,
        accelerator="TPU",
        myluckynumber=13
    ):
        self.SEED = myluckynumber
        self.ACCELERATOR = accelerator
        
        # from pathlib import Path
        self.PATH_TRAIN = Path("./train.csv")
        self.PATH_TEST = Path("./test.csv")
        
        self.TRAIN_SPLITS = train_splits
        
        self.LANGUAGE_MAP = {
            "English":0,
            "Chinese":1,
            "Arabic":2,
            "French":3,
            "Swahili":4,
            "Urdu":5,
            "Vietnamese":6,
            "Russian":7,
            "Hindi":8,
            "Greek":9,
            "Thai":10,
            "Spanish":11,
            "German":12,
            "Turkish":13,
            "Bulgarian":14
        }
        
        self.INVERSE_LANGUAGE_MAP = {v:k for k,v in self.LANGUAGE_MAP.items()}
        
        # model configuration
        self.MODEL_NAME = model_name
        self.TRANSLATION = translation
        # from transformers import AutoTokenizer
        self.TOKENIZER = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        
        # model hyperparameters
        self.MAX_LENGTH = max_length
        self.PAD_TO_MAX_LENGTH = padding
        self.BATCH_SIZE = batch_size
        self.EPOCHS = epochs
        self.LEARNING_RATE = learning_rate
        self.METRICS = metrics
        self.VERBOSE = verbose
        
        # initializing accelerator
        self.initialize_accelerator()
        
    def initialize_accelerator(self):
        
        # checking TPU first
        if self.ACCELERATOR == "TPU":
            print("Connecting to TPU")
            try:
                tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
                print(f"Running on TPU {tpu.master()}")
            except ValueError:
                print("Could not connect to TPU")
                tpu = None
                
            if tpu:
                try:
                    print("Initializing TPU")
                    tf.config.experimental_connect_to_cluster(tpu)
                    tf.tpu.experimental.initialize_tpu_system(tpu)
                    self.strategy = tf.distribute.experimental.TPUStrategy(tpu)
                    self.tpu = tpu
                    print("TPU initialized")
                except _:
                    print("Failed to initialize TPU")
            else:
                print("Unable to initialize TPU")
                self.ACCELERATOR = "GPU"
        
        # default for CPU and GPU
        if self.ACCELERATOR != 'TPU':
            print("Using default strategy for CPU and single GPU")
            self.strategy = tf.distribute.get_strategy()
            
        # checking GPUs
        if self.ACCELERATOR == "GPU":
            print(f"GPUs Available: {len(tf.config.experimental.list_physical_devices('GPU'))}")
            
        # defining replicas(복제품)
        self.AUTO = tf.data.experimental.AUTOTUNE
        self.REPLICAS = self.strategy.num_replicas_in_sync
        print(f"REPLICAS: {self.REPLICAS}")

# Data Preparation
- Google Translator 이용 전부 영어로 바꾸기 
- tf.data.Dataset으로 데이터 변환

In [9]:
def translate_text_to_english(text):
    translator = Translator()
    return translator.translate(text, dest='en').text

def encode_text(df, tokenizer, max_len, padding):
    text = df[['premise','hypothesis']].values.tolist()
    
    text_encoded = tokenizer.batch_encode_plus(
        text,
        pad_to_max_length = padding,
        max_length = max_len)
    
    return text_encoded

def get_tf_dataset(x,y,auto,labeled=True,repeat=False,shuffle=False,batch_size=128):
    """ creating tf.data.Dataset for TPU """
    
    if labeled:
        ds = (tf.data.Dataset.from_tensor_slices((x['input_ids'],y)))
    else:
        ds = (tf.data.Dataset.from_tensor_slices(x['input_ids']))
        
    if repeat:
        ds = ds.repeat()
        
    if shuffle:
        ds = ds.shuffle(2048)
        
    ds = ds.batch(batch_size)
    ds = ds.prefetch(auto)
    
    return ds

# Deep Learning model architecture

In [10]:
def build_model(model_name, max_len, learning_rate, metrics):
    input_ids = Input(shape=(max_len,),dtype=tf.int32,name='input_ids')
    
    transformer_model = TFAutoModel.from_pretrained(model_name)
    transformer_embeddings = transformer_model(input_ids)[0]
    
    output_values = Dense(3, activation='softmax')(transformer_embeddings[:,0,:])
    
    # 모델 정의
    model = Model(inputs=input_ids, outputs=output_values)
    opt = Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics=metrics
    
    model.compile(optimizer=opt, loss=loss, metrics=metrics)
    
    return model

# stratified k-fold modelling

In [19]:
def run_model(config):
    df_train = pd.read_csv(config.PATH_TRAIN)
    df_test = pd.read_csv(config.PATH_TEST)
    
    if config.TRANSLATION:
        df_train.loc[df_train.language != 'English','premise'] =\
        df_train[df_train.language != 'English'].premise.apply\
        (lambda x:translate_text_to_english(x))
        
        df_test.loc[df_test.language != 'English','premise'] =\
        df_test[df_test.language != 'English'].premise.apply\
        (lambda x:translate_text_to_english(x))
        
        df_train.loc[df_train.language != 'English','hypothesis'] =\
        df_train[df_train.language != 'English'].hypothesis.apply\
        (lambda x:translate_text_to_english(x))
        
        df_test.loc[df_test.language != 'English','hypothesis'] =\
        df_test[df_test.language != 'English'].hypothesis.apply\
        (lambda x:translate_text_to_english(x))
        
    # adding column for stratified splitting
    df_train['language_label'] = df_train.language.astype(str)+'_'+df_train.label.astype(str)

    # stratified k-fold on language and label
    skf = StratifiedKFold(n_splits=config.TRAIN_SPLITS, shuffle=True, random_state=config.SEED)

    # initializing predictions
    preds_oof = np.zeros((df_train.shape[0], 3))
    preds_test = np.zeros((df_test.shape[0], 3))        
    acc_oof = []

    # iterating over folds
    for (fold, (train_index,valid_index)) in enumerate(skf.split(df_train,df_train.language_label)):

        # initializing TPU
        if config.ACCELERATOR == 'TPU':
            if config.tpu:
                config.initialize_acceletor()

        # building model
        # from tensorflow.keras.backend import K
        K.clear_session()
        with config.strategy.scope():
            model = build_model(config.MODEL_NAME, 
                                config.MAX_LENGTH, 
                                config.LEARNING_RATE, 
                                config.METRICS)
            if fold == 0:
                print(model.summary())

        print('\n')
        print('#'*19)
        print(f"##### Fold: {fold + 1} #####")
        print('#'*19)

        x_train = df_train.iloc[train_index]
        x_valid = df_train.iloc[valid_index]

        y_train = x_train.label.values
        y_valid = x_valid.label.values

        print("\nTokenizing")

        x_train_encoded = encode_text(df=x_train, 
                                      tokenizer=config.TOKENIZER,
                                      max_len=config.MAX_LENGTH,
                                      padding=config.PAD_TO_MAX_LENGTH)

        x_valid_encoded = encode_text(df=x_valid, 
                                      tokenizer=config.TOKENIZER,
                                      max_len=config.MAX_LENGTH,
                                      padding=config.PAD_TO_MAX_LENGTH)

        ds_train = get_tf_dataset(x_train_encoded, y_train, config.AUTO,
                                 repeat=True,shuffle=True,
                                  batch_size=config.BATCH_SIZE*config.REPLICAS)
        ds_valid = get_tf_dataset(x_valid_encoded, y_valid, config.AUTO,
                                 batch_size=config.BATCH_SIZE*config.REPLICAS * 4)

        n_train = x_train.shape[0]

        if fold == 0:
            x_test_encoded = encode_text(df=df_test,
                                        tokenizer=config.TOKENIZER,
                                        max_len=config.MAX_LENGTH,
                                        padding=config.PAD_TO_MAX_LENGTH)

            sv = tf.keras.callbacks.ModelCheckpoint(
                "model.h5",
                monitor="val_sparse_categorical_accuracy",
                verbose=0,
                save_best_only=True,
                save_weights_only=True,
                mode="max",
                save_freq="epoch")

            print("\nTraining")

            model_history = model.fit(
                ds_train,
                epochs=config.EPOCHS,
                callbacks=[sv],
                steps_per_epoch=n_train/config.BATCH_SIZE//config.REPLICAS,
                validation_data=ds_valid,
                verbose=config.VERBOSE)

            print("\nValidating")

            # scoring validation data
            model.load_weights("model.h5")
            ds_valid = get_tf_dataset(x_valid_encoded,-1,config.AUTO,
                                     labeled=False,
                                      batch_size=config.BATCH_SIZE*config.REPLICAS*4)

            preds_valid = model.predict(ds_valid, verbose=config.VERBOSE)
            acc = accuracy_score(y_valid, np.argmax(preds_valid, axis=1))

            preds_oof[valid_index] = preds_valid
            acc_oof.append(acc)

            print("\nInferencing")

            # scoring test data
            ds_test = get_tf_dataset(x_test_encoded,-1,config.AUTO,
                                    labeled=False,
                                    batch_size=config.BATCH_SIZE*config.REPLICAS*4)

            preds_test += model.predict(ds_test, verbose=config.VERBOSE)/config.TRAIN_SPLITS

            print(f"\nFold {fold+1} Accuracy: {round(acc,4)}\n")

            g = gc.collect()

    # overall CV score and standard deviation
    print(f"\nCV Mean Accuracy: {round(np.mean(acc_oof), 4)}")
    print(f"CV StdDev Accuracy: {round(np.std(acc_oof), 4)}\n")

    return preds_oof, preds_test

# Experimenting with different models

In [20]:
# Final Model: XLM Roberta Large
config_1 = Configuration("jplu/tf-xlm-roberta-large",
                        translation=False,
                        max_length=84,
                        batch_size=64,
                        epochs=16,
                        train_splits=4)
preds_train_1, preds_test_1 = run_model(config_1)

Connecting to TPU
Could not connect to TPU
Unable to initialize TPU
Using default strategy for CPU and single GPU
GPUs Available: 0
REPLICAS: 1


Downloading:   0%|          | 0.00/3.27G [00:00<?, ?B/s]

Some weights of the model checkpoint at jplu/tf-xlm-roberta-large were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the model checkpoint at jplu/tf-xlm-roberta-large.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids (InputLayer)      [(None, 84)]              0         
                                                                 
 tf_roberta_model (TFRoberta  ((None, 84, 1024),       559890432 
 Model)                       (None, 1024))                      
                                                                 
 tf.__operators__.getitem (S  (None, 1024)             0         
 licingOpLambda)                                                 
                                                                 
 dense (Dense)               (None, 3)                 3075      
                                                                 
Total params: 559,893,507
Trainable params: 559,893,507
Non-trainable params: 0
_________________________________________________________________


Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


None


###################
##### Fold: 1 #####
###################

Tokenizing


Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



Training
Epoch 1/16
  1/142 [..............................] - ETA: 13:51:15 - loss: 1.2114 - sparse_categorical_accuracy: 0.2656

KeyboardInterrupt: 

- 한 epoch당 13:51:15 걸림... ㅋㅋㅋ