In [13]:
# get data
import pandas as pd

df = pd.read_csv('../testdata/data_combined_24.csv', sep=',', names=["Category", "Sentence"])

# print number of different categories
print(len(df['Category'].unique()))
df.head()

23


Unnamed: 0,Category,Sentence
0,culture,indigenous woman chile teaches aymara ancestry...
1,culture,arabia difference etiquette wearing slippers l...
2,culture,slow look around patient means embracing thing...
3,culture,culturescountries outside usa western europe v...
4,culture,fake government birds heard right couldnt beli...


In [14]:
# create RoBERTa model
import gc
import numpy as np

from sklearn.model_selection import StratifiedKFold
import tensorflow as tf

#find import for TFSequenceClassifierOutput


from transformers import RobertaTokenizer, RobertaConfig, TFRobertaPreTrainedModel
from transformers.models.roberta import TFRobertaMainLayer
from transformers.modeling_tf_utils import get_initializer
from transformers.modeling_tf_outputs import TFSequenceClassifierOutput

MODEL_NAME = 'roberta-base'
MAX_LEN = 128
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

#split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Sentence, df.Category, test_size=0.2, random_state=42, stratify=df.Category)

# create df combining X and y
train_df = pd.DataFrame({'text': X_train, 'target': y_train})
test_df = pd.DataFrame({'text': X_test, 'target': y_test})

train_df.head()





Unnamed: 0,text,target
28277,nutrisystem diet the dietary element of the we...,foods
30499,instead it concentrates on wider regions and z...,nature
29665,the destabilization of mercurys orbit is unlik...,disasters
30587,this book is an all encompassing guide and enc...,nature
34541,each of them so far as it is exchange value mu...,economics


In [15]:
#map labels to numbers
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df['target'] = le.fit_transform(train_df['target'])
test_df['target'] = le.transform(test_df['target'])


train_df.head()

Unnamed: 0,text,target
28277,nutrisystem diet the dietary element of the we...,7
30499,instead it concentrates on wider regions and z...,14
29665,the destabilization of mercurys orbit is unlik...,3
30587,this book is an all encompassing guide and enc...,14
34541,each of them so far as it is exchange value mu...,4


In [16]:
def to_tokens(input_text, tokenizer):
    output = tokenizer.encode_plus(input_text, max_length=MAX_LEN, pad_to_max_length=True)
    return output

def select_field(features, field):
    return [feature[field] for feature in features]

In [17]:
tokenizer_output_train = train_df["text"].apply(lambda x: to_tokens(x, tokenizer))
tokenizer_output_test = test_df["text"].apply(lambda x: to_tokens(x, tokenizer))

tokenizer_output_train.head()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


28277    [input_ids, attention_mask]
30499    [input_ids, attention_mask]
29665    [input_ids, attention_mask]
30587    [input_ids, attention_mask]
34541    [input_ids, attention_mask]
Name: text, dtype: object

In [18]:
input_ids_train = np.array(select_field(tokenizer_output_train, 'input_ids'))
attention_masks_train = np.array(select_field(tokenizer_output_train, 'attention_mask'))

input_ids_test = np.array(select_field(tokenizer_output_test, 'input_ids'))
attention_masks_test = np.array(select_field(tokenizer_output_test, 'attention_mask'))

In [19]:
# create model
class TFRobertaForClassification(TFRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = 23
        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.dropout_1 = tf.keras.layers.Dropout(0.3)
        self.classifier = tf.keras.layers.Dense(units=23,
                                                name='classifier', 
                                                kernel_initializer=get_initializer(
                                                    config.initializer_range))

    def call(self, inputs, **kwargs):
        outputs = self.roberta(inputs, **kwargs)
        pooled_output = outputs[1]
        pooled_output = self.dropout_1(pooled_output, training=kwargs.get('training', False))
        logits = self.classifier(pooled_output)
        print(inputs)
        print("logits", logits)

        return TFSequenceClassifierOutput(
            loss=None,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            )


In [20]:
def init_model(model_name):
    config = RobertaConfig.from_pretrained(model_name, num_labels=23)
    model = TFRobertaForSequenceClassification.from_pretrained(model_name)
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.BinaryAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    return model

In [21]:
BATCH_SIZE = 16
EPOCHS = 10
SPLITS = 5

y_train = tf.keras.utils.to_categorical(train_df['target'].values.reshape(-1, 1))
X_train = input_ids_train
X_test = input_ids_test
attention_masks_train = attention_masks_train
attention_masks_test = attention_masks_test
X_train = X_train[:-divmod(X_train.shape[0], BATCH_SIZE)[1]]
X_test = X_test[:-divmod(X_test.shape[0], BATCH_SIZE)[1]]
attention_masks_train = attention_masks_train[:-divmod(attention_masks_train.shape[0], 
                                                                                BATCH_SIZE)[1]]
attention_masks_test = attention_masks_test[:-divmod(attention_masks_test.shape[0],
                                                                                BATCH_SIZE)[1]]
y_train = y_train[:-divmod(y_train.shape[0], BATCH_SIZE)[1]]
model = init_model(MODEL_NAME)
print(model.summary())

model.fit([X_train, attention_masks_train], y_train, 
            validation_data=([X_test, attention_masks_test], y_test), 
            batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)

{'input_ids': <tf.Tensor: shape=(3, 5), dtype=int32, numpy=
array([[7, 6, 0, 0, 1],
       [1, 2, 3, 0, 0],
       [0, 0, 0, 4, 5]])>}
logits tf.Tensor(
[[ 0.11216726  0.17404541  0.12736635 -0.02573039  0.12415379 -0.01001346
  -0.06746997 -0.10868831 -0.24516112 -0.03948938  0.20892715  0.03761097
  -0.04002213 -0.1500029  -0.01764267 -0.12741596 -0.13877454  0.09873061
   0.2958059  -0.23319921  0.00972558 -0.225214   -0.16465235]
 [-0.08024059  0.22197245 -0.14469188 -0.11211964 -0.00801754 -0.02907216
   0.31553036 -0.13300958  0.04044122 -0.1305967   0.07822177  0.08970253
   0.02196234 -0.15237242 -0.20818788 -0.09217951 -0.15979713 -0.0393444
   0.3526363  -0.09553572 -0.2048147  -0.04165284  0.17857791]
 [-0.09072796  0.20122558  0.1426919  -0.04325259  0.06961422 -0.19549239
  -0.05995761  0.04561515 -0.04731005 -0.15139711  0.22724797 -0.13815494
   0.00177361 -0.17150554 -0.29828155 -0.06034704 -0.06978099 -0.19391045
   0.358995   -0.35331345 -0.03248119 -0.34755304 -0.273

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier', 'dropout_75']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': <tf.Tensor: shape=(3, 5), dtype=int32, numpy=
array([[7, 6, 0, 0, 1],
       [1, 2, 3, 0, 0],
       [0, 0, 0, 4, 5]])>}
logits tf.Tensor(
[[ 0.01412885 -0.01958289  0.07729395  0.00869893  0.03801934  0.09686603
  -0.10335608  0.20350885 -0.04696433 -0.07946361  0.00821413  0.02782001
   0.07502268 -0.25246012  0.02426278 -0.03377723 -0.10900517  0.13849074
   0.01104015 -0.1218091  -0.04661133  0.03071953 -0.07071106]
 [ 0.02101029 -0.03928399  0.07305083  0.04612376 -0.00590836  0.10940999
  -0.12516652  0.16716588 -0.06738425 -0.10669763  0.00680586  0.00873994
   0.07433936 -0.2634721  -0.00730383 -0.0264577  -0.13003835  0.09280788
  -0.02674613 -0.08593298 -0.0432133   0.01598699 -0.10255025]
 [ 0.01483655 -0.01953458  0.08210661  0.00761834  0.04216928  0.09128512
  -0.09287868  0.19708171 -0.0501448  -0.09005059 -0.00193902  0.03211249
   0.0678192  -0.2462905   0.02857155 -0.03188385 -0.09969752  0.1361781
   0.01404328 -0.12275483 -0.05033511  0.03281361 -0.074

KeyboardInterrupt: 