In [4]:
# get data
import pandas as pd

df = pd.read_csv('../testdata/data_combined_24.csv', sep=',', names=["Category", "Sentence"])

# print number of different categories
print(len(df['Category'].unique()))
df.head()

23


Unnamed: 0,Category,Sentence
0,culture,indigenous woman chile teaches aymara ancestry...
1,culture,arabia difference etiquette wearing slippers l...
2,culture,slow look around patient means embracing thing...
3,culture,culturescountries outside usa western europe v...
4,culture,fake government birds heard right couldnt beli...


In [5]:
# create RoBERTa model
import gc
import numpy as np

from sklearn.model_selection import StratifiedKFold
import tensorflow as tf

#find import for TFSequenceClassifierOutput


from transformers import RobertaTokenizer, RobertaConfig, TFRobertaPreTrainedModel
from transformers.models.roberta import TFRobertaMainLayer
from transformers.modeling_tf_utils import get_initializer
from transformers.modeling_tf_outputs import TFSequenceClassifierOutput

MODEL_NAME = 'roberta-base'
MAX_LEN = 128
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

#split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Sentence, df.Category, test_size=0.2, random_state=42, stratify=df.Category)

# create df combining X and y
train_df = pd.DataFrame({'text': X_train, 'target': y_train})
test_df = pd.DataFrame({'text': X_test, 'target': y_test})

train_df.head()





  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,text,target
28277,nutrisystem diet the dietary element of the we...,foods
30499,instead it concentrates on wider regions and z...,nature
29665,the destabilization of mercurys orbit is unlik...,disasters
30587,this book is an all encompassing guide and enc...,nature
34541,each of them so far as it is exchange value mu...,economics


In [6]:
#map labels to numbers
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df['target'] = le.fit_transform(train_df['target'])
test_df['target'] = le.transform(test_df['target'])


train_df.head()

Unnamed: 0,text,target
28277,nutrisystem diet the dietary element of the we...,7
30499,instead it concentrates on wider regions and z...,14
29665,the destabilization of mercurys orbit is unlik...,3
30587,this book is an all encompassing guide and enc...,14
34541,each of them so far as it is exchange value mu...,4


In [7]:
def to_tokens(input_text, tokenizer):
    output = tokenizer.encode_plus(input_text, max_length=MAX_LEN, pad_to_max_length=True)
    return output

def select_field(features, field):
    return [feature[field] for feature in features]

In [8]:
tokenizer_output_train = train_df["text"].apply(lambda x: to_tokens(x, tokenizer))
tokenizer_output_test = test_df["text"].apply(lambda x: to_tokens(x, tokenizer))

tokenizer_output_train.head()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


28277    [input_ids, attention_mask]
30499    [input_ids, attention_mask]
29665    [input_ids, attention_mask]
30587    [input_ids, attention_mask]
34541    [input_ids, attention_mask]
Name: text, dtype: object

In [9]:
input_ids_train = np.array(select_field(tokenizer_output_train, 'input_ids'))
attention_masks_train = np.array(select_field(tokenizer_output_train, 'attention_mask'))

input_ids_test = np.array(select_field(tokenizer_output_test, 'input_ids'))
attention_masks_test = np.array(select_field(tokenizer_output_test, 'attention_mask'))

In [10]:
# create model
class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = 23
        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.dropout_1 = tf.keras.layers.Dropout(0.3)
        self.classifier = tf.keras.layers.Dense(units=23,
                                                name='classifier', 
                                                kernel_initializer=get_initializer(
                                                    config.initializer_range))

    def call(self, inputs, **kwargs):
        outputs = self.roberta(inputs, **kwargs)
        pooled_output = outputs[1]
        pooled_output = self.dropout_1(pooled_output, training=kwargs.get('training', False))
        logits = self.classifier(pooled_output)
        print(inputs)
        print("logits", logits)

        return TFSequenceClassifierOutput(
            loss=None,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            )


In [11]:
def init_model(model_name):
    config = RobertaConfig.from_pretrained(model_name, num_labels=23)
    model = TFRobertaForSequenceClassification.from_pretrained(model_name)
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.BinaryAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    return model

In [12]:
BATCH_SIZE = 16
EPOCHS = 10
SPLITS = 5

y_train = tf.keras.utils.to_categorical(train_df['target'].values.reshape(-1, 1))
X_train = input_ids_train
X_test = input_ids_test
attention_masks_train = attention_masks_train
attention_masks_test = attention_masks_test
X_train = X_train[:-divmod(X_train.shape[0], BATCH_SIZE)[1]]
X_test = X_test[:-divmod(X_test.shape[0], BATCH_SIZE)[1]]
attention_masks_train = attention_masks_train[:-divmod(attention_masks_train.shape[0], 
                                                                                BATCH_SIZE)[1]]
attention_masks_test = attention_masks_test[:-divmod(attention_masks_test.shape[0],
                                                                                BATCH_SIZE)[1]]
y_train = y_train[:-divmod(y_train.shape[0], BATCH_SIZE)[1]]
model = init_model(MODEL_NAME)
print(model.summary())

model.fit([X_train, attention_masks_train], y_train, 
            validation_data=([X_test, attention_masks_test], y_test), 
            batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)

AttributeError: Exception encountered when calling layer "tf_roberta_for_sequence_classification" "                 f"(type TFRobertaForSequenceClassification).

'dict' object has no attribute 'shape'

Call arguments received by layer "tf_roberta_for_sequence_classification" "                 f"(type TFRobertaForSequenceClassification):
  • inputs={'input_ids': 'tf.Tensor(shape=(3, 5), dtype=int32)'}
  • kwargs={'training': 'None'}