# Подготовка данных

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertConfig, BertTokenizerFast, TFAutoModel
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the review TSV files; kept in one place so the path is
# not repeated for every read.
DATA_DIR = 'C:/Storage/Dataset/Reviews'

# Each file is read from disk exactly once. `test` is loaded for parity with
# the original cell; it is not used by the cells visible in this notebook.
train = pd.read_csv(f'{DATA_DIR}/train.tsv', sep="\t")
test = pd.read_csv(f'{DATA_DIR}/test.tsv', sep="\t")

# Independent two-column copy of the training file: later cells add a column
# to `data`, and `.copy()` keeps that from touching `train`.
data = train[['Phrase', 'Sentiment']].copy()

# Space-separated word counts of the first 10 phrases; max(dff) is used later
# to derive max_length.
# NOTE(review): only 10 rows are sampled and words are not sub-word tokens, so
# longer phrases will be truncated by the tokenizer - confirm this is intended.
dff = [len(phrase.split(" ")) for phrase in data.Phrase[:10]]

# Построение сети

In [3]:
# Stratified split of the row labels (not the rows themselves): 15% of the
# index goes to validation, with the Sentiment distribution preserved.
X_train, X_val, y_train, y_val = train_test_split(
    data.index.values, data.Sentiment.values,
    test_size=0.15, random_state=42, stratify=data.Sentiment,
)

# Tag every row with the split it belongs to; rows in neither split keep the
# 'not_set' placeholder.
data['data_type'] = 'not_set'
data.loc[X_train, 'data_type'] = 'train'
data.loc[X_val, 'data_type'] = 'val'

# Name of the pretrained checkpoint used for the config and the tokenizer.
model_name = 'bert-base-cased'

# Fixed sequence length: the longest sampled word count plus 3 extra
# positions (presumably head-room for special tokens - confirm).
max_length = 3 + max(dff)

# Load the checkpoint's configuration and switch off hidden-state outputs.
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Fast tokenizer for the same checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)

# Two fixed-length int32 inputs of width max_length; the layer names double as
# the dictionary keys the datasets and prep_data() feed the model with.
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')

inputs = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,
}

# Load the encoder from the same checkpoint name as the tokenizer
# (model_name) instead of repeating the literal, so the two cannot diverge.
bert = TFAutoModel.from_pretrained(model_name)

# Element 1 of the main layer's output is what feeds the dense head (shown as
# bert[0][1] in the model summary); presumably the pooled sentence vector -
# confirm against the Transformers output class.
embeddings = bert.bert(inputs)[1]

# Classification head: one ReLU layer, then a 5-way softmax named 'outputs'.
x = Dense(1024, activation='relu')(embeddings)
y = Dense(5, activation='softmax', name='outputs')(x)

model = Model(inputs=inputs, outputs=y)

# One-hot encode the sentiment labels of the training rows.
y_senti = to_categorical(data[data.data_type=='train'].Sentiment)

# Tokenize the training phrases. padding='max_length' pads every row to
# exactly max_length, the width declared by the model's Input layers;
# padding=True would only pad to the longest phrase in this call, which is not
# guaranteed to equal max_length. This also matches prep_data().
x = tokenizer(
    text=data[data.data_type=='train'].Phrase.to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

# Flat dataset of (input_ids, attention_mask, one-hot label) triples.
# Note: this rebinds `train`, which previously held the raw DataFrame.
train = tf.data.Dataset.from_tensor_slices(
    (x['input_ids'],
     x['attention_mask'],
     y_senti)
)

def map_func(input_ids, masks, labels):
    """Repack one flat dataset element into the (features, labels) pair.

    Args:
        input_ids: token ids for one example.
        masks: attention mask for the same example.
        labels: target for the same example.

    Returns:
        A 2-tuple: a dict with keys 'input_ids' and 'attention_mask' (the
        names of the model's input layers), and `labels` unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels

# Reshape each element into the ({inputs}, labels) form the model is fed with.
train = train.map(map_func)
batch_size = 32

# Shuffle with a buffer covering the whole dataset. The rows arrive in their
# original file order, and a 100-element buffer only mixes near neighbours.
# If the size is unknown or infinite (negative cardinality), fall back to the
# previous buffer of 100.
n_train = int(train.cardinality())
shuffle_buffer = n_train if n_train > 0 else 100

# drop_remainder=True keeps every training batch at exactly batch_size rows.
train = train.shuffle(shuffle_buffer).batch(
    batch_size,
    drop_remainder=True
)

# One-hot encode the sentiment labels of the validation rows.
y_senti = to_categorical(data[data.data_type=='val'].Sentiment)

# Tokenize the validation phrases. padding='max_length' pads every row to
# exactly max_length, the width declared by the model's Input layers;
# padding=True would only pad to the longest phrase in this call.
x = tokenizer(
    text=data[data.data_type=='val'].Phrase.to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True
)

val = tf.data.Dataset.from_tensor_slices(
    (x['input_ids'], x['attention_mask'], y_senti)
)

# Same ({inputs}, labels) element layout as the training dataset.
val = val.map(map_func)

# Validation is neither shuffled nor truncated: order does not affect the
# metrics, and dropping the last partial batch would silently exclude
# examples from them.
val = val.batch(batch_size)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# NOTE(review): `decay` is kept unchanged; newer Keras optimizer
# implementations may reject it - confirm against the installed version.
optimizer = Adam(learning_rate=1e-5, decay=1e-6)

# Labels are one-hot vectors (to_categorical), hence the categorical loss and
# accuracy variants.
loss = CategoricalCrossentropy()
acc = CategoricalAccuracy('accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[acc]
)

model.summary()

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask (InputLayer)     [(None, 40)]         0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 40)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 108310272   attention_mask[0][0]             
                                                                 input_ids[0][0]                  
__________________________________________________________________________________________________
dense (Dense)                   (None, 1024)         787456      bert[0][1]                   



# Обучение сети

In [None]:
# Train for three passes over the batched training dataset, evaluating on the
# validation dataset after each epoch; the returned object is kept in
# `history`.
history = model.fit(x=train, epochs=3, validation_data=val)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Сохранение сети

In [None]:
# Persist the trained model to disk at a fixed local path.
model.save(filepath='C:/Storage/Net/predict_rate')

# Проверка

In [None]:
def prep_data(input_text):
  """Tokenize raw text into the feature dict the model takes as input.

  Uses the notebook-level `tokenizer` and `max_length`: the text is truncated
  and padded to exactly `max_length` positions.

  Args:
      input_text: the text to encode, passed to the tokenizer as `text`.

  Returns:
      A dict with keys 'input_ids' and 'attention_mask', each an int32
      tensor, matching the names and dtype of the model's Input layers.
  """
  x = tokenizer(
      text=input_text,
      add_special_tokens=True,
      max_length=max_length,
      truncation=True,
      padding='max_length',
      return_tensors='tf',
      return_token_type_ids = False,
      return_attention_mask = True,
      verbose = True)

  # The model's inputs are declared int32, so ids and mask are kept as
  # integers rather than cast to float64.
  return {
      'input_ids': tf.cast(x['input_ids'], tf.int32),
      'attention_mask': tf.cast(x['attention_mask'], tf.int32)
  }

# Encode a sample review. Adjacent string literals are joined by Python, so
# each piece ends with an explicit space; without it, words at the line breaks
# were glued together ("buildings.You", "forthe", "witha", "theattraction").
# Despite its name, `prediction` holds the encoded model input, not a result.
prediction = prep_data(
    "The park is beautiful and large, but has been slightly disfigured by the new buildings. "
    "You can meet lots of squirrels and birds. There are many old squeaky rides, a disco for "
    "the elderly, food and water make-up outlets and a rope town. In the summer, a walk with "
    "a child is reduced to tears as you have to queue for hours: for a ticket, then to the "
    "attraction, to the cafe. The e-ticketing system, which the media reported on, has never been implemented."
)

# Per-class scores from the model's softmax output; the index of the largest
# score in the first (only) row is shown as the cell result.
probs = model.predict(prediction)
np.argmax(probs[0])