In [62]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
from transformers import AlbertTokenizer, TFAlbertModel, TFAlbertForSequenceClassification
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

In [63]:
df = pd.read_csv('/content/train.csv')

In [64]:
df.head()

Unnamed: 0,headline,is_sarcastic
0,olympic torch used to ignite tibetan protesters,1
1,this 594-foot-high basketball shot 'for mankin...,0
2,"dr. oz, mel gibson, & congress called out usin...",0
3,excited juror feels like murder trial being pu...,1
4,man has mixed feelings about $39 flight,1


In [70]:
df.shape

(24038, 2)

## **Tokenize Inputs**

In [66]:
albert_name = 'albert-base-v2'
tokenizer = AlbertTokenizer.from_pretrained(albert_name,
    add_special_tokens=True,
    do_lower_case=False,
    max_length=80,
    pad_to_max_length=True)

In [67]:
def albert_encoder(review):
    encoded = tokenizer.encode_plus(review, add_special_tokens=True,
        max_length=80,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_token_type_ids=True)
    return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']

In [71]:
albert_train = [albert_encoder(df.iloc[i, 0]) for i in range(df.shape[0])]
albert_train = np.array(albert_train)

albert_label = df['is_sarcastic'].to_list()
albert_label = np.array(albert_label)
albert_label = to_categorical(albert_label, num_classes=2)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [72]:
x_train, x_val, y_train, y_val = train_test_split(albert_train,
    albert_label,
    test_size=0.1,
    random_state=42)
print(x_train.shape, y_train.shape)

(21634, 3, 80) (21634, 2)


In [73]:
tr_reviews, tr_segments, tr_masks = np.split(x_train, 3, axis=1)
val_reviews, val_segments, val_masks = np.split(x_val, 3, axis=1)
print(tr_reviews.shape)

(21634, 1, 80)


In [74]:
tr_reviews = tr_reviews.squeeze()
tr_segments = tr_segments.squeeze()
tr_masks = tr_masks.squeeze()

val_reviews = val_reviews.squeeze()
val_segments = val_segments.squeeze()
val_masks = val_masks.squeeze()

print(tr_reviews.shape)

(21634, 80)


In [75]:
def example_to_features(input_ids,attention_masks,token_type_ids,y):
    return {"input_ids": input_ids,"attention_mask": attention_masks,"token_type_ids": token_type_ids},y

In [76]:
train_ds = tf.data.Dataset.from_tensor_slices((tr_reviews, tr_masks, tr_segments, y_train)).map(example_to_features).shuffle(100).batch(8)
valid_ds = tf.data.Dataset.from_tensor_slices((val_reviews,val_masks, val_segments, y_val)).map(example_to_features).shuffle(100).batch(8)

## **Train Model**

In [77]:
albert_model_2 = TFAlbertForSequenceClassification.from_pretrained(albert_name)

All model checkpoint layers were used when initializing TFAlbertForSequenceClassification.

Some layers of TFAlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
optimizer = Adam(learning_rate=2e-5)
loss = BinaryCrossentropy(from_logits=True)
albert_model_2.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [79]:
albert_model_2.summary()

Model: "tf_albert_for_sequence_classification_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
albert (TFAlbertMainLayer)   multiple                  11683584  
_________________________________________________________________
dropout_22 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 11,685,122
Trainable params: 11,685,122
Non-trainable params: 0
_________________________________________________________________


In [80]:
print("Fine-tuning ALBERT")
albert_history = albert_model_2.fit(train_ds, epochs=3, validation_data=valid_ds)

Fine-tuning ALBERT
Epoch 1/3
Epoch 2/3
Epoch 3/3


## **Evaluate on test data**

In [81]:
test_df = pd.read_csv('/content/test.csv')

In [82]:
test_df.shape

(2671, 2)

In [84]:
X_test = [albert_encoder(test_df.iloc[i, 0]) for i in range(test_df.shape[0])]
X_test = np.array(X_test)

y_test = test_df['is_sarcastic'].to_list()
y_test = np.array(y_test)
y_test = to_categorical(y_test, num_classes=2)



In [85]:
ts_reviews, ts_segments, ts_masks = np.split(X_test, 3, axis=1)
print(ts_reviews.shape)

(2671, 1, 80)


In [86]:
ts_reviews = ts_reviews.squeeze()
ts_segments = ts_segments.squeeze()
ts_masks = ts_masks.squeeze()

print(ts_reviews.shape)

(2671, 80)


In [87]:
test_ds = tf.data.Dataset.from_tensor_slices((ts_reviews, ts_masks, ts_segments, y_test)).map(example_to_features).shuffle(100).batch(8)

In [88]:
albert_model_2.evaluate(test_ds)



[0.2967062294483185, 0.8910520672798157]