<a href="https://colab.research.google.com/github/jenghub/test/blob/master/ibc_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
import itertools

import tensorflow as tf
from tensorflow.keras import activations, optimizers, losses
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

# Report how many GPU devices TensorFlow detects in this runtime.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# Load the sentence dataset from the mounted Drive folder.
df_binary = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/ibc_lib_con.xlsx")

# Keep only rows that carry a class value. The explicit copy makes the result
# an independent frame, so adding the "label" column below does not raise
# pandas' SettingWithCopyWarning.
df_binary = df_binary[df_binary["class"].notna()].copy()

# Human-readable label: class 0 -> "liberal", any other value -> "conservative".
df_binary["label"] = np.where(df_binary["class"] == 0, "liberal", "conservative")

print("Liberal (0) and Conservative (1) sentence counts:")
df_binary["class"].value_counts()

In [None]:
# Target: the numeric class column (0 = liberal, 1 = conservative).
labels = df_binary["class"]
# Input: the sentence text, selected with a list so it stays a one-column
# DataFrame rather than a Series.
features = df_binary[["sentence"]]

In [None]:
# Balance the classes by randomly dropping rows from the majority class.
# The fixed seed makes the sampled subset reproducible from run to run,
# matching the seeded train/dev/test split performed later in the notebook.
undersample = RandomUnderSampler(sampling_strategy="majority", random_state=42)
X_under, y_under = undersample.fit_resample(features, labels)

# To check balance
from collections import Counter
print(Counter(y_under))

In [None]:
train_pct = 0.8
# Split into train (80%) dev (10%) test (10%).
# Sentences and targets go through the SAME train_test_split call, so both are
# partitioned with identical shuffled indices and cannot drift out of
# alignment. Stratifying keeps the class distribution equal across splits.
train_sents, dev_test_sents, train_labels, dev_test_labels = train_test_split(
    X_under, y_under, train_size=train_pct, random_state=42,
    stratify=y_under, shuffle=True)

# Halve the held-out portion into dev and test, again stratified by class.
dev_sents, test_sents, dev_labels, test_labels = train_test_split(
    dev_test_sents, dev_test_labels, train_size=0.5, random_state=42,
    stratify=dev_test_labels, shuffle=True)

X_train = train_sents
y_train = train_labels

X_dev = dev_sents
y_dev = dev_labels

X_test = test_sents
y_test = test_labels

# Report the size of every split (the test line reports y_test, not y_dev).
print("training shape X, y:", X_train.shape, y_train.shape)
print("dev shape X, y:", X_dev.shape, y_dev.shape)
print("test shape X, y:", X_test.shape, y_test.shape)

In [None]:
# Turn each label split into a plain Python list.
y_train, y_dev, y_test = y_train.tolist(), y_dev.tolist(), y_test.tolist()

# Cast every label to a built-in int for use as the classification target.
train_labels = list(map(int, y_train))
dev_labels = list(map(int, y_dev))
test_labels = list(map(int, y_test))

In [None]:
# Flatten each single-column feature split into a flat list of sentence
# strings. np.asarray(...).ravel() handles both a one-column DataFrame and a
# 2-D ndarray; iterating a DataFrame directly (as chain.from_iterable would)
# yields its column labels rather than its rows.
X_tr = np.asarray(X_train).ravel().tolist()
X_d = np.asarray(X_dev).ravel().tolist()
X_te = np.asarray(X_test).ravel().tolist()

In [None]:
MODEL_NAME = 'roberta-base'

# Sanity check on a single example: the first training sentence.
review = X_tr[0]

# Load the pretrained tokenizer for the chosen checkpoint.
tkzr = RobertaTokenizer.from_pretrained(MODEL_NAME)

inputs = tkzr(review, truncation=True, padding=True)

# Show the raw sentence next to the token ids and attention mask it maps to.
print(
    f'review: \'{review}\'',
    f'input ids: {inputs["input_ids"]}',
    f'attention mask: {inputs["attention_mask"]}',
    sep="\n",
)

In [None]:
def construct_encodings(x, tkzr, trucation=True, padding=True):
    """Tokenize ``x`` by calling ``tkzr`` on it.

    Args:
        x: Text input handed to the tokenizer unchanged.
        tkzr: Callable tokenizer accepting ``truncation`` and ``padding``
            keyword arguments.
        trucation: Forwarded as the tokenizer's ``truncation`` argument.
            (The parameter name is misspelled; it is kept as-is so existing
            keyword callers continue to work.)
        padding: Forwarded as the tokenizer's ``padding`` argument.

    Returns:
        Whatever ``tkzr`` returns for ``x``.
    """
    tokenizer_options = {"truncation": trucation, "padding": padding}
    return tkzr(x, **tokenizer_options)

# Tokenize the three splits in turn (train, dev, test) with the same tokenizer.
train_encodings, dev_encodings, test_encodings = (
    construct_encodings(sentences, tkzr) for sentences in (X_tr, X_d, X_te)
)

In [None]:
def construct_tfdataset(encodings, y=None):
    """Wrap tokenizer encodings (and optionally labels) in a ``tf.data.Dataset``.

    Args:
        encodings: Mapping-like tokenizer output; it is converted with
            ``dict(encodings)`` before slicing.
        y: Optional labels, one per encoded example. ``None`` (the default)
            produces an unlabeled dataset.

    Returns:
        A dataset of ``(features, label)`` pairs when ``y`` is given,
        otherwise a dataset of features only.
    """
    # Compare against None explicitly: truthiness would treat an empty label
    # list as "no labels" and raises on NumPy arrays with several elements.
    if y is not None:
        return tf.data.Dataset.from_tensor_slices((dict(encodings), y))
    # this case is used when making predictions on unseen samples after training
    return tf.data.Dataset.from_tensor_slices(dict(encodings))

# Pair each split's encodings with its integer labels and build the datasets.
train_tfdataset, dev_tfdataset, test_tfdataset = (
    construct_tfdataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (dev_encodings, dev_labels),
        (test_encodings, test_labels),
    )
)


In [None]:
BATCH_SIZE = 64

# Group every split into batches of BATCH_SIZE examples.
tfdataset_train, tfdataset_dev, tfdataset_test = (
    split.batch(BATCH_SIZE)
    for split in (train_tfdataset, dev_tfdataset, test_tfdataset)
)

In [None]:

N_EPOCHS = 5

# Pretrained RoBERTa checkpoint with a sequence-classification head.
model = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME)
optimizer = optimizers.Adam(learning_rate=5e-5)
# The loss is applied to raw logits and integer class labels.
loss = losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# The datasets are already batched, so `batch_size` must not be passed to
# fit(). The dev split is supplied as validation data so that loss/accuracy on
# held-out sentences is reported after every epoch.
model.fit(tfdataset_train, validation_data=tfdataset_dev, epochs=N_EPOCHS)


In [None]:
# Evaluate on the held-out test set. The dataset is already batched, so
# `batch_size` is not passed (Keras documents it as not applicable to datasets).
benchmarks = model.evaluate(tfdataset_test, return_dict=True)
print(benchmarks)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score


In [None]:
# Run the model over the batched test set.
# NOTE(review): despite the name, the result is indexed with [0] in the next
# cell before argmax — presumably predict() returns a container whose first
# element holds the logit array; confirm against the installed transformers version.
logits = model.predict(tfdataset_test)

In [None]:
# Predicted class per example: index of the largest value along axis 1 of the
# first element of the prediction output.
preds = np.argmax(logits[0], axis=1)

In [None]:
# Display the predicted class ids.
preds

In [None]:
# Ground-truth test labels as an array, for element-wise comparison with preds.
true = np.asarray(y_test)

In [None]:
# Confusion matrix with rows = true class and columns = predicted class;
# labels=[1,0] puts class 1 (conservative) first, then class 0 (liberal).
confusion_matrix(true, preds, labels=[1,0])

In [None]:
# Display the ground-truth labels.
true

In [None]:
# Positions of the test examples whose prediction differs from the true label.
np.where(preds!=true)

In [None]:
# Indices that would sort the predicted class ids in ascending order.
np.argsort(preds)

In [None]:
test_index = 217

# Look the sentence up by POSITION so it lines up with `true` and `preds`,
# which are positional arrays. np.asarray(...).ravel() works whether X_test is
# a one-column DataFrame or a 2-D ndarray; X_test[test_index] on a DataFrame
# would be treated as a column lookup instead of a row lookup.
print(np.asarray(X_test).ravel()[test_index])
print(true[test_index])
print(preds[test_index])