### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev sentence pairs, and unlabeled test sentence pairs, into lists.

In [1]:
import csv

In [2]:
train, dev, test = [], [], []

In [3]:
with open('./data/pnli_train.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
        train.append(x)
print (len(train))
print (train[:3])

5983
[['Sometimes do exercise.', 'A person typically desire healthy life.', '1'], ['Who eats junk foods.', 'A person typically desire healthy life.', '0'], ['A person is sick.', 'A person typically desire healthy life.', '1']]


In [4]:
with open('./data/pnli_dev.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
        dev.append(x)
print (len(dev))
print (dev[:3])

1055
[['A person is looking for accuracy.', 'A person typically desires accurate results.', '1'], ['A person does not care for accuracy.', 'A person typically desires accurate results.', '0'], ['The person double checks their data.', 'A person typically desires accurate results.', '1']]


In [5]:
with open('./data/pnli_test_unlabeled.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[0] and x[1] will be the sentence pairs.
        test.append(x)
print (len(test))
print (test[:3])

4850
[['The people want to have a romantic and pleasant feel.', 'People typically does desire to smell violets.'], ['The contract is to buy products from you.', 'Getting contract typically cause to make money or spend money.'], ['Train station is closed.', 'Line can typically be used to move train along tracks.']]


main code

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes at here. We will use your prediction on the unlabeled test data to grade, while checking this part to understand how your method has produced the predictions.

In [6]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 6.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 78.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 73.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 71.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from transformers import RobertaTokenizer
from keras.callbacks import EarlyStopping
import sklearn

In [8]:
def get_train_test_data(train,dev,test):
    train_data = pd.DataFrame(train, columns=['sentence_1', 'sentence_2', 'class_label'])
    val_data = pd.DataFrame(dev, columns=['sentence_1', 'sentence_2', 'class_label'])
    test_data = pd.DataFrame(test, columns=['sentence_1', 'sentence_2'])
    return train_data,val_data,test_data

training_data,dev_data,test_data = get_train_test_data(train,dev,test)

In [9]:
training_data.shape

(5983, 3)

In [10]:
training_data.head(4)

Unnamed: 0,sentence_1,sentence_2,class_label
0,Sometimes do exercise.,A person typically desire healthy life.,1
1,Who eats junk foods.,A person typically desire healthy life.,0
2,A person is sick.,A person typically desire healthy life.,1
3,A person is dead.,A person typically desire healthy life.,0


In [11]:
training_data.class_label.value_counts()

1    3145
0    2838
Name: class_label, dtype: int64

In [12]:
dev_data.class_label.value_counts()

1    554
0    501
Name: class_label, dtype: int64

In [13]:
def prep_train_data(training_data,dev_data,nc):
    dev_y = tf.keras.utils.to_categorical(dev_data.class_label, num_classes=nc)
    train_y = tf.keras.utils.to_categorical(training_data.class_label, num_classes=nc)
    return train_y,dev_y

train_y,dev_y = prep_train_data(training_data,dev_data,2)

In [14]:
def set_hyperparameters():
    return 256,32,5,1e-5,0.0001
  
max_length,batch_size,epochs,lr,delta = set_hyperparameters()

In [16]:
class generate_semantics_data(tf.keras.utils.Sequence):
 
    def __init__(self,sentence_pairs,labels,batch_size=32,shuffle=True,include_targets=True,):
        self.batch_size = batch_size
        self.sp = sentence_pairs
        self.is_shuffle = shuffle
        self.targets_i = include_targets
        self.t__ = len(self.sp)
        self.indexes = np.arange(t)
        self.c_labels = labels
        self.tokenizer = get_tokenizer()
        self.new_epoch()
        self.int_str = "int32"
        
    def get_tokenizer(self):
        return RobertaTokenizer.from_pretrained("roberta-base")

    def get_indices(self,index):
        return self.indexes[index*self.batch_size:(index + 1)*self.batch_size]
    
    def get_encoder_items(self,sp):
        new_one=sp.tolist()
        return self.tokenizer.batch_encode_plus(new_one,add_special_tokens=True,max_length=max_length,return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )
    
    def new_epoch(self):
        if self.is_shuffle:
            np.random.RandomState(7777).shuffle(self.indexes)
            
    def __getitem__(self, index):
       
        indices = self.get_indices(index)
        sentence_pairs = self.sp[indices]

        new_enc = self.get_encoder_items(sentence_pairs)
        atm = np.array(new_enc["attention_mask"], dtype=self.int_str)
        tti = np.array(new_enc["token_type_ids"], dtype=self.int_str)
        id_ip = np.array(new_enc["input_ids"], dtype=self.int_str)
        
        if self.targets_i:
            c_l = np.array(self.c_labels[indexes], dtype=self.int_str)
            t_list = [id_ip, atm, tti]
            return t_list, c_l
        else:
            t_list=[id_ip, atm, tti]
            return t_list


stgy = tf.distribute.MirroredStrategy()

with stgy.scope():
    
    id_ip = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="id_ip")
   
    atm = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="atm")
    
    tti = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="token_type_ids")

    robert_model = transformers.TFRobertaModel.from_pretrained("roberta-base")

    robert_model.trainable = True

    bert_output = robert_model(id_ip, attention_mask=atm, token_type_ids = tti)
    op_st = bert_output.last_hidden_state

    bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(op_st)
    mp = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    ap = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    d_size =0.35
    ct = tf.keras.layers.concatenate([ap, mp])
    dp = tf.keras.layers.dp(d_size)(ct)
    op = tf.keras.layers.Dense(2, activation="softmax")(dp)
    t_model = tf.keras.models.Model(inputs=[id_ip, atm, token_type_ids], outputs=op)
    
    t_model.compile(optimizer=tf.keras.optimizers.Adam(lr),loss="categorical_crossentropy",
        metrics=["acc"],
    )


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/627M [00:00<?, ?B/s]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


In [19]:

t_dev_data = dev_data[["sentence_1", "sentence_2"]].values.astype("str")
t_training_data = training_data[["sentence_1", "sentence_2"]].values.astype("str")

dev_set_data = generate_semantics_data(t_dev_data,dev_y,batch_size=batch_size,shuffle=False,
)
train_set_data = generate_semantics_data(t_training_data,train_y,batch_size=batch_size,shuffle=True,
)

full_model = model.fit(train_set_data,validation_data=dev_set_data,epochs=5,verbose = 1,callbacks = [EarlyStopping(monitor='val_acc', patience=3, min_delta=delta)])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/5
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [50]:
def return_result(s1, s2):
    p,q = str(s1),str(s2)
    pair = np.array([[p,q]])
    bs=1
    test_data = generate_semantics_data(pair, labels=None, batch_size=bs, shuffle=False, include_targets=False,)
    prb = model.predict(test_data[0])
    return np.argmax(prb), prb[0]

In [51]:
# Eventually, results need to be a list of 2028 0 or 1's
results = []

for i in range(len(test_data)):
    t1,t2 = test_data.iat[i, 0],test_data.iat[i, 1]
    r = return_result(t1,t2)
    p1=str(r[0])
    results.append(p1)
  

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [52]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 4850)

In [53]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [54]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')