**NOTEBOOK USED TO FINE-TUNE A BERT MODEL FOR QUESTION-ANSWER PAIR CLASSIFICATION AND REGRESSION.**

In [None]:
!pip install transformers



In [None]:
# Mount Google Drive into the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Base folder (relative to the current working directory) that every dataset,
# token-cache and checkpoint path in this notebook is joined onto.
project_folder = "./drive/My Drive/csc2515-project/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**PRE-PROCESSING**

In [None]:
import pandas as pd
import os
import numpy as np

# Read the question/answer table from the project folder.
# Alternative inputs that were tried and left disabled:
#   pd.read_csv(os.path.join(project_folder, "Train.csv"))
#   questions_answers.sample(frac=0.2)  (as a random test subset)
questions_answers = pd.read_csv(
    filepath_or_buffer=os.path.join(project_folder, 'Train_RemovedNAN_AddedNormScore.csv')
)

# Show the first five rows as plain text, followed by the (rows, columns) shape.
print(questions_answers.head(5).to_string(), questions_answers.shape, sep='\n')

   Unnamed: 0  Score  QId  Before  After  Cosine_to_Question  Cosine_to_Answers  Word_cnt  Char_cnt  Avg_char_length  Urls  Codes  Grade_Level  Dale_chall  Reading_Ease  Polarity  Subjectivity  Cumulative_ Answer_Score  Num_Answers  Num_Questions  Average_Answer_Score  Q_Word_Cnt  Q_Char_Cnt  Q_Avg_char_length  Q_Urls  Q_CodeSections  Q_GradeLevel  Q_Dale_chall  Q_ReadingEase  Q_Polarity  Q_Subjectivity                                                                                                                                                                                                             Clean_Question                                                                                                                                                                                                                                                                                                                                                                                       

In [None]:
# Flag every answer whose score equals the highest score among the answers to
# the same question (QId). The 'max' string alias is used instead of the
# builtin so the result never depends on what the name `max` is currently bound
# to in the kernel.
questions_answers['Labels'] = (
    questions_answers.groupby('QId')['Score'].transform('max') == questions_answers['Score']
)
# Integer (0/1) version of the flag, used as the classification target.
questions_answers['Best_Score'] = questions_answers['Labels'].astype(int)

In [None]:
# Drop the NaNs.
# Empty strings are treated as missing. The replaced column is assigned back
# rather than calling replace(..., inplace=True) on a column selection: that
# chained in-place form is not guaranteed to write through to the parent frame
# (pandas copy-on-write).
questions_answers['Clean_Question'] = questions_answers['Clean_Question'].replace('', np.nan)
questions_answers['Clean_Answer'] = questions_answers['Clean_Answer'].replace('', np.nan)
questions_answers = questions_answers.dropna(subset=['Clean_Question', 'Clean_Answer'])

print(questions_answers.shape)

# Drop duplicate best answers: keep only questions whose top score is held by
# exactly one answer. Counting the rows that reach the per-question maximum is
# equivalent to comparing the two largest scores, but it is vectorised and it
# also keeps questions with a single answer (indexing the "second largest"
# score of such a group raises IndexError).
best_score_per_question = questions_answers.groupby('QId')['Score'].transform('max')
is_top_answer = questions_answers['Score'].eq(best_score_per_question)
top_answers_per_question = is_top_answer.groupby(questions_answers['QId']).transform('sum')
questions_answers = questions_answers[top_answers_per_question == 1].reset_index(drop=True)

# Normalize scores by the best score of the same question.
# The divisor is deliberately NOT called `max`, which would shadow the builtin
# for the rest of the kernel session.
# NOTE(review): a question whose best score is 0 or negative yields inf/NaN or
# sign-flipped values here -- confirm such rows cannot occur in the dataset.
max_score_per_question = questions_answers.groupby('QId')['Score'].transform('max')
questions_answers['Normalized_Score'] = questions_answers['Score'].div(max_score_per_question)

(108705, 36)


In [None]:
print(questions_answers.head(5).to_string())

   Unnamed: 0  Score  QId  Before  After  Cosine_to_Question  Cosine_to_Answers  Word_cnt  Char_cnt  Avg_char_length  Urls  Codes  Grade_Level  Dale_chall  Reading_Ease  Polarity  Subjectivity  Cumulative_ Answer_Score  Num_Answers  Num_Questions  Average_Answer_Score  Q_Word_Cnt  Q_Char_Cnt  Q_Avg_char_length  Q_Urls  Q_CodeSections  Q_GradeLevel  Q_Dale_chall  Q_ReadingEase  Q_Polarity  Q_Subjectivity                                                                                                                                                                                                             Clean_Question                                                                                                                                                                                                                                                                                                                                                                                       

In [None]:
# Perform tokenization as BERT needs it in a specific format
def tokenize_q_and_a(data, type, score_var, seq_length=512):
  """Encode question/answer pairs for a BERT sequence-pair model.

  Args:
    data: DataFrame with 'Clean_Question' and 'Clean_Answer' columns plus the
      column named by ``score_var``. Each question/answer cell may be a string
      or a list of words.
    type: Unused; kept only so existing calls (``type='train'`` /
      ``type='test'``) keep working.
    score_var: Name of the column returned as the target values.
    seq_length: Every encoded pair is padded or truncated to this many tokens.

  Returns:
    Tuple ``(input_ids, attention_masks, token_type_ids, scores)`` of NumPy
    arrays. For non-empty ``data`` the first three have shape
    ``(len(data), seq_length)`` and ``scores`` has shape ``(len(data),)``.
  """
  from transformers import BertTokenizer
  tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

  def as_text(cell):
    # The notebook stores questions/answers as lists of words. Passing such a
    # list straight to encode_plus makes the (slow) tokenizer treat each word
    # as a ready-made vocabulary token, skipping WordPiece splitting (unknown
    # words become [UNK]), and an empty list is rejected outright. Joining the
    # words back into one string lets the tokenizer do the full job.
    if isinstance(cell, str):
      return cell
    return ' '.join(str(word) for word in cell)

  questions_answers_input_ids = []
  questions_answers_attention_masks = []
  questions_answers_token_ids = []

  # Iterate over the two text columns directly; this avoids building a Series
  # per row the way DataFrame.iterrows() does.
  for question, answer in zip(data['Clean_Question'], data['Clean_Answer']):
    encoded = tokenizer.encode_plus(text=as_text(question), text_pair=as_text(answer),
                                    max_length=seq_length,
                                    padding='max_length', truncation=True,
                                    return_token_type_ids=True, return_attention_mask=True)

    questions_answers_input_ids.append(encoded['input_ids'])
    questions_answers_attention_masks.append(encoded['attention_mask'])
    questions_answers_token_ids.append(encoded['token_type_ids'])

  # TFBertForSequenceClassification requires Numpy arrays
  questions_answers_input_ids = np.asarray(questions_answers_input_ids)
  questions_answers_attention_masks = np.asarray(questions_answers_attention_masks)
  questions_answers_token_ids = np.asarray(questions_answers_token_ids)

  # Targets in the same row order as the encoded pairs.
  scores_list = np.array(data[score_var].tolist())

  return questions_answers_input_ids, questions_answers_attention_masks, questions_answers_token_ids, scores_list

In [None]:
# Force both text columns to Python strings in one call.
questions_answers = questions_answers.astype({'Clean_Answer': str, 'Clean_Question': str})
# Spot-check the question stored under index label 222.
print(questions_answers.loc[222, 'Clean_Question'])

what is bodmas and why is it useful in programming


In [None]:
# Replace each answer/question string with its list of whitespace-separated words.
questions_answers = questions_answers.assign(
    Clean_Answer=questions_answers['Clean_Answer'].str.split(),
    Clean_Question=questions_answers['Clean_Question'].str.split(),
)

In [None]:
print(questions_answers['Clean_Question'][51])

['im', 'looking', 'for', 'a', 'way', 'to', 'delete', 'a', 'file', 'which', 'is', 'locked', 'by', 'another', 'process', 'using', 'c', 'i', 'suspect', 'the', 'method', 'must', 'be', 'able', 'to', 'find', 'which', 'process', 'is', 'locking', 'the', 'file', 'perhaps', 'by', 'tracking', 'the', 'handles', 'although', 'im', 'not', 'sure', 'how', 'to', 'do', 'this', 'in', 'c', 'then', 'close', 'that', 'process', 'before', 'being', 'able', 'to', 'complete', 'the', 'file', 'delete', 'using']


In [None]:
# Maximum number of words kept per question and per answer.
SEQ_LEN = 256

# Truncate every word list to its first SEQ_LEN entries. The .str accessor
# slices list elements as well as strings, so this replaces a row-by-row
# iterrows()/.at loop with one vectorised operation per column.
questions_answers['Clean_Answer'] = questions_answers['Clean_Answer'].str[:SEQ_LEN]
questions_answers['Clean_Question'] = questions_answers['Clean_Question'].str[:SEQ_LEN]

# Preview the truncated answers (last expression, so the notebook displays it).
questions_answers['Clean_Answer'].head(50)

0     [ive, read, somewhere, the, human, eye, cant, ...
1     [isnt, it, also, a, factor, which, order, you,...
2     [my, first, thought, on, this, is, how, genera...
3     [it, would, be, best, to, find, colors, maxima...
4     [some, related, resources, colorbrewer, sets, ...
5     [here, is, some, code, to, allocate, rgb, colo...
6     [last, i, checked, jfreechart, has, this, prec...
7     [i, know, this, an, old, post, but, i, found, ...
8     [to, achieve, most, distinguishable, we, need,...
9     [yes, i, thought, about, that, but, i, soon, f...
10    [oleg, shilos, c, script, solution, at, the, c...
11    [you, might, be, able, to, use, ironruby, for,...
12    [you, could, use, any, of, the, dlr, languages...
13    [if, you, dont, want, to, use, the, dlr, you, ...
14    [the, main, application, that, my, division, s...
15    [id, suggest, using, luainterface, as, it, has...
16    [the, next, version, of, net, 50, has, had, a,...
17    [im, using, luainterface13, lua, 50, for, 

In [None]:
# Spot-check two truncated answers (index labels 2 and 5).
# A stray leading double quote on the first print made this cell a syntax
# error; it has been removed.
print(questions_answers['Clean_Answer'][2])
print(questions_answers['Clean_Answer'][5])

['my', 'first', 'thought', 'on', 'this', 'is', 'how', 'generate', 'n', 'vectors', 'in', 'a', 'space', 'that', 'maximize', 'distance', 'from', 'each', 'other', 'you', 'can', 'see', 'that', 'the', 'rgb', 'or', 'any', 'other', 'scale', 'you', 'use', 'that', 'forms', 'a', 'basis', 'in', 'color', 'space', 'are', 'just', 'vectors', 'take', 'a', 'look', 'at', 'random', 'point', 'picking', 'hope', 'this', 'is', 'a', 'g']
['here', 'is', 'some', 'code', 'to', 'allocate', 'rgb', 'colors', 'evenly', 'around', 'a', 'hsl', 'color', 'wheel', 'of', 'specified', 'luminosity']


In [None]:
# Disabled cell: the two print calls are wrapped in a string literal, so the
# cell only evaluates to that string and nothing is printed by this code.
# NOTE(review): the literal refers to 'CleanAnswer' / 'CleanQuestion', whereas
# the live cells use 'Clean_Answer' / 'Clean_Question' -- fix before re-enabling.
'''print(questions_answers['CleanAnswer'].head(5).to_string())
print(questions_answers['CleanQuestion'].head(5).to_string())'''

0    ['ive', 'read', 'somewhere', 'the', 'human', '...
1    ['isnt', 'it', 'also', 'a', 'factor', 'which',...
2    ['my', 'first', 'thought', 'on', 'this', 'is',...
3    ['it', 'would', 'be', 'best', 'to', 'find', 'c...
4    ['some', 'related', 'resources', 'colorbrewer'...
0    ['this', 'is', 'something', 'ive', 'pseudosolv...
1    ['this', 'is', 'something', 'ive', 'pseudosolv...
2    ['this', 'is', 'something', 'ive', 'pseudosolv...
3    ['this', 'is', 'something', 'ive', 'pseudosolv...
4    ['this', 'is', 'something', 'ive', 'pseudosolv...


**SPLIT THE DATA SET INTO TEST AND TRAIN AND THEN TOKENIZE BOTH**

In [None]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Find the split index (80% of the rows), then move it forward to the start of
# the next question so the split never falls in the middle of a question.
# NOTE(review): this relies on all rows of a question being stored
# contiguously -- confirm the frame is grouped by QId.
if questions_answers.empty:
  raise ValueError("questions_answers is empty; nothing to split")

question_ids = questions_answers['QId'].to_numpy()
split_index = int(len(question_ids) * 0.8)
split_Q = question_ids[split_index]
while split_index < len(question_ids) and question_ids[split_index] == split_Q:
  split_index += 1

# Take independent copies so that adding columns to either split later neither
# raises SettingWithCopyWarning nor touches the full frame.
questions_answers_train = questions_answers[:split_index].copy()
questions_answers_test = questions_answers[split_index:].copy()

In [None]:
# Encode the training split; the targets are the 0/1 'Best_Score' flags.
# seq_length is not passed, so tokenize_q_and_a's default (512) applies.
qa_input_ids, qa_attention_masks, qa_token_ids, scores = tokenize_q_and_a(questions_answers_train, type='train', score_var='Best_Score')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




KeyboardInterrupt: ignored

In [None]:
# Row/column counts of the two splits, as a quick check of the split ratio.
print(questions_answers_train.shape)
print(questions_answers_test.shape)

In [None]:
# Disabled cell: the validation/test split below is wrapped in a string
# literal, so it is evaluated as a plain string and none of it executes.
# Consequently `questions_answers_val` and `questions_answers_test_actual`
# are NOT created by this cell.
'''# SPLIT TEST DATA INTO VAL AND TEST
# Find the split index, then iterate to next question since the the split should not be in the middle of a question
split_index = int(len(questions_answers_test) * 0.5)
split_Q = questions_answers_test['QId'].iloc[split_index]
while(split_index<len(questions_answers_test) and questions_answers_test['QId'].iloc[split_index] == split_Q):
  split_index += 1
questions_answers_val = questions_answers_test[:split_index]
#Ytr = normalized_scores[:split_index]
questions_answers_test_actual = questions_answers_test[split_index:]'''

In [None]:
# NOTE(review): in the code visible in this notebook, `questions_answers_val`
# and `questions_answers_test_actual` are only assigned inside a disabled
# (string-literal) cell, so on a fresh kernel these prints raise NameError.
print(questions_answers_val.shape)
print(questions_answers_test_actual.shape)

NameError: ignored

In [None]:
# Encode the held-out split (questions_answers_test); the results are stored
# under val_* names and are used as validation data by the training cell.
val_qa_input_ids, val_qa_masks, val_qa_token_ids, val_scores = tokenize_q_and_a(questions_answers_test, type='test', score_var='Best_Score')

In [None]:
# Sanity check: the shape of the encoded inputs and targets, plus one sample
# target, for the training arrays...
print(qa_input_ids.shape)
print(scores.shape)
print(scores[1])

# ...and for the validation arrays.
print(val_qa_input_ids.shape)
print(val_scores.shape)
print(val_scores[100])

In [None]:
# Save the tokens for future use
import pickle
import os

tokenized_dir = os.path.join(project_folder, "BERT/tokenized")
# Create the cache folder first: opening a file for writing inside a folder
# that does not exist raises FileNotFoundError.
os.makedirs(tokenized_dir, exist_ok=True)

# File name -> array to store. The "_val" files hold the held-out split.
arrays_by_filename = {
    "bert_inp_q.pkl": qa_input_ids,
    "bert_mask_q.pkl": qa_attention_masks,
    "bert_qa_token_ids.pkl": qa_token_ids,
    "bert_scores.pkl": scores,
    "bert_inp_q_val.pkl": val_qa_input_ids,
    "bert_mask_q_val.pkl": val_qa_masks,
    "bert_qa_token_ids_val.pkl": val_qa_token_ids,
    "bert_scores_val.pkl": val_scores,
}

for filename, array in arrays_by_filename.items():
  # The context manager flushes and closes each file even if dump() fails.
  with open(os.path.join(tokenized_dir, filename), 'wb') as pickle_file:
    pickle.dump(array, pickle_file)

FileNotFoundError: ignored

In [None]:
# Load pickle files that are already saved
import pickle
import os


def load_tokenized(filename):
  """Return the object pickled as ``filename`` in the BERT/tokenized folder.

  NOTE: pickle.load can execute arbitrary code; only load files that this
  notebook wrote itself.
  """
  # The context manager closes the file handle once the object is read.
  with open(os.path.join(project_folder, "BERT/tokenized", filename), 'rb') as pickle_file:
    return pickle.load(pickle_file)


print('Preparing the pickle file.....')
print('Loading the saved pickle files..')

# Training arrays.
qa_input_ids = load_tokenized("bert_inp_q.pkl")
qa_attention_masks = load_tokenized("bert_mask_q.pkl")
qa_token_ids = load_tokenized("bert_qa_token_ids.pkl")
scores = load_tokenized("bert_scores.pkl")

# Validation (held-out) arrays.
val_qa_input_ids = load_tokenized("bert_inp_q_val.pkl")
val_qa_masks = load_tokenized("bert_mask_q_val.pkl")
val_qa_token_ids = load_tokenized("bert_qa_token_ids_val.pkl")
val_scores = load_tokenized("bert_scores_val.pkl")

Preparing the pickle file.....
Loading the saved pickle files..


In [None]:
val_qa_input_ids.shape

(21740, 512)

In [None]:
val_scores

array([0, 1, 0, ..., 0, 0, 0])

In [None]:
from sklearn.model_selection import train_test_split

# Disabled cell: everything below is one string literal, so nothing in it runs;
# the cell merely evaluates to (and displays) the string.
# NOTE(review): the print inside refers to train_* names whose only assignment
# is itself commented out -- it would fail with NameError if re-enabled as is.
'''# Find the split index, then iterate to next question since the the split should not be in the middle of a question
split_index = int(len(questions_answers) * 0.8)
split_Q = questions_answers['QId'].iloc[split_index]
while(split_index<len(questions_answers) and questions_answers['QId'].iloc[split_index] == split_Q):
  split_index += 1
questions_answers_train = questions_answers[:split_index]
#Ytr = normalized_scores[:split_index]
questions_answers_test = questions_answers[split_index:]

#train_qatrain_qa_input_ids, val_qa_input_ids, train_qa_masks, val_qa_masks, train_qa_token_ids, val_qa_token_ids, train_scores, val_scores = train_test_split(qa_input_ids, qa_attention_masks, qa_token_ids, scores, test_size=0.2, random_state=42)

print(train_qa_input_ids.shape, val_qa_input_ids.shape, train_qa_masks.shape, val_qa_masks.shape, 
      train_qa_token_ids.shape, val_qa_token_ids.shape, train_scores.shape, val_scores.shape)'''

"# Find the split index, then iterate to next question since the the split should not be in the middle of a question\nsplit_index = int(len(questions_answers) * 0.8)\nsplit_Q = questions_answers['QId'].iloc[split_index]\nwhile(split_index<len(questions_answers) and questions_answers['QId'].iloc[split_index] == split_Q):\n  split_index += 1\nquestions_answers_train = questions_answers[:split_index]\n#Ytr = normalized_scores[:split_index]\nquestions_answers_test = questions_answers[split_index:]\n\n#train_qatrain_qa_input_ids, val_qa_input_ids, train_qa_masks, val_qa_masks, train_qa_token_ids, val_qa_token_ids, train_scores, val_scores = train_test_split(qa_input_ids, qa_attention_masks, qa_token_ids, scores, test_size=0.2, random_state=42)\n\nprint(train_qa_input_ids.shape, val_qa_input_ids.shape, train_qa_masks.shape, val_qa_masks.shape, \n      train_qa_token_ids.shape, val_qa_token_ids.shape, train_scores.shape, val_scores.shape)"

**SET UP TPUS FOR TRAINING**

In [None]:
# USE TPUS
# Connects this runtime to the Colab TPU and builds the distribution strategy
# whose scope() the model-building cell uses.
import tensorflow as tf
# Get a handle to the attached TPU. On GCP it will be the CloudTPU itself
# The address is read from the COLAB_TPU_ADDR environment variable; a KeyError
# here means that variable is not set in this runtime.
# `os` is not imported in this cell -- it relies on an earlier cell's `import os`.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
# Connect to the TPU handle and initialise it
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

# NOTE(review): this is the experimental-namespace strategy class; newer
# TensorFlow releases appear to expose it as tf.distribute.TPUStrategy --
# confirm against the installed TF version.
strategy = tf.distribute.experimental.TPUStrategy(resolver)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


INFO:tensorflow:Initializing the TPU system: grpc://10.57.97.34:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.57.97.34:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [None]:
# Disabled cell: the non-TPU model setup below (bert-large-uncased, one output
# label, mean-squared-error loss) is wrapped in a string literal, so none of it
# executes.
# NOTE(review): the ModelCheckpoint call inside passes `model='auto'`, which
# looks like a typo for `mode='auto'` -- fix before re-enabling.
'''# NO TPUS
# Train model
from transformers import BertConfig, TFBertForSequenceClassification

# Load Model
import tensorflow as tf
import keras

#print(tensorflow.__version__)
from transformers import BertConfig
model_config = BertConfig.from_pretrained('bert-large-uncased')
model_config.num_labels = 1

from transformers import TFBertForSequenceClassification
model = TFBertForSequenceClassification.from_pretrained('bert-large-uncased', config=model_config)

loss = tf.keras.losses.MeanSquaredError()
#loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
metric = tf.keras.metrics.MeanSquaredError('mse')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
print(model.summary())

#model_path = os.path.join(project_folder, 'BERT/model/model.h5')
model_path = os.path.join(project_folder, 'BERT/model/model{epoch:08d}.h5')

checkpoint = keras.callbacks.ModelCheckpoint(filepath=model_path, 
                                             monitor='val_loss', save_best_only=False, save_weights_only=False, model='auto')'''

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['dropout_221', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  335141888 
_________________________________________________________________
dropout_221 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1025      
Total params: 335,142,913
Trainable params: 335,142,913
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Number of distinct questions: one group per QId value.
num_q = questions_answers.groupby('QId')
print(num_q.ngroups)

In [None]:
# Class weights for the imbalanced best-answer labels: each class is weighted
# by total / (2 * class count), so both classes contribute equally to the loss
# overall.
total = len(questions_answers_train)
# Rows flagged as the best answer (non-zero Best_Score) vs. all the others.
pos = int(questions_answers_train['Best_Score'].astype(bool).sum())
neg = total - pos

weight_for_0 = (1 / neg) * (total) / 2.0
weight_for_1 = (1 / pos) * (total) / 2.0
print(weight_for_0, weight_for_1)

0.5596203346203346 4.693200215866163


In [None]:
import numpy as np
np.count_nonzero(questions_answers_test['Best_Score'])

2628

In [None]:
# TPUS
# Train model: build and compile the BERT best-answer classifier.
import tensorflow as tf
import keras

from transformers import BertConfig, TFBertForSequenceClassification

# Two output labels: one logit per class of the 0/1 target.
model_config = BertConfig.from_pretrained('bert-base-uncased')
model_config.num_labels = 2

# Per-class loss weights, passed to model.fit by the training cell.
class_weight = {0: weight_for_0, 1: weight_for_1}

# The model, optimizer and metrics are created inside the strategy scope so
# their variables are placed according to the distribution strategy.
with strategy.scope():
  model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=model_config)

  # The model outputs raw logits and the targets are integer class ids.
  # (For the regression variant use MeanSquaredError for both loss and metric.)
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
  optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
  model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
  # summary() prints by itself; wrapping it in print() only adds a "None" line.
  model.summary()

  # One checkpoint file per epoch, e.g. model00000002.h5 after epoch 2.
  model_path = os.path.join(project_folder, 'BERT/model/model{epoch:08d}.h5')

  # Changes from the previous version of this call:
  #  * tf.keras callback, to match the tf.keras model (the standalone `keras`
  #    package may be a different implementation);
  #  * `mode='auto'` -- it was misspelled `model='auto'`;
  #  * weights only: whole-model HDF5 saving is not supported for subclassed
  #    models, and the reload cell only calls load_weights() anyway.
  checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=model_path,
                                                  monitor='val_loss', save_best_only=False,
                                                  save_weights_only=True, mode='auto')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier', 'dropout_299']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_299 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# For weighted classification
# `callbacks` is documented as a list of callbacks, so the checkpoint is wrapped
# in one instead of being passed bare.
model.fit(
    [qa_input_ids, qa_attention_masks, qa_token_ids],
    scores,
    batch_size=32,
    epochs=8,
    validation_data=([val_qa_input_ids, val_qa_masks, val_qa_token_ids], val_scores),
    callbacks=[checkpoint],
    class_weight=class_weight,
)

# For regression
#model.fit([qa_input_ids, qa_attention_masks, qa_token_ids], scores, batch_size=32, epochs=12, validation_data=([val_qa_input_ids, val_qa_masks, val_qa_token_ids], val_scores), callbacks=[checkpoint])

Epoch 1/8








Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f25d11353c8>

**RELOAD THE MODEL AND USE IT TO MAKE PREDICTIONS**

In [None]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# Rebuild the architecture that was trained (bert-base-uncased with 2 labels)
# and load our fine-tuned weights from the epoch-2 checkpoint into it.
# NOTE(review): load_weights raises ValueError when the checkpoint was written
# for a different architecture (e.g. bert-large or a 1-label head) -- make sure
# the file comes from a run with this exact configuration.
reconstructed_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
reconstructed_model.load_weights(os.path.join(project_folder, 'BERT/model/model00000002.h5'))

# Same loss as in training: the model emits raw logits and the targets are
# integer class ids, so the sparse, from_logits variant is required (plain
# CategoricalCrossentropy expects one-hot targets and probabilities).
# For the regression variant use tf.keras.losses.MeanSquaredError() instead.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy()
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
reconstructed_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['dropout_147', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: ignored

In [None]:
index = val_qa_input_ids.shape[0]

In [None]:
# Run the reloaded model on the first `index` validation rows (set `index` to a
# small number, e.g. 1, for a quick smoke test).
#index = 1
prediction = reconstructed_model.predict([val_qa_input_ids[0:index], val_qa_masks[0:index], val_qa_token_ids[0:index]], batch_size=32)
# Last expression: display the raw model output.
prediction

TFSequenceClassifierOutput([('logits', array([[-0.01117945, -0.05750223],
                                    [-0.01117945, -0.05750221],
                                    [-0.01117948, -0.05750223],
                                    ...,
                                    [-0.01117939, -0.05750224],
                                    [-0.01117936, -0.05750217],
                                    [-0.01117945, -0.05750217]], dtype=float32))])

In [None]:
prediction[0]

array([[-0.01117945, -0.05750223],
       [-0.01117945, -0.05750221],
       [-0.01117948, -0.05750223],
       ...,
       [-0.01117939, -0.05750224],
       [-0.01117936, -0.05750217],
       [-0.01117945, -0.05750217]], dtype=float32)

In [None]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error

# Turn the logits into class probabilities and take the most probable class of
# each row as the predicted label.
val_labels = np.argmax(tf.nn.softmax(prediction[0]), axis=1)

# Per-class precision / recall / F1 against the true labels.
print(classification_report(val_scores, val_labels))

# Mean squared error between the true and the predicted labels.
print('Mean squared error: %.2f' % mean_squared_error(val_scores, val_labels))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94     19112
           1       0.00      0.00      0.00      2628

    accuracy                           0.88     21740
   macro avg       0.44      0.50      0.47     21740
weighted avg       0.77      0.88      0.82     21740

Mean squared error: 0.12


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
val_scores[300]

1.0

In [None]:
# Predict on the whole validation set with predict()'s default batch size.
# NOTE(review): no other cell visible in this notebook reads `val_prediction`;
# the following cells use `prediction` / `test_prediction` instead.
val_prediction = reconstructed_model.predict([val_qa_input_ids, val_qa_masks, val_qa_token_ids])

KeyboardInterrupt: ignored

In [None]:
# Flatten the model output into a 1-D array of scores.
# NOTE(review): this gives one value per row only when the model has a single
# output (regression head); with the 2-label classifier, ravel() would produce
# two interleaved logits per row -- confirm which model produced `prediction`.
predictions = np.array(prediction[0]).ravel()
print(predictions)

[0.2916161  0.15799025 0.29506275 ... 0.3297501  0.09933956 0.0955681 ]


In [None]:
# One predicted score per validation row, in row order.
prediction_list = list(predictions)
# Show a short preview only; printing every prediction floods the output.
print(prediction_list[:10])

if len(prediction_list) != len(questions_answers_val):
  raise ValueError(
      "Got %d predictions for %d validation rows"
      % (len(prediction_list), len(questions_answers_val)))

# assign() returns a new frame, so the column is not written into a slice of
# another DataFrame (which is what triggers SettingWithCopyWarning).
questions_answers_val = questions_answers_val.assign(Predictions=prediction_list)

[0.2916161, 0.15799025, 0.29506275, 0.3235647, 0.13902535, 0.2162099, 0.35359898, 0.16189384, 0.10476083, 0.25833833, 0.12292255, 0.2862709, 0.16633587, 0.2254946, 0.106374405, 0.16236734, 0.17180814, 0.19420782, 0.15354423, 0.09958378, 0.16648686, 0.2473915, 0.20841219, 0.22221386, 0.20169626, 0.19232483, 0.16299602, 0.16809244, 0.21767482, 0.13204645, 0.1101537, 0.41564563, 0.10786306, 0.21095783, 0.09039947, 0.2570459, 0.101310566, 0.12543263, 0.21234344, 0.08117717, 0.08910553, 0.20478363, 0.09685617, 0.20956346, 0.10444516, 0.31758034, 0.30665395, 0.23441438, 0.12798373, 0.2492349, 0.12750396, 0.105990924, 0.29136282, 0.15951663, 0.19989866, 0.1918755, 0.31650972, 0.16485947, 0.07953888, 0.28101358, 0.11165105, 0.1707446, 0.26458803, 0.22658858, 0.2279229, 0.09353813, 0.16597362, 0.24395792, 0.24369173, 0.25812474, 0.26350263, 0.12133454, 0.14513993, 0.19882469, 0.13674147, 0.14098136, 0.39628324, 0.20411639, 0.11114484, 0.21134429, 0.09540628, 0.31373748, 0.15049472, 0.13184178, 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [None]:
# Keep only what is needed to pick the predicted best answer of each question:
# the question id, the true best-answer flag and the predicted score.
pred_df = questions_answers_val.loc[:, ['QId', 'Best_Score', 'Predictions']].copy()
# Plain-text preview of the first 100 rows.
print(pred_df.head(100).to_string())

           QId  Best_Score  Predictions
66990  4880290           0     0.291616
66991  4880290           0     0.157990
66992  4880290           0     0.295063
66993  4880290           0     0.323565
66994  4880290           0     0.139025
66995  4880290           1     0.216210
66996  4880290           0     0.353599
66997  4880290           0     0.161894
66998  4880290           0     0.104761
66999  4881930           0     0.258338
67000  4881930           0     0.122923
67001  4881930           1     0.286271
67002  4881930           0     0.166336
67003  4881930           0     0.225495
67004  4881930           0     0.106374
67005  4881930           0     0.162367
67006  4881930           0     0.171808
67007  4881930           0     0.194208
67008  4881930           0     0.153544
67009  4881930           0     0.099584
67010  4884120           0     0.166487
67011  4884120           0     0.247392
67012  4884120           1     0.208412
67013  4884120           0     0.222214


'pred_df[\'QId\']=data[\'QId\'].tolist()[split_index:]\n#Get the index of the first best score prediction for each question\nbest_answers_pred=(pred_df.groupby(["QId"])["Prediction"].idxmax()).tolist()\ntest_data=data.iloc[split_index:]\ntest_data.reset_index(drop=True, inplace=True)\n#For every question in the test set get the index of the best answer\nbest_answers_test=(test_data.groupby(["QId"])["Score"].idxmax()).tolist()'

In [None]:
# Top-1 accuracy: for every question, pick the answer with the highest
# predicted score and check whether it is the true best answer.
qid_group = pred_df.groupby('QId')

# Index label of the highest-scoring row of each question (first one on ties).
max_indices = qid_group['Predictions'].idxmax().tolist()
# Short preview only; the full list has one entry per question.
print(max_indices[:20])

total = len(max_indices)
# Look the chosen rows up directly instead of scanning every row and testing
# membership in a list, which is quadratic in the number of rows.
# This assumes pred_df's index labels are unique.
correct = int((pred_df.loc[max_indices, 'Best_Score'] == 1).sum())

# Guard against an empty validation frame (no questions to score).
accuracy = correct / total if total else float('nan')
print(accuracy)

[66996, 67001, 67011, 67021, 67025, 67035, 67046, 67049, 67060, 67066, 67074, 67100, 67105, 67115, 67120, 67130, 67135, 67154, 67161, 67173, 67179, 67185, 67196, 67201, 67212, 67214, 67224, 67227, 67237, 67249, 67256, 67263, 67272, 67279, 67284, 67295, 67296, 67304, 67317, 67322, 67335, 67347, 67351, 67361, 67375, 67380, 67383, 67400, 67407, 67420, 67430, 67435, 67442, 67450, 67452, 67467, 67469, 67482, 67494, 67500, 67507, 67517, 67530, 67531, 67542, 67554, 67561, 67565, 67571, 67583, 67595, 67611, 67621, 67629, 67644, 67653, 67654, 67663, 67673, 67686, 67694, 67699, 67708, 67710, 67719, 67735, 67738, 67749, 67753, 67761, 67770, 67791, 67803, 67808, 67817, 67826, 67827, 67839, 67843, 67856, 67860, 67870, 67879, 67881, 67897, 67909, 67911, 67922, 67927, 67939, 67948, 67953, 67962, 67970, 67979, 67986, 67997, 68013, 68025, 68033, 68037, 68046, 68052, 68067, 68073, 68083, 68086, 68092, 68100, 68110, 68118, 68125, 68137, 68146, 68151, 68164, 68169, 68179, 68189, 68198, 68207, 68209, 68226

"idx = pred_df.groupby('QId')['Predictions'].transform(max) == pred_df['Predictions']\npred_df[idx]"

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error

# FOR CLASSIFICATION APPROACH ONLY
# f1 = f1_score(test_y, pred_labels)
#print(classification_report(val_scores, test_labels))

# FOR REGRESSION APPROACH ONLY
# MSE
# NOTE(review): the preprocessing cell creates a column named
# 'Normalized_Score' (underscore); confirm that 'Normalized Score' (space)
# really exists in the loaded CSV, otherwise this lookup raises KeyError.
# NOTE(review): `test_prediction` is not assigned by any cell visible in this
# notebook -- presumably the output of model.predict on the test inputs; verify.
print('Mean squared error: %.2f'
    % mean_squared_error(questions_answers_test['Normalized Score'], test_prediction[0]))

Mean squared error: 0.13


In [None]:
# Work on an independent copy before adding the column: assigning into a frame
# that is a slice of another DataFrame triggers SettingWithCopyWarning.
questions_answers_test = questions_answers_test.copy()
questions_answers_test['Predicted Score'] = test_prediction[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
