<a href="https://colab.research.google.com/github/jmachima/Archaeology_News/blob/main/Archaeology_News_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Jasmine Machima 
jasmine.machima@gmail.com

Text Classification with content from archaeology.org/news (with permission from the editor-in-chief of Archaeology - A publication of the Archaeological Institute of America)

In [1]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from sklearn.model_selection import train_test_split
tf.get_logger().setLevel('ERROR')

In [2]:
tf.__version__

'2.8.2'

In [3]:
# TPU will be used for text classification with BERT (Bidirectional Encoder Representations from Transformers).

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))
strategy = tf.distribute.TPUStrategy(resolver)

All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


In [None]:
!pip install transformers

In [5]:
import pandas as pd
pd.options.display.max_rows = 999
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification

In [6]:
#bert-base-cased
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', output_attention=True)
# 14 text categories for text classification

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from google.colab import files

# upload ARCHAIC_TrainAndValidate.csv for training and validation

#Upload 2 more files for test data sets
# ARCHAIC_test.csv and ARCHAIC_2labels.csv
files.upload()

In [8]:
data = pd.read_csv('ARCHAIC_TrainAndValidate.csv')

Class = set(data['category'])  # List of unique categories

CL = dict(zip(Class,list(range(14)))) # convert categories into numericals

rvCL = dict(zip(list(range(14)),Class)) # reverse lookup for numerical label using another dictionary

print(CL)

{'Bronze Age': 0, 'Iron Age': 1, 'South&Southeast Asia': 2, 'Medieval': 3, 'Near East': 4, 'Roman': 5, 'Modern': 6, 'Pre-Human': 7, 'Paleolithic': 8, 'East Asia': 9, 'Pre-Columbian': 10, 'Egyptian': 11, 'Native American': 12, 'Neolithic': 13}


In [9]:
# Function for assigning numerical category to news text
#Also check if a news piece has a sequence length longer than 384
def col_label(Table): 
 
  Table['Label'] = 0 # want integers only
  long=[]

  for i in Table.index:
      
      Table.at[i,'Label'] = CL[Table.at[i,'category']]
      tokens=bert_tokenizer.tokenize(Table.at[i,'clean text'])
      if len(tokens) > 384:
        print('i=',i, len(tokens))
        long.append([i,len(tokens)])
      else:
        pass
  return Table, long

# Assign input IDs and attention masks

def Encode(table):

  sentences=table['clean text']
  labels=table['Label']

  input_ids=[]
  attention_masks=[]

  for sent in sentences:
      bert_inp=bert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =384,pad_to_max_length = True,return_attention_mask = True)
      input_ids.append(bert_inp['input_ids'])
      attention_masks.append(bert_inp['attention_mask'])

  input_ids=np.asarray(input_ids)
  attention_masks=np.array(attention_masks)
  labels=np.array(labels)
  return input_ids, attention_masks, labels


# Use argmax to get Label of predicted category (for later use)
# Reverse look-up for category names

def rev_label(pred, X,y_true):

  predicted =[]
  pred_cat = []
  true_cat = []
  for j in range(len(X)):
    index= np.argmax(pred[0][j])
    predicted.append(int(index))
    cat = rvCL[index]
    true = rvCL[y_true[j]]
    pred_cat.append(cat)
    true_cat.append(true)
  return predicted, pred_cat, true_cat

In [10]:
# Add a 'Label' column to the data table
data, over_384 = col_label(data)

i= 819 401


In [11]:
# IDs and masks for news texts

input_ids, attention_masks, labels = Encode(data)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
# Split input IDs, labels, and masks for training and validation sets

train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.25, random_state=857)
print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape,val_inp.shape,train_label.shape,val_label.shape,train_mask.shape,val_mask.shape))

Train inp shape (710, 384) Val input shape (237, 384)
Train label shape (710,) Val label shape (237,)
Train attention mask shape (710, 384) Val attention mask shape (237, 384)


In [13]:
# Training and validation
# With early stopping when criterion is met

with strategy.scope():

  Loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  Metric = tf.keras.metrics.SparseCategoricalAccuracy()
  callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
  Optimizer = tf.keras.optimizers.Adam(learning_rate=0.0000091,epsilon=5e-10)
  bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
  bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=14)
  bert_model.compile(loss=Loss, metrics=Metric,optimizer=Optimizer)
  history=bert_model.fit([train_inp,train_mask],train_label,batch_size=64,epochs=100,validation_data=([val_inp,val_mask],val_label),callbacks=[callback])

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


In [14]:
# Split texts (raw data) into train and validation sets so that validation set can be examined.

X_tr, X_ts, Y_tr, Y_ts=train_test_split(data['clean text'],data['Label'],test_size=0.25, random_state=857)

# Make a table for the validation set alone
val_set = pd.DataFrame(X_ts)
val_set['Label'] = Y_ts
y_true = list(Y_ts)

In [15]:
#Obtain predicted categories from the trained BERT model

pred = bert_model([val_inp,val_mask])

In [16]:
# predicted for predicted label
# pred_cat for predicted category name
# true_cat for actual category
predicted, pred_cat, true_cat = rev_label(pred, X_ts, y_true)

val_set['predicted'] = predicted
val_set['true_cat'] = true_cat
val_set['pred_cat'] = pred_cat

In [17]:
print('Incorrectly predicted news pieces in total:',len(val_set[val_set['Label']!=val_set['predicted']]))
val_set[val_set['Label']!=val_set['predicted']]

Incorrectly predicted news pieces in total: 23


Unnamed: 0,clean text,Label,predicted,true_cat,pred_cat
232,"Intact Coffins Discovered in Saqqara. CAIRO, E...",11,4,Egyptian,Near East
566,Mithraeum Excavated in Rome’s Port of Ostia. B...,5,11,Roman,Egyptian
827,New Thoughts on the Causes of Angkor’s Decline...,2,3,South&Southeast Asia,Medieval
934,Traces of Nicotine Detected in Ancient Dental ...,12,10,Native American,Pre-Columbian
349,New Thoughts on Turkey’s Oldest Temple Complex...,13,0,Neolithic,Bronze Age
685,Brass Balsamarium Discovered in Ancient Thrace...,5,1,Roman,Iron Age
290,Smuggling Attempt Foiled at Egypt’s Port of Al...,4,11,Near East,Egyptian
510,Skull Study Reveals Diversity Among First Nort...,12,8,Native American,Paleolithic
42,Floodwaters Threaten Sudan’s Ancient Pyramids....,11,0,Egyptian,Bronze Age
706,New Evidence for Hunter-Gatherer Trade in Nort...,12,8,Native American,Paleolithic


# Test Data and Evaluation

In [18]:
# Contains test data set which is somewhat more difficult than any validation set
# Test data set has only 1 label per news story
Easy_test = pd.read_csv('ARCHAIC_test.csv')

In [19]:
# A place to assure there are no duplicates from the training set in the test sets
DUP = []
arch_testing = list(Easy_test['url'])
for j in range(len(data['url'])):
  if data.at[j,'url'] in arch_testing:
    DUP.append(data.at[j,'url'])
    print(j)
  else:
    pass

#If there is anything in DUP, it has to be removed from the test data set.
DUP

[]

In [20]:
# Create a numerical label column for the Easy_test table
Easy_test, Easy_len=col_label(Easy_test)

# encode the texts in Easy_test
test_input_ids, test_attention_masks, test_labels = Encode(Easy_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [21]:
# Predicting news category on the test set
Easy_test_pred = bert_model([test_input_ids,test_attention_masks])

# Add predicted label and category columns to the Easy_test table
es_predicted, es_pred_cat, es_true_cat = rev_label(Easy_test_pred, Easy_test['clean text'], test_labels)

Easy_test['predicted'] = es_predicted
Easy_test['true_cat'] = es_true_cat
Easy_test['pred_cat'] = es_pred_cat

In [22]:
# Accuracy for the main test set (1 label)

ACCU = sklearn.metrics.accuracy_score(Easy_test['Label'], Easy_test['predicted'], normalize=True, sample_weight=None)
print('Easy_test Accuracy =',round((ACCU*100),2))

Easy_test Accuracy = 86.14


In [23]:
print('Wrongly predicted news pieces from test set:',len(Easy_test[Easy_test['Label']!=Easy_test['predicted']]))
Easy_test[Easy_test['Label']!=Easy_test['predicted']][['clean text','category','pred_cat']]

Wrongly predicted news pieces from test set: 131


Unnamed: 0,clean text,category,pred_cat
0,Nomadic Warriors’ Remains Unearthed in Croatia...,Medieval,Roman
1,"New Study Suggests Measles Virus is 2,500 Year...",Iron Age,East Asia
2,Canaanite Fortress Unearthed in Southern Israe...,Bronze Age,Egyptian
3,Ancient Hue Detected in Renaissance Painting. ...,Medieval,Roman
6,New Date Pinpoints Construction of Medieval Uy...,Medieval,Near East
26,Possible Traces of Antebellum Home Unearthed i...,Modern,Native American
27,Secret Passage Discovered in London’s House of...,Medieval,Modern
31,Remains of Fish Dinners May Reflect Climate Ch...,Neolithic,Paleolithic
34,Soviet Shortwave Spy Radio Unearthed in Wester...,Modern,Roman
75,16th-Century Manuscript Attributed to Queen El...,Medieval,Roman


There are news stories which cannot be assigned to a single category as the content shows an overlap of 2 categories or more. Each news piece in the test data set below has been given 2 labels only for simplicity even though some of them are relevant to 3 categories. The news stories below are considered much more difficult to classify than the single-label test data set.

In [24]:
# A different test data set with 2 classes assgined to each news story
Hard_test = pd.read_csv('ARCHAIC_2labels.csv')

# Assign the primary label first (not necessarily the dominant class)
Hard_test, Hard_over_384 = col_label(Hard_test)

In [25]:
# Set up the numerical label column for the 2nd category
Hard_test['2nd_Label'] = 0 # want integers only
pair = []

for i in Hard_test.index:
      
  Hard_test.at[i,'2nd_Label'] = CL[Hard_test.at[i,'2nd category']]
  b = Hard_test.at[i,'2nd_Label']
  a = Hard_test.at[i,'Label']
  twin = [a,b]
  pair.append(twin)

# Add a column with a list of two numerical labels for each news piece
Hard_test['twin label'] = pair

In [26]:
Hard_input_ids,Hard_attention_masks, Hard_labels = Encode(Hard_test)
Hard_test_pred = bert_model([Hard_input_ids,Hard_attention_masks])

# Collect predicted twin labels using the argsort function. The last two positions in the argsort results give max. and 2nd highest probability values
hd_predicted =[]

for l in range(len(Hard_test)):
  twin_index= (np.argsort(Hard_test_pred[0][l]))[-2 :]
  hd_predicted.append(twin_index)

Hard_test['predicted'] = hd_predicted  

# Custom scoring - both labels correct - 1, only one label correct - 0.5

score = 0

for j in Hard_test.index:
  if (Hard_test.at[j,'predicted'])[0] in Hard_test.at[j,'twin label']:
    score += 0.5
  else:
    pass
  if (Hard_test.at[j,'predicted'])[1] in Hard_test.at[j,'twin label']:
    score += 0.5
  else:
    pass

print('two-label accuracy =', round(score/(len(Hard_test))*100,2))



two-label accuracy = 70.09


In [27]:
# Rearrange the order of columns in the table for readability
Hard_test = Hard_test[['url','clean text','category','2nd category','Label','2nd_Label','twin label','predicted']]
Hard_test[['clean text','category','2nd category','Label','2nd_Label','twin label','predicted']]

Unnamed: 0,clean text,category,2nd category,Label,2nd_Label,twin label,predicted
0,"Finds Spanning 4,000 Years Unearthed in Southe...",Neolithic,Iron Age,13,1,"[13, 1]","[0, 1]"
1,Genome Study Yields New Date for Arrival of TB...,Medieval,Neolithic,3,13,"[3, 13]","[3, 13]"
2,Early Christian Chalice Unearthed in Northern ...,Roman,Medieval,5,3,"[5, 3]","[3, 5]"
3,Multidisciplinary Study Tracks Spread of Rice ...,East Asia,South&Southeast Asia,9,2,"[9, 2]","[9, 2]"
4,Possible Ancient Longboat Recovered in Ireland...,Neolithic,Medieval,13,3,"[13, 3]","[3, 13]"
5,Three Well-Preserved Ancient Boats Unearthed i...,Roman,Medieval,5,3,"[5, 3]","[1, 5]"
6,Leather Balls Found in Ancient Graves in North...,Bronze Age,East Asia,0,9,"[0, 9]","[13, 0]"
7,Volunteers Spot New Sites in Aerial Images of ...,Roman,Medieval,5,3,"[5, 3]","[7, 5]"
8,"Snake Altar Unearthed in Turkey. ANTALYA, TURK...",Roman,Iron Age,5,1,"[5, 1]","[2, 1]"
9,Study Examines Food and Gender in Bronze Age C...,Bronze Age,East Asia,0,9,"[0, 9]","[9, 0]"


In [28]:
# Dictionary reference
CL

{'Bronze Age': 0,
 'East Asia': 9,
 'Egyptian': 11,
 'Iron Age': 1,
 'Medieval': 3,
 'Modern': 6,
 'Native American': 12,
 'Near East': 4,
 'Neolithic': 13,
 'Paleolithic': 8,
 'Pre-Columbian': 10,
 'Pre-Human': 7,
 'Roman': 5,
 'South&Southeast Asia': 2}