Text Classification with content from archaeology.org/news (with permission from the editor-in-chief of Archaeology - A publication of the Archaeological Institute of America)

In [1]:
import tensorflow as tf
import os

In [None]:
# TPU will be used for text classification with BERT (Bidirectional Encoder Representations from Transformers).

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))
strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
!pip install -q -U tensorflow-text
!pip install -q -U tf-models-official

In [None]:
pip install transformers

In [5]:
import tensorflow_hub as hub
import tensorflow_text as text  # A dependency of the preprocessing model
#import tensorflow_addons as tfa
import numpy as np

tf.get_logger().setLevel('ERROR')

In [6]:
import pandas as pd
pd.options.display.max_rows = 999
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import accuracy_score
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig

In [None]:
#bert-base-cased
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', output_attention=True)
# 14 text categories for text classification
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-cased',output_attentions=True, num_labels=14)

In [8]:
from google.colab import files
files.upload()

Saving ARCHAIC_TrainAndValidate.csv to ARCHAIC_TrainAndValidate.csv


{'ARCHAIC_TrainAndValidate.csv': b'url,category,clean text\r\nhttps://www.archaeology.org/news/8830-200630-maya-guatemala-water,Pre-Columbian,"Scientists Analyze Tikal\xe2\x80\x99s Polluted Waters. CINCINNATI, OHIO\xe2\x80\x94According to a statement released by the University of Cincinnati , a team of scientists analyzed layers of sediments collected from ten reservoirs in the Maya city of Tikal, which is located in northern Guatemala. Founded in the third century B.C., the city was abandoned by the ninth century A.D., when a series of droughts occurred. The researchers detected toxic levels of cyanobacteria in the water near Tikal\xe2\x80\x99s central temple. Biologist David Lentz said consuming this water, even if it had been boiled, would have made people ill, especially during droughts. Algae blooms would have made the water look and taste bad anyway, added anthropologist Kenneth Tankersley. Mercury in the reservoir near the city\xe2\x80\x99s palace was traced to cinnabar, a red p

In [9]:
data = pd.read_csv('ARCHAIC_TrainAndValidate.csv')

Class = set(data['category'])  # List of unique categories

CL = dict(zip(Class,list(range(14)))) # convert categories into numericals

rvCL = dict(zip(list(range(14)),Class)) # reverse lookup for numerical label using another dictionary

print(CL)

{'Iron Age': 0, 'East Asia': 1, 'South&Southeast Asia': 2, 'Pre-Human': 3, 'Pre-Columbian': 4, 'Egyptian': 5, 'Medieval': 6, 'Paleolithic': 7, 'Modern': 8, 'Neolithic': 9, 'Native American': 10, 'Near East': 11, 'Roman': 12, 'Bronze Age': 13}


In [10]:
# Function for assigning numerical category to news text
#Also check if a news piece has a sequence length longer than 384
def col_label(Table): 
 
  Table['Label'] = 0 # want integers only
  long=[]

  for i in Table.index:
      
      Table.at[i,'Label'] = CL[Table.at[i,'category']]
      tokens=bert_tokenizer.tokenize(Table.at[i,'clean text'])
      if len(tokens) > 384:
        print('i=',i, len(tokens))
        long.append([i,len(tokens)])
      else:
        pass
  return Table, long



In [11]:
# Add a 'Label' column to the data table
data, over_384 = col_label(data)

i= 819 401


In [12]:
# Assign input IDs and attention masks

def Encode(table):

  sentences=table['clean text']
  labels=table['Label']

  input_ids=[]
  attention_masks=[]

  for sent in sentences:
      bert_inp=bert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =384,pad_to_max_length = True,return_attention_mask = True)
      input_ids.append(bert_inp['input_ids'])
      attention_masks.append(bert_inp['attention_mask'])

  input_ids=np.asarray(input_ids)
  attention_masks=np.array(attention_masks)
  labels=np.array(labels)
  return input_ids, attention_masks, labels

In [13]:
# IDs and masks for news texts

input_ids, attention_masks, labels = Encode(data)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Split input IDs, labels, and masks for training and validation sets

train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.25, random_state=94)
print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape,val_inp.shape,train_label.shape,val_label.shape,train_mask.shape,val_mask.shape))

Train inp shape (708, 384) Val input shape (237, 384)
Train label shape (708,) Val label shape (237,)
Train attention mask shape (708, 384) Val attention mask shape (237, 384)


In [88]:
# Training and validation
# With early stopping when criterion is met

with strategy.scope():

  Loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  Metric = tf.keras.metrics.SparseCategoricalAccuracy()
  callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
  Optimizer = tf.keras.optimizers.Adam(learning_rate=0.0000085,epsilon=1e-08)
  bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
  bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=14)
  bert_model.compile(loss=Loss, metrics=Metric,optimizer=Optimizer)
  history=bert_model.fit([train_inp,train_mask],train_label,batch_size=32,epochs=50,validation_data=([val_inp,val_mask],val_label),callbacks=[callback])

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50


In [25]:
# Split texts (raw data) into train and validation sets so that validation set can be examined.

X_tr, X_ts, Y_tr, Y_ts=train_test_split(data['clean text'],data['Label'],test_size=0.25, random_state=94)

# Make a table for the validation set alone
val_set = pd.DataFrame(X_ts)
val_set['Label'] = Y_ts
y_true = list(Y_ts)


In [97]:
#Obtain predicted categories from the trained BERT model

pred = bert_model([val_inp,val_mask])

In [27]:
# Use argmax to get Label of predicted category
# Reverse look-up for category names

def rev_label(pred, X,y_true):

  predicted =[]
  pred_cat = []
  true_cat = []
  for j in range(len(X)):
    index= np.argmax(pred[0][j])
    predicted.append(int(index))
    cat = rvCL[index]
    true = rvCL[y_true[j]]
    pred_cat.append(cat)
    true_cat.append(true)
  return predicted, pred_cat, true_cat



In [98]:
# predicted for predicted label
# pred_cat for predicted category name
# true_cat for actual category
predicted, pred_cat, true_cat = rev_label(pred, X_ts, y_true)

val_set['predicted'] = predicted
val_set['true_cat'] = true_cat
val_set['pred_cat'] = pred_cat

In [99]:
print('Incorrectly predicted news pieces in total:',len(val_set[val_set['Label']!=val_set['predicted']]))
val_set[val_set['Label']!=val_set['predicted']]

Incorrectly predicted news pieces in total: 16


Unnamed: 0,clean text,Label,predicted,true_cat,pred_cat
622,"1,000-Year-Old Settlement Mapped in Florida. G...",10,9,Native American,Neolithic
3,"New Thoughts on Phoenician Figurines. HAIFA, I...",0,11,Iron Age,Near East
151,New Dates Push Back Arrival of Modern Humans i...,7,3,Paleolithic,Pre-Human
669,Greek Pottery Used to Track Ancient Migration....,13,0,Bronze Age,Iron Age
173,Artifacts Unearthed at Estate Site in Scotland...,8,6,Modern,Medieval
501,Study Explores the Origins of Pottery in Asia....,7,9,Paleolithic,Neolithic
586,Puerto Rico’s Possible Ancient Cooking Techniq...,4,9,Pre-Columbian,Neolithic
324,Prehistoric Fishing Artifacts Reexamined in No...,9,7,Neolithic,Paleolithic
943,"4,000-Year-Old Game Board Identified in Azerba...",13,5,Bronze Age,Egyptian
675,New Survey Will Identify Nabataean Sites in Sa...,11,0,Near East,Iron Age


In [32]:
#Upload 2 more files for test data sets
files.upload()

Saving ARCHAIC_2labels.csv to ARCHAIC_2labels.csv


{'ARCHAIC_2labels.csv': b',url,clean text,Overlap,category,2nd category,Label,2nd_Label,twin label,predicted\n0,https://www.archaeology.org/news/8852-200714-england-henge-skeleton,"Finds Spanning 4,000 Years Unearthed in Southeast England. BUCKINGHAMSHIRE, ENGLAND\xe2\x80\x94 BBC News reports that an excavation in southeast England under the direction of archaeologist Rachel Wood, ahead of the construction of a high-speed railway line, has uncovered evidence of occupation spanning a 4,000-year period. The finds include traces of a Neolithic wooden henge whose features are aligned with the winter solstice, and the 2,000-year-old remains of a man who was buried face down with his hands bound behind his back. Wood thinks he may have been murdered. \xe2\x80\x9cWe hope our osteologists will be able to shed more light on this potentially gruesome death,\xe2\x80\x9d she said. In addition to the Iron Age skeleton, the project has uncovered a gold coin dated to about 100 B.C., and a Roman-era l

In [34]:
# Contains test data set which is somewhat more difficult than any validation set
# Test data set has only 1 label per news story
Easy_test = pd.read_csv('ARCHAIC_test.csv')

In [103]:
# A place to assure there are no duplicates from the training set in the test sets
DUP = []
arch_testing = list(Easy_test['url'])
for j in range(len(data['url'])):
  if data.at[j,'url'] in arch_testing:
    DUP.append(data.at[j,'url'])
    print(j)
  else:
    pass
  #ORI.append(data.at[j,'url'])
DUP

[]

In [36]:
# Create a numerical label column for the Easy_test table
Easy_test, Easy_len=col_label(Easy_test)

# encode the texts in Easy_test
test_input_ids, test_attention_masks, test_labels = Encode(Easy_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [94]:
# Predicting news category on the test set
Easy_test_pred = bert_model([test_input_ids,test_attention_masks])

# Add predicted label and category columns to the Easy_test table
es_predicted, es_pred_cat, es_true_cat = rev_label(Easy_test_pred, Easy_test['clean text'], test_labels)

Easy_test['predicted'] = es_predicted
Easy_test['true_cat'] = es_true_cat
Easy_test['pred_cat'] = es_pred_cat

In [None]:
print('Wrongly predicted news pieces from test set:',len(Easy_test[Easy_test['Label']!=Easy_test['predicted']]))
Easy_test[Easy_test['Label']!=Easy_test['predicted']][['clean text','category','pred_cat']]

In [96]:
# Accuracy for the main test set (1 label)

ACCU = sklearn.metrics.accuracy_score(Easy_test['Label'], Easy_test['predicted'], normalize=True, sample_weight=None)
print('Easy_test Accuracy =',round((ACCU*100),2))

Easy_test Accuracy = 86.77


There are news stories which cannot be assigned to a single category as the content shows an overlap of 2 categories or more. Each news piece in the test data set below has been given 2 labels only for simplicity even though some of them are relevant to 3 categories. The news stories below are considered much more difficult to classify than the single-label test data set.

In [41]:
# A different test data set with 2 classes assgined to each news story
Hard_test = pd.read_csv('ARCHAIC_2labels.csv')

# Assign the primary label first (not necessarily the dominant class)
Hard_test, Hard_over_384 = col_label(Hard_test)

In [42]:
# Set up the numerical label column for the 2nd category
Hard_test['2nd_Label'] = 0 # want integers only
pair = []

for i in Hard_test.index:
      
  Hard_test.at[i,'2nd_Label'] = CL[Hard_test.at[i,'2nd category']]
  b = Hard_test.at[i,'2nd_Label']
  a = Hard_test.at[i,'Label']
  twin = [a,b]
  pair.append(twin)

# Add a column with a list of two numerical labels for each news piece
Hard_test['twin label'] = pair

In [89]:
Hard_input_ids,Hard_attention_masks, Hard_labels = Encode(Hard_test)
Hard_test_pred = bert_model([Hard_input_ids,Hard_attention_masks])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [90]:
# Collect predicted twin labels using the argsort function. The last two positions in the argsort results give max. and 2nd highest probability values
hd_predicted =[]

for l in range(len(Hard_test)):
  twin_index= (np.argsort(Hard_test_pred[0][l]))[-2 :]
  hd_predicted.append(twin_index)

Hard_test['predicted'] = hd_predicted  

In [104]:
# Rearrange the order of columns in the table for readability
Hard_test = Hard_test[['url','clean text','Overlap','category','2nd category','Label','2nd_Label','twin label','predicted']]

In [105]:
Hard_test[['clean text','category','2nd category','Label','2nd_Label','twin label','predicted']]

Unnamed: 0,clean text,category,2nd category,Label,2nd_Label,twin label,predicted
0,"Finds Spanning 4,000 Years Unearthed in Southe...",Neolithic,Iron Age,9,0,"[9, 0]","[9, 0]"
1,Genome Study Yields New Date for Arrival of TB...,Medieval,Neolithic,6,9,"[6, 9]","[6, 9]"
2,Early Christian Chalice Unearthed in Northern ...,Roman,Medieval,12,6,"[12, 6]","[11, 12]"
3,Multidisciplinary Study Tracks Spread of Rice ...,East Asia,South&Southeast Asia,1,2,"[1, 2]","[7, 2]"
4,Possible Ancient Longboat Recovered in Ireland...,Neolithic,Medieval,9,6,"[9, 6]","[13, 9]"
5,Three Well-Preserved Ancient Boats Unearthed i...,Roman,Medieval,12,6,"[12, 6]","[11, 12]"
6,Leather Balls Found in Ancient Graves in North...,Bronze Age,East Asia,13,1,"[13, 1]","[13, 1]"
7,Volunteers Spot New Sites in Aerial Images of ...,Roman,Medieval,12,6,"[12, 6]","[11, 12]"
8,"Snake Altar Unearthed in Turkey. ANTALYA, TURK...",Roman,Iron Age,12,0,"[12, 0]","[13, 0]"
9,Study Examines Food and Gender in Bronze Age C...,Bronze Age,East Asia,13,1,"[13, 1]","[1, 13]"


In [101]:
# Dictionary reference
CL

{'Bronze Age': 13,
 'East Asia': 1,
 'Egyptian': 5,
 'Iron Age': 0,
 'Medieval': 6,
 'Modern': 8,
 'Native American': 10,
 'Near East': 11,
 'Neolithic': 9,
 'Paleolithic': 7,
 'Pre-Columbian': 4,
 'Pre-Human': 3,
 'Roman': 12,
 'South&Southeast Asia': 2}

In [93]:
# Custom scoring - both labels correct - 1, only one label correct - 0.5

score = 0

for j in Hard_test.index:
  if (Hard_test.at[j,'predicted'])[0] in Hard_test.at[j,'twin label']:
    score += 0.5
  else:
    pass
  if (Hard_test.at[j,'predicted'])[1] in Hard_test.at[j,'twin label']:
    score += 0.5
  else:
    pass

print('two-label accuracy =', round(score/(len(Hard_test))*100,2))


two-label accuracy = 72.86
