# Learn and evaluate a classifier for scigraph articles using surface form annotations.
Articles are classified into any of the 22 first-level categories under which they are categorized in SciGraph. We have previously extracted from SciGraph the papers published in 2011, and for each paper we consider only the text of the title and the abstract. 

## Import the required libraries

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM
from keras.models import Model, Sequential
from keras.metrics import categorical_accuracy
from keras.utils import to_categorical
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from tqdm import tqdm
from random import sample
import numpy as np
import json
import re
import h5py
import mmap

# Download data: surface form embeddings, and scigraph papers

## Downloading data from Google Drive

In [3]:
pip install gdown



In [4]:
# gdown is imported next to its use; the preceding cell installs the package.
import gdown

# Google Drive location of the zipped dataset + embeddings, and the local file name to save it under.
url = 'https://drive.google.com/uc?id=1MRL2mYnJUb-qGitAZ53BFeNi4HLyqKlN'
out = 'data-embeddings.zip'

# quiet=False keeps the download progress visible; the call is the cell's last
# expression, so its return value is displayed as the cell output.
gdown.download(url, out, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1MRL2mYnJUb-qGitAZ53BFeNi4HLyqKlN
To: /content/data-embeddings.zip
1.13GB [00:45, 24.8MB/s]


'data-embeddings.zip'

Unzip the content and set the variables that point to the data and embeddings.

In [5]:
!unzip data-embeddings.zip

Archive:  data-embeddings.zip
  inflating: data/scigraph-2011-sf.json  
  inflating: embeddings/row_embedding.tsv  


In [0]:
# Paths of the two files extracted from data-embeddings.zip,
# relative to the notebook's working directory.
dataset_file = "data/scigraph-2011-sf.json"
embeddings_file = "embeddings/row_embedding.tsv"

## Read and prepare the classification dataset
To speed up the classifier learning process we take a sample of the whole dataset. If you want to use the whole dataset, please comment out the second-to-last line below (the one that calls `sample`). 



In [14]:
sample_size = 10000
texts = []
labels_index = {}
labels = []
word_index = {}

# Read the articles dataset that will be used to train and validate the model.
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open(dataset_file, "r", encoding="utf-8", errors="surrogatepass") as file:
  dataset = json.load(file)

# Prepare data: one set of label ids and one text per article.
for doc in tqdm(dataset, total=len(dataset), desc="extracting labels"):
  # Keep only the 2-character field codes, that is, the most general ones.
  fields = [x for x in doc["fieldcodes"] if len(x) == 2]
  # setdefault() assigns the next free label id the first time a field code is
  # seen and returns the previously assigned id otherwise.
  label_ids = {labels_index.setdefault(field, len(labels_index)) for field in fields}
  labels.append(label_ids)
  # Text of the article as stored under the "sf" key of the record.
  texts.append(doc["sf"])

#To speed up the training process we obtain a sample of sample_size of the data.
#To work with the full dataset comment the line below.
#min() keeps sample() from raising ValueError when sample_size exceeds the number of papers.
labels, texts = zip(*sample(list(zip(labels, texts)), min(sample_size, len(texts))))
print('\n'+str(len(texts))+' papers')



extracting labels: 100%|██████████| 187795/187795 [00:00<00:00, 350375.49it/s]


10000 papers





#### Get the data and label tensors used for k-fold cross-validation (using the tokenizer)

In [15]:
max_nb_words = 40000
max_sequence_length = 1000
# Standard Keras tokenizer filters except the + symbol, which is used in our
# surface forms to glue multiword expressions together.
tokenizer_filters = '!"#$%&()*,-./:;<=>?@[\\]^_`{|}~\t\n'

# Tokenize the texts of all the articles
tokenizer = Tokenizer(num_words=max_nb_words, filters=tokenizer_filters)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Get the vocabulary index, restricted to the indices below max_nb_words
word_index = {w: c for (w, c) in tokenizer.word_index.items() if c < max_nb_words}

print("Found %s unique tokens." % len(word_index))

# Fit the sequences into the maximum length: shorter ones are padded and longer
# ones are truncated, both at the end ("post").
data = pad_sequences(sequences, maxlen=max_sequence_length, padding="post", truncating="post")
print("Shape of data tensor:", data.shape)

# Transform the labels into a binary vector, with one element for each category
mlb = MultiLabelBinarizer()
labels_cat = mlb.fit_transform(labels)

print("Shape of label tensor:", labels_cat.shape)

Found 39999 unique tokens.
Shape of data tensor: (10000, 1000)
Shape of label tensor: (10000, 22)
Found 39999 unique tokens.


Glance at the vocabulary gathered by the tokenizer. Note that the surface forms of multiword expressions use the + symbol to concatenate the single words.

In [16]:
# Last 20 entries of the vocabulary (iterating a dict yields its keys in insertion order).
list(word_index)[-20:]

['moseri',
 'gilbert',
 'acanthocephalan',
 'obscurus',
 'rbcs',
 'inelastic+collision',
 'percutaneous+coronary+intervention',
 'october+2006',
 'pisaura',
 'mirabilis',
 'slits',
 'polaritons',
 'spps',
 'cell+division+cycle',
 'side+effect',
 'photochemical+reaction',
 'sulfosalicylaldehyde',
 'displace',
 'israeli+ibd',
 'electrophiles']

## Surface form embeddings
In the following we use pre-trained Vecsigrafo (surface form) embeddings learned from SciGraph. 

### Load vecsigrafo embeddings

In [17]:
# Number of components of each embedding vector in the embeddings file.
dimensions = 300

def get_num_lines(file_path):
    """Count the lines of a file.

    The file is opened read-only in binary mode and closed before returning,
    so no write permission is required and no handle is leaked. A final line
    without a trailing newline is counted; an empty file yields 0.

    Args:
        file_path: path of the file to inspect.

    Returns:
        The number of lines in the file as an int.
    """
    with open(file_path, "rb") as fp:
        return sum(1 for _ in fp)

# Line count of the embeddings file, used only to size the progress bar.
file_size = get_num_lines(embeddings_file)
print("loading file " + embeddings_file)

# Load the word embeddings into a {surface form: vector} dictionary.
embeddings_index = {}

# A single handle is opened with an explicit encoding and closed by the `with` block.
with open(embeddings_file, "r", encoding="utf-8", errors="surrogatepass") as infile:
    for line in tqdm(infile, total=file_size, desc="Embeddings file"):
        values = line.split()
        # The last `dimensions` fields are the vector; everything before them
        # is the (possibly multiword) surface form.
        wordlimit = len(values) - dimensions
        if wordlimit < 1:
            # Blank or malformed line: there is no token in front of the vector.
            continue
        # Multiword surface forms are glued with "+", matching the tokenizer vocabulary.
        word = "+".join(values[:wordlimit])
        embeddings_index[word] = np.asarray(values[wordlimit:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))


Embeddings file:   0%|          | 1302/692224 [00:00<00:53, 13014.30it/s]

loading fileembeddings/row_embedding.tsv


Embeddings file: 100%|██████████| 692224/692224 [00:48<00:00, 14275.49it/s]

Found 692214 word vectors.





Glance at some of the surface forms contained in the embeddings file.

In [21]:
# Surface forms (keys of the embeddings index) in insertion order; show a slice of 100 of them.
l = list(embeddings_index)
print(l[10000:10100])


['irrigated', 'sequelae', 'chronic+diseases', 'landolt-börnstein+homepage+volume+iv', 'tick', 'vte', 'until+now', 'bootstrap', '2+%', 'authentic', 'single-crystal', 'social+sciences', 'streptomyces', 'florida', 'revisited', 'septic', 'vertically', 'object-oriented', 'ast', 'inevitably', 'proteasome', 'relatedness', 'rms', 'sixth', 'gesture', 'microalgae', 'at+large', 'constrain', 'locomotor', 'gather', 'rituximab', 'interspecific', 'k.', 'cows', 'minutes', '75+%', 'norwegian', 'resource+management', 'soliton', 'anodic', 'everyone', 'implements', 'outsourcing', 'incoming', 'nationally', 'traction', 'workpiece', 'oscillators', 'bear', 'empire', 'breeds', 'ppar_', 'cerebrovascular', 'striatum', 'propositions', 'seminal', 'tha', 'compressive+strength', 'specialty', 'fiscal', 'h2s', 'methodspatients', 'replicated', 'bus', 'glial', 'goats', 'obligations', 'scanner', 'epitope', 'input+data', 'om', 'electron+beam', 'surface+layer', 'diesel', 'air+quality', 'empowerment', 'nocturnal', 'focus+gr

## Create the Embedding layer

In [22]:
# Create a matrix with all the embeddings corresponding to all the vocabulary words
embedding_dimensions = len(list(embeddings_index.values())[0])

#dictionary_size = len(word_index) 
dictionary_size = list(word_index.values())[-1]

print("dim ->"+str(embedding_dimensions))
print("word_index len ->"+str(len(word_index) + 1))
print("last position in the dictionary ->"+ str(dictionary_size))

embedding_matrix = np.zeros((dictionary_size + 1, embedding_dimensions))
for word, i in word_index.items():    
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:        
        # Words not found in the embedding index will be all-zeros        
        embedding_matrix[i] = embedding_vector
        
# Create an embedding layer based on the embedding matrix
# This layer is not trainable: the embeddings will not be changed during training time
embedding_layer = Embedding(dictionary_size + 1,
                                 embedding_dimensions,
                                 weights = [embedding_matrix],
                                 input_length = max_sequence_length,
                                 trainable = False)

dim ->300
word_index len ->40000
last position in the dictionary ->39999



## Train a Convolutional Neural Network

In [23]:
# Per-fold weighted scores, aggregated after the cross-validation loop.
precisions = []
recalls = []
f1s = []
kfold = KFold(n_splits=10, shuffle=True)

# Size the output layer from the binarized label matrix that is actually used as
# the training target. labels_index is built over the whole dataset before
# sampling, so it can contain categories that are absent from the sample, which
# would make the model output wider than labels_cat.
num_labels = labels_cat.shape[1]

for train, test in kfold.split(data, labels_cat):
  # Define, train and validate the neural network model. A new model is built
  # for every fold; the frozen (non-trainable) embedding layer is shared.
  sequence_input = Input(shape=(max_sequence_length,), dtype="int32")
  embedded_sequences = embedding_layer(sequence_input)
  # Three blocks of Conv1D (128 filters, kernel size 5) followed by max pooling.
  x = Conv1D(128, 5, activation="relu")(embedded_sequences)
  x = MaxPooling1D(5)(x)
  x = Conv1D(128, 5, activation="relu")(x)
  x = MaxPooling1D(5)(x)
  x = Conv1D(128, 5, activation="relu")(x)
  # With max_sequence_length = 1000 and the default (unpadded) convolutions,
  # 35 time steps remain here, so this pooling spans the whole remaining sequence.
  x = MaxPooling1D(35)(x)
  x = Flatten()(x)
  x = Dense(128, activation="relu")(x)
  # One independent sigmoid per category: an article can have several labels.
  preds = Dense(num_labels, activation="sigmoid")(x)
  model = Model(sequence_input, preds)
  # NOTE(review): categorical_accuracy is designed for single-label targets;
  # treat the accuracy logged during training as indicative only and rely on
  # the precision/recall/F1 figures reported below.
  model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=[categorical_accuracy])
  model.fit(data[train], labels_cat[train], validation_data=(data[test], labels_cat[test]),
            epochs=5, batch_size=128)

  # Evaluate the model assigning zeros and ones according to a 0.5 threshold
  pred = model.predict(data[test], batch_size=128)
  pred = (pred >= 0.5).astype(pred.dtype)
  print(classification_report(labels_cat[test], pred, digits=4))
  precisions.append(precision_score(labels_cat[test], pred, average="weighted"))
  recalls.append(recall_score(labels_cat[test], pred, average="weighted"))
  f1s.append(f1_score(labels_cat[test], pred, average="weighted"))

# Mean and standard deviation of each score over the 10 folds.
print("Precision: %.4f (+/- %.4f)" % (np.mean(precisions), np.std(precisions)))
print("Recall: %.4f (+/- %.4f)" % (np.mean(recalls), np.std(recalls)))
print("F1 Score: %.4f (+/- %.4f)" % (np.mean(f1s), np.std(f1s)))












Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 9000 samples, validate on 1000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0     0.8595    0.9007    0.8796       292
           1     0.8871    0.3929    0.5446       140
           2     0.7864    0.7788    0.7826       104
           3     1.0000    0.0526    0.1000        38
           4     0.8214    0.6133    0.7023        75
           5     0.6667    0.7742    0.7164        31
           6     0.7861    0.8395    0.8119       162
           7     0.0000    0.0000    0.0000        18
           8     0.2857    0.3333    0.3077         6
           9     0.7778    0.1061    0.1867        66
          10     0.7105    0.5294    0.6067        51
          11     0.0000    0.0000    0.0000        19
          12     0.0000    0.0000    0.0000         6
          13     0.4103    0.8889    0

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Train on 9000 samples, validate on 1000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0     0.8754    0.8613    0.8683       310
           1     0.8261    0.3140    0.4551       121
           2     0.5353    0.8835    0.6667       103
           3     1.0000    0.0278    0.0541        36
           4     0.7500    0.4390    0.5538        82
           5     0.7381    0.7561    0.7470        41
           6     0.9043    0.4802    0.6273       177
           7     0.0000    0.0000    0.0000        15
           8     0.0000    0.0000    0.0000        17
           9     0.5323    0.6226    0.5739        53
          10     0.6000    0.5581    0.5783        43
          11     1.0000    0.0500    0.0952        20
          12     0.0000    0.0000    0.0000         4
          13     0.6667    0.5833    0.6222        24
          14     0.0000    0.0000    0.0000        14
          15     0.0000    0.0000    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Train on 9000 samples, validate on 1000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

           0     0.9529    0.7642    0.8482       318
           1     0.7143    0.6115    0.6589       139
           2     0.6284    0.8611    0.7266       108
           3     0.0000    0.0000    0.0000        37
           4     0.5263    0.7692    0.6250        65
           5     0.8000    0.2963    0.4324        27
           6     0.8380    0.7532    0.7933       158
           7     0.0000    0.0000    0.0000        12
           8     0.2500    0.3529    0.2927        17
           9     0.8750    0.1228    0.2154        57
          10     0.9167    0.2245    0.3607        49
          11     0.4444    0.1739    0.2500        23
          12     0.0000    0.0000    0.0000         4
          13     0.6154    0.2759    0.3810        29
          14     0.0000    0.0000    0.0000        12
          15     0.0000    0.0000    0.0000         3
          16     0.6000    0.3750    0.4615         8
          17     0.0000    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

           0     0.8907    0.8445    0.8670       328
           1     0.7545    0.6336    0.6888       131
           2     0.7969    0.5152    0.6258        99
           3     0.4923    0.7619    0.5981        42
           4     0.6449    0.8023    0.7150        86
           5     0.7500    0.7241    0.7368        29
           6     0.8403    0.7423    0.7883       163
           7     0.0000    0.0000    0.0000        14
           8     0.0000    0.0000    0.0000        22
           9     0.5455    0.5769    0.5607        52
          10     0.6364    0.7000    0.6667        40
          11     0.0000    0.0000    0.0000        15
          12     0.0000    0.0000    0.0000         5
          13     1.0000    0.0714    0.1333        28
          14     0.0000    0.0000    0.0000        11
          15     0.0000    0.0000    0.0000         4
          16     0.2308    0.5000    0.3158         6
          17     0.0000    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

           0     0.8953    0.8604    0.8775       308
           1     0.8462    0.3259    0.4706       135
           2     0.6557    0.7692    0.7080       104
           3     0.4118    0.2692    0.3256        26
           4     0.6316    0.6076    0.6194        79
           5     0.5143    0.6667    0.5806        27
           6     0.9175    0.5298    0.6717       168
           7     0.0000    0.0000    0.0000        21
           8     0.8889    0.5000    0.6400        16
           9     0.6667    0.5098    0.5778        51
          10     0.8056    0.5472    0.6517        53
          11     0.0000    0.0000    0.0000        17
          12     0.0000    0.0000    0.0000         7
          13     0.7500    0.0938    0.1667        32
          14     0.0000    0.0000    0.0000        14
          15     0.0000    0.0000    0.0000         3
          16     1.0000    0.4545    0.6250        11
          17     0.0000    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

           0     0.7455    0.9452    0.8336       310
           1     0.9512    0.3145    0.4727       124
           2     0.6496    0.7170    0.6816       106
           3     0.6452    0.5882    0.6154        34
           4     0.7400    0.4744    0.5781        78
           5     0.9167    0.2895    0.4400        38
           6     0.8759    0.7643    0.8163       157
           7     0.0000    0.0000    0.0000        21
           8     1.0000    0.2400    0.3871        25
           9     0.5667    0.3036    0.3953        56
          10     0.4643    0.6190    0.5306        42
          11     0.3889    0.4118    0.4000        17
          12     0.0000    0.0000    0.0000         7
          13     0.4737    0.3600    0.4091        25
          14     0.0000    0.0000    0.0000        12
          15     0.0000    0.0000    0.0000        12
          16     1.0000    0.2500    0.4000        12
          17     0.0000    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

           0     0.8806    0.8750    0.8778       312
           1     0.6084    0.7768    0.6824       112
           2     0.7778    0.5138    0.6188       109
           3     1.0000    0.0541    0.1026        37
           4     0.6456    0.7969    0.7133        64
           5     0.8500    0.4857    0.6182        35
           6     0.8526    0.4880    0.6207       166
           7     0.0000    0.0000    0.0000        12
           8     0.0000    0.0000    0.0000        26
           9     0.9286    0.1940    0.3210        67
          10     0.7714    0.5510    0.6429        49
          11     1.0000    0.1200    0.2143        25
          12     0.0000    0.0000    0.0000         7
          13     1.0000    0.1667    0.2857        30
          14     0.0000    0.0000    0.0000        13
          15     0.0000    0.0000    0.0000         3
          16     0.0000    0.0000    0.0000        14
          17     0.0000    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

           0     0.9451    0.8543    0.8974       302
           1     0.7179    0.4786    0.5744       117
           2     0.6950    0.8522    0.7656       115
           3     0.4211    0.7619    0.5424        42
           4     0.6349    0.6250    0.6299        64
           5     0.8571    0.2791    0.4211        43
           6     0.8923    0.6480    0.7508       179
           7     0.0000    0.0000    0.0000        12
           8     0.0000    0.0000    0.0000        20
           9     0.7241    0.4118    0.5250        51
          10     0.9091    0.2041    0.3333        49
          11     0.0000    0.0000    0.0000        14
          12     0.0000    0.0000    0.0000         8
          13     0.6000    0.0750    0.1333        40
          14     0.0000    0.0000    0.0000        14
          15     0.0000    0.0000    0.0000         3
          16     0.0000    0.0000    0.0000         9
          17     0.0000    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

           0     0.9536    0.7459    0.8370       303
           1     0.8913    0.2847    0.4316       144
           2     0.7937    0.4902    0.6061       102
           3     0.4386    0.8621    0.5814        29
           4     0.3242    0.9219    0.4797        64
           5     0.9000    0.6000    0.7200        30
           6     0.7725    0.8670    0.8170       188
           7     0.0000    0.0000    0.0000        16
           8     0.8000    0.2105    0.3333        19
           9     0.7297    0.4355    0.5455        62
          10     0.8462    0.2075    0.3333        53
          11     0.0000    0.0000    0.0000        13
          12     0.0000    0.0000    0.0000         8
          13     1.0000    0.1304    0.2308        23
          14     0.0000    0.0000    0.0000         9
          15     0.0000    0.0000    0.0000         5
          16     0.0000    0.0000    0.0000         9
          17     0.0000    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
