In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import xml.etree.ElementTree as ET

In [0]:
f = open('drive/My Drive/AFC/embeddings/glove.6B.100d.txt', 'r', encoding='utf8')

In [0]:
lines = f.readlines()

In [0]:
word2index = {}
embeddings = np.zeros(shape=(len(lines) + 1, len(lines[0].split()[1:])), dtype=np.float32)
for idx, line in enumerate(lines):
    line = line.split()
    word2index[line[0]] = len(word2index) + 1
    embeddings[idx + 1] = np.array(line[1:], dtype=np.float32)

In [0]:
root = ET.parse('drive/My Drive/AFC/dataset/semcor.data.xml').getroot()

In [0]:
#loading training set
ambiguous_words = set()
synset2index = {'_': 1}
pos2index = {}
f = open('drive/My Drive/AFC/dataset/semcor.gold.key.bnids.txt', 'r', encoding='utf8')
sentences = []
pos = []
labels = []
for sentence in root.findall('text/sentence'):
    s = []
    p = []
    l = []
    for word in sentence:
        w = word.attrib['lemma'].lower()
        tag = word.attrib['pos'].lower()
        s.append(word2index[w]) if w in word2index else s.append(word2index['unk'])
        if tag not in pos2index:
            pos2index[tag] = len(pos2index) + 1
        p.append(pos2index[tag])
        if 'id' in word.attrib:
            synset = f.readline().split()[1]
            
            if w in word2index:
                ambiguous_words.add(word2index[w])

            if synset not in synset2index:
                synset2index[synset] = len(synset2index) + 1
            l.append(synset2index[synset])
        else:
            l.append(synset2index['_'])
    sentences.append(s)
    pos.append(p)
    labels.append(l)

In [0]:
root = ET.parse('drive/My Drive/AFC/dataset/ALL.data.xml').getroot()

In [0]:
#loading test set
f = open('drive/My Drive/AFC/dataset/ALL.gold.key.bnids.txt', 'r', encoding='utf8')
test = {}
for sentence in root.findall('text/sentence'):
    s = []
    p = []
    l = []
    dataset = sentence.attrib['id'].split('.')[0]
    if dataset not in test:
        test[dataset] = []
    for word in sentence:
        w = word.attrib['lemma'].lower()
        tag = word.attrib['pos'].lower()
        s.append(word2index[w]) if w in word2index else s.append(word2index['unk'])
        p.append(pos2index[tag])
        if 'id' in word.attrib:
            synset = f.readline().split()[1]
            l.append(synset2index[synset]) if synset in synset2index else l.append(synset2index['_'])
        else:
            l.append(synset2index['_'])
    test[dataset].append((s, p, l))

In [0]:
import tensorflow as tf

In [11]:
tf.__version__

'2.1.0'

In [0]:
import tqdm

In [0]:
def metrics(y_true, y_pred, neg_label=1):
    assert(len(y_true) == len(y_pred))
    tp = 0
    fp = 0
    fn = 0
    for true, pred in zip(y_true, y_pred):
        if true == pred:
            if pred != neg_label:
                tp += 1
        else:
            if pred != neg_label:
                fp += 1
            else:
                fn += 1
    try:
        precision = tp / (tp + fp)
    except:
        precision = 0
    try:
        recall = tp / (tp + fn)
    except:
        recall = 0
    try:
        f1 = 2 * (precision * recall) / (precision + recall)
    except:
        f1 = 0
        
    return f1, precision, recall

In [0]:
batch_size = 16
hidden_size = 100

word_ids = tf.keras.Input([None], dtype=tf.int32)
pos_ids = tf.keras.Input([None], dtype=tf.int32)
flags = tf.keras.Input([None], dtype=tf.int32)

pretrained_emb = tf.keras.layers.Embedding(embeddings.shape[0], embeddings.shape[1], weights=[embeddings], mask_zero=True, trainable=False)(word_ids)
pos_emb = tf.keras.layers.Embedding(len(pos2index) + 1, 50, mask_zero=True)(pos_ids)
final_emb = tf.keras.layers.Concatenate(axis=-1)([pretrained_emb, pos_emb, tf.cast(tf.expand_dims(flags, axis=-1), tf.float32)])

bid = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hidden_size, return_sequences=True))(final_emb)
scores = tf.keras.layers.Dense(len(synset2index) + 1)(bid)

In [0]:
model = tf.keras.Model(inputs=[word_ids, pos_ids, flags], outputs=scores)

In [0]:
def masked_loss(x):
    
    def loss(labels, logits):
        #print('logits ',logits.get_shape())
        #print('labels ', labels.get_shape())
        #output_shape = logits.get_shape()
        #sequence = tf.count_nonzero(x, axis=-1, dtype=tf.int32)
        not_zeros = tf.cast(tf.not_equal(x, 0), tf.int32)
        sequence = tf.reduce_sum(not_zeros, axis=-1)
        losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
        #losses = tf.keras.losses.categorical_crossentropy(tf.cast(labels, tf.float32), logits, from_logits=True)
        mask = tf.sequence_mask(sequence)
        #masked_losses = tf.where(mask, losses, tf.zeros_like(losses))
        masked_losses = tf.multiply(losses, tf.cast(not_zeros, tf.float32))
        
        return masked_losses
    
    return loss
    
model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss=masked_loss(x=word_ids), 
              target_tensors=tf.keras.Input([None], dtype=tf.int32))

In [17]:
steps = len(sentences) // batch_size
epochs = 5
for epoch in range(epochs):
  avg_loss = 0
  for step in tqdm.tqdm(range(steps), desc='Epoch ' + str(epoch + 1) + '/' + str(epochs)):
      
      l = model.train_on_batch(x=[tf.keras.preprocessing.sequence.pad_sequences(sentences[step * batch_size: (step + 1) * batch_size], padding='post'), 
                                  tf.keras.preprocessing.sequence.pad_sequences(pos[step * batch_size: (step + 1) * batch_size], padding='post'), 
                                  tf.keras.preprocessing.sequence.pad_sequences([[0 if l == 1 else 1 for l in sent] for sent in labels[step * batch_size: (step + 1) * batch_size]], padding='post')], 
                               y=tf.keras.preprocessing.sequence.pad_sequences(labels[step * batch_size: (step + 1) * batch_size], padding='post'))
          
      avg_loss += l
      
      if (step > 0) and step % 500 == 0 or (step == steps - 1):
          print('Loss:', avg_loss / step)
              
  for dataset in test:
          y_true = []
          y_pred = []
          for x, z, y in test[dataset]:
              pred = model.predict([x, z, [0 if l == 1 else 1 for l in y]])
              pred = np.argmax(pred, axis=-1).reshape(-1)
              
              y_true += y
              y_pred += pred.tolist()
          
          f1, precision, recall = metrics(y_true, y_pred)
              
          print(dataset + ' results ')
          print('f1: ' + str(f1) + '\t' + 'precision: ' + str(precision) + '\t' + 'recall: ' + str(recall))

Epoch 1/5:  22%|██▏       | 501/2323 [06:01<22:53,  1.33it/s]

Loss: 2.0340850220918654


Epoch 1/5:  43%|████▎     | 1001/2323 [11:47<14:23,  1.53it/s]

Loss: 1.8421773623228073


Epoch 1/5:  65%|██████▍   | 1501/2323 [18:00<09:24,  1.46it/s]

Loss: 1.535846330548326


Epoch 1/5:  86%|████████▌ | 2001/2323 [24:11<03:48,  1.41it/s]

Loss: 1.2184740062057973


Epoch 1/5: 100%|██████████| 2323/2323 [28:22<00:00,  1.36it/s]

Loss: 1.0816962966466006





senseval2 results 
f1: 0.1313937118723604	precision: 0.16887816646562123	recall: 0.10752688172043011
senseval3 results 
f1: 0.18172268907563024	precision: 0.2064439140811456	recall: 0.1622889305816135
semeval2007 results 
f1: 0.26953125	precision: 0.21100917431192662	recall: 0.372972972972973
semeval2013 results 
f1: 0.004672897196261683	precision: 0.007575757575757576	recall: 0.0033783783783783786


Epoch 2/5:   0%|          | 0/2323 [00:00<?, ?it/s]

semeval2015 results 
f1: 0.16210739614994932	precision: 0.18306636155606407	recall: 0.14545454545454545


Epoch 2/5:  22%|██▏       | 501/2323 [06:04<23:00,  1.32it/s]

Loss: 1.113271123468876


Epoch 2/5:  43%|████▎     | 1001/2323 [11:52<14:32,  1.52it/s]

Loss: 0.9858680567294359


Epoch 2/5:  65%|██████▍   | 1501/2323 [18:04<09:27,  1.45it/s]

Loss: 0.809930143430829


Epoch 2/5:  86%|████████▌ | 2001/2323 [24:14<03:49,  1.40it/s]

Loss: 0.642880002759397


Epoch 2/5: 100%|██████████| 2323/2323 [28:24<00:00,  1.36it/s]

Loss: 0.5716643155432923





senseval2 results 
f1: 0.6664420485175203	precision: 0.5069195284469503	recall: 0.9724680432645034
senseval3 results 
f1: 0.6251993620414673	precision: 0.46445497630331756	recall: 0.9560975609756097
semeval2007 results 
f1: 0.5586776859504132	precision: 0.3939393939393939	recall: 0.9602272727272727
semeval2013 results 
f1: 0.4971590909090909	precision: 0.34224250325945244	recall: 0.9083044982698962


Epoch 3/5:   0%|          | 0/2323 [00:00<?, ?it/s]

semeval2015 results 
f1: 0.6007843137254902	precision: 0.4417531718569781	recall: 0.9387254901960784


Epoch 3/5:  22%|██▏       | 501/2323 [06:03<23:05,  1.31it/s]

Loss: 0.5349885964989662


Epoch 3/5:  43%|████▎     | 1001/2323 [11:49<14:25,  1.53it/s]

Loss: 0.48320293962210414


Epoch 3/5:  65%|██████▍   | 1501/2323 [18:01<09:26,  1.45it/s]

Loss: 0.4099647866698603


Epoch 3/5:  86%|████████▌ | 2001/2323 [24:13<03:50,  1.40it/s]

Loss: 0.3329567918134853


Epoch 3/5: 100%|██████████| 2323/2323 [28:25<00:00,  1.36it/s]

Loss: 0.30009543710018444





senseval2 results 
f1: 0.728446884081369	precision: 0.5734621250635485	recall: 0.9982300884955753
senseval3 results 
f1: 0.7303703703703704	precision: 0.576271186440678	recall: 0.9969666329625885
semeval2007 results 
f1: 0.6561514195583598	precision: 0.49056603773584906	recall: 0.9904761904761905
semeval2013 results 
f1: 0.6265286923800565	precision: 0.4564770390678547	recall: 0.9985007496251874


Epoch 4/5:   0%|          | 0/2323 [00:00<?, ?it/s]

semeval2015 results 
f1: 0.6878698224852071	precision: 0.5248306997742663	recall: 0.9978540772532188


Epoch 4/5:  22%|██▏       | 501/2323 [06:03<22:55,  1.32it/s]

Loss: 0.31578901056945324


Epoch 4/5:  43%|████▎     | 1001/2323 [11:50<14:30,  1.52it/s]

Loss: 0.2971163189187646


Epoch 4/5:  65%|██████▍   | 1501/2323 [18:03<09:33,  1.43it/s]

Loss: 0.26165388637408615


Epoch 4/5:  86%|████████▌ | 2001/2323 [24:13<03:49,  1.40it/s]

Loss: 0.21781073295511305


Epoch 4/5: 100%|██████████| 2323/2323 [28:24<00:00,  1.36it/s]

Loss: 0.199008177731645





senseval2 results 
f1: 0.7564832384566731	precision: 0.6083418107833164	recall: 1.0
senseval3 results 
f1: 0.7486278814489572	precision: 0.5982456140350877	recall: 1.0
semeval2007 results 
f1: 0.6792452830188679	precision: 0.5142857142857142	recall: 1.0
semeval2013 results 
f1: 0.6962264150943397	precision: 0.5340086830680174	recall: 1.0


Epoch 5/5:   0%|          | 0/2323 [00:00<?, ?it/s]

semeval2015 results 
f1: 0.7056245434623813	precision: 0.5451467268623025	recall: 1.0


Epoch 5/5:  22%|██▏       | 501/2323 [06:04<23:20,  1.30it/s]

Loss: 0.23554359683394432


Epoch 5/5:  43%|████▎     | 1001/2323 [11:51<14:33,  1.51it/s]

Loss: 0.22811869512125849


Epoch 5/5:  65%|██████▍   | 1501/2323 [18:05<09:26,  1.45it/s]

Loss: 0.20484205331653357


Epoch 5/5:  86%|████████▌ | 2001/2323 [24:19<03:51,  1.39it/s]

Loss: 0.17296274799481035


Epoch 5/5: 100%|██████████| 2323/2323 [28:31<00:00,  1.36it/s]

Loss: 0.15921017825707767





senseval2 results 
f1: 0.7580543272267846	precision: 0.6103763987792472	recall: 1.0
senseval3 results 
f1: 0.7466079941327466	precision: 0.5956699824458748	recall: 1.0
semeval2007 results 
f1: 0.691588785046729	precision: 0.5285714285714286	recall: 1.0
semeval2013 results 
f1: 0.7146193367585241	precision: 0.5559593023255814	recall: 1.0
semeval2015 results 
f1: 0.7365398420674802	precision: 0.5829545454545455	recall: 1.0


In [0]:
import os

In [0]:
os.mkdir('drive/My Drive/AFC/models')
model.save('drive/My Drive/AFC/models/wsd.h5')

In [21]:
for dataset in test:
    y_true = []
    y_pred = []
    for se, po, la in test[dataset]:
        for x, y in zip(se, la):
            y_pred.append(1) if x in ambiguous_words else y_pred.append(0)
            y_true.append(0) if y == 1 else y_true.append(1)
    print(dataset + ' results ')
    f1, precision, recall = metrics(y_true, y_pred, neg_label=0)
    print('f1: ' + str(f1) + '\t' + 'precision: ' + str(precision) + '\t' + 'recall: ' + str(recall))

senseval2 results 
f1: 0.7417968750000001	precision: 0.6019017432646593	recall: 0.966412213740458
senseval3 results 
f1: 0.6962106615285807	precision: 0.5487681403982451	recall: 0.9519906323185011
semeval2007 results 
f1: 0.3814133591481123	precision: 0.2387878787878788	recall: 0.9471153846153846
semeval2013 results 
f1: 0.44012944983818764	precision: 0.2872387727879057	recall: 0.9410050983248361
semeval2015 results 
f1: 0.7450287229341582	precision: 0.6077865897620764	recall: 0.9623287671232876


In [0]:
model.load_weights('drive/My Drive/AFC/models/wsd.h5')

In [24]:
target = [None] + [k for k in synset2index]
[target[num] for num in np.argmax(model.predict(x=[sentences[0], pos[0], [0 if w == 1 else 1 for w in labels[0]]]), axis=-1).reshape(-1)]

['_',
 'bn:00106124a',
 '_',
 '_',
 'bn:00083181v',
 '_',
 '_',
 'bn:00092618v',
 '_',
 'bn:00002179n',
 '_',
 '_',
 'bn:00009905n',
 '_',
 'bn:00070651n',
 'bn:00062759n',
 '_']

In [25]:
labels[0]

[1, 2, 1, 1, 3, 1, 1, 4, 1, 5, 1, 1, 6, 1, 7, 8, 1]

In [26]:
np.argmax(model.predict(x=[sentences[0], pos[0], [0 if w == 1 else 1 for w in labels[0]]]), axis=-1).reshape(-1).tolist()

[1, 2, 1, 1, 3, 1, 1, 4, 1, 5, 1, 1, 404, 1, 7834, 123, 1]