## fastText and GloVe embedding

## oov result

| No | stance dataset | embedding file | num of out of vocabularys | percentage |
| - | - | - | - | - |
| 1 | semeval2016 | glove/glove.6B.50d.txt | 1508/7929 | 0.190 |
| 2 | semeval2016 | glove/glove.twitter.27B.50d.txt | 1287/7929 | **0.162** |
| 3 | semeval2016 | fasttext/wiki-news-300d-1M.vec | 1544/7929 | 0.194 |
| 4 | semeval2016 | fasttext/crawl-300d-2M.vec | 1334/7929 | 0.168 |
| 5 | fnc-1 | glove/glove.6B.50d.txt | 3460/24209 | **0.143** |
| 6 | fnc-1 | glove/glove.twitter.27B.50d.txt | 4038/24209 | 0.167 |
| 7 | fnc-1 | fasttext/wiki-news-300d-1M.vec | 4501/24209 | 0.186 |
| 8 | fnc-1 | fasttext/crawl-300d-2M.vec | 3683/24209 | 0.152 |

In [1]:
# built-in module
import os
import pickle
import random

# 3rd-party module
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from tqdm import tqdm
import sklearn

# self-made module
import configs
import datas
import tokenizer
import embeddings

In [58]:
# hyperparameter setting
embedding_no = 8
config = configs.Config(stance_dataset='fnc-1',
                        embedding_file='fasttext/crawl-300d-2M.vec',
                        embedding_dim=300,
                        random_seed=7)

In [59]:
# define save path
save_path = f'embedding/{embedding_no}'
try:
    os.makedirs(save_path)
except:
    pass

In [60]:
# initialize random seed
os.environ['PYTHONHASHSEED'] = str(config.random_seed)
random.seed(config.random_seed)
np.random.seed(config.random_seed)
_ = torch.manual_seed(config.random_seed)

In [61]:
# load data
if config.stance_dataset == 'semeval2016':
    data_df = datas.load_dataset('semeval2016_train')
elif config.stance_dataset == 'fnc-1':
    data_df = datas.load_dataset('fnc_train')

loading FNC-1 training data: 100%|██████████| 49972/49972 [00:11&lt;00:00, 4341.16it/s]


In [62]:
# initialize tokenizer and embedding
tokenizers = tokenizer.Tokenizer()
embedding = embeddings.Embedding(config.embedding_dim,
                                 config.random_seed)

# get all tokens and embeddings
all_sentence = []
all_sentence.extend(data_df['target'].drop_duplicates().tolist())
all_sentence.extend(data_df['claim'].drop_duplicates().tolist())

tokenizers.get_all_tokens(all_sentence)
embedding.load_embedding(f'data/embedding/{config.embedding_file}',
                         tokenizers.all_tokens)

# build vocabulary dictionary
tokenizers.build_dict(all_sentence, embedding.word_dict)

get all tokens: 100%|██████████| 3312/3312 [00:00&lt;00:00, 101038.90it/s]
load embedding: 100%|█████████▉| 1999995/1999996 [00:21&lt;00:00, 94841.51it/s]


In [64]:
# get number of oov (out of vocabulary)
token_to_id = tokenizers.token_to_id
oov = [pair[0] for pair in token_to_id.items()
       if pair[1] == tokenizers.unk_token_id]

print(f'dataset: {config.stance_dataset}\n'
      f'embedding file: {config.embedding_file}\n'
      f'number of oov: {len(oov)} / {len(token_to_id)} ({len(oov)/len(token_to_id)})')

dataset: fnc-1
embedding file: fasttext/crawl-300d-2M.vec
number of oov: 3683 / 24209 (0.15213350406873477)


## ELMo embedding

## oov result

In [66]:
# built-in module
import os
import pickle
import random

# 3rd-party module
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from tqdm import tqdm
import sklearn

# 3rd-party module for AllenNLP
from allennlp.data import Token, Vocabulary
from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import (
    SingleIdTokenIndexer,
    TokenCharactersIndexer,
    ELMoTokenCharactersIndexer,
)
from allennlp.data.tokenizers import (
    SpacyTokenizer,
    WhitespaceTokenizer,
)
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import (
    Embedding,
    TokenCharactersEncoder,
    ElmoTokenEmbedder,
)
from allennlp.nn import util as nn_util

# self-made module
import configs
import datas
import tokenizer
import embeddings

In [5]:
# hyperparameter setting
embedding_no = 9
config = configs.Config(stance_dataset='semeval',
                        embedding_file='elmo/medium',
                        embedding_dim=256,
                        random_seed=7)

In [6]:
# initialize random seed
os.environ['PYTHONHASHSEED'] = str(config.random_seed)
random.seed(config.random_seed)
np.random.seed(config.random_seed)
_ = torch.manual_seed(config.random_seed)

In [7]:
# load data
if config.stance_dataset == 'semeval2016':
    data_df = datas.load_dataset('semeval2016_train')
elif config.stance_dataset == 'fnc-1':
    data_df = datas.load_dataset('fnc_train')

In [77]:
# define file path
if 'medium' in config.embedding_file:
    options_file = 'data/embedding/elmo/medium/elmo_2x2048_256_2048cnn_1xhighway_options.json'
    weight_file = 'data/embedding/elmo/medium/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'

In [80]:
# tokenizer
tokenizer = WhitespaceTokenizer()

# Indexer (token to id)
token_indexer = ELMoTokenCharactersIndexer()

# vocabulary
vocab = Vocabulary()

In [114]:
text = "Thissssscaac is some text ."
tokens = tokenizer.tokenize(text)
print("ELMo tokens:", tokens)

ELMo tokens: [Thissssscaac, is, some, text, .]


In [107]:
text_field = TextField(tokens, {'elmo_tokens': token_indexer})
text_field.index(vocab)

In [108]:
token_tensor = text_field.as_tensor(padding_lengths)
print("ELMo tensors:", token_tensor)

ELMo tensors: {&#39;elmo_tokens&#39;: {&#39;elmo_tokens&#39;: tensor([[259,  85, 105, 106, 116, 116, 116, 116, 116, 100,  98,  98, 100, 260,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 106, 116, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 116, 112, 110, 102, 260, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 117, 102, 121, 117, 260, 261, 261, 261, 261, 261, 261, 261, 261

In [97]:
elmo_embedding = ElmoTokenEmbedder(options_file=options_file,
                                   weight_file=weight_file)

In [98]:
embedder = BasicTextFieldEmbedder(token_embedders={'elmo_tokens': elmo_embedding})

In [109]:
tensor_dict = text_field.batch_tensors([token_tensor])
embedded_tokens = embedder(tensor_dict)
print("ELMo embedded tokens:", embedded_tokens)

ELMo embedded tokens: tensor([[[ 0.0000,  0.0330,  0.0000,  ...,  0.0000, -0.0000, -0.0000],
         [-0.0000, -0.2036,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [-0.2742,  0.0000, -0.6482,  ..., -0.0000,  0.1208,  0.0000],
         [-0.0000, -0.0000, -0.0000,  ..., -0.5129,  0.0000, -0.8754],
         [ 0.4370,  0.0000,  0.7828,  ...,  0.0000,  0.0000, -0.0000]]],
       grad_fn=&lt;CatBackward&gt;)


In [110]:
embedded_tokens[0][0]

tensor([ 0.0000e+00,  3.3030e-02,  0.0000e+00, -2.6042e-01,  0.0000e+00,
        -0.0000e+00, -2.9335e-01, -6.4144e-01,  0.0000e+00, -9.9162e-01,
        -0.0000e+00, -7.2166e-01,  5.6376e-01,  0.0000e+00, -1.3957e-01,
        -1.9405e-01, -1.8396e-01,  3.4711e-02,  6.2280e-01,  0.0000e+00,
        -9.1091e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        -0.0000e+00,  0.0000e+00, -0.0000e+00, -0.0000e+00,  0.0000e+00,
         1.1339e-01, -1.1821e+00,  0.0000e+00, -0.0000e+00,  5.9862e-01,
        -2.8574e-01, -9.8317e-01, -0.0000e+00, -1.2612e-01,  9.9614e-01,
         6.5999e-01,  0.0000e+00,  3.0590e-01, -5.4797e-01, -5.0351e-01,
         0.0000e+00, -0.0000e+00,  0.0000e+00, -0.0000e+00, -0.0000e+00,
        -8.8225e-01,  0.0000e+00, -1.0244e+00, -5.8098e-01,  3.0376e-01,
         0.0000e+00, -0.0000e+00, -0.0000e+00, -1.1822e+00, -0.0000e+00,
        -6.6956e-01, -0.0000e+00, -4.6374e-02, -7.3937e-02,  2.5804e-01,
        -0.0000e+00, -1.8591e-01, -2.8674e-01, -2.8