In [5]:
import pandas as pd
import torch
import numpy as np
import tqdm
from transformers import pipeline
from transformers import BertConfig
from transformers import BertModel
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import random
import os.path
from os import path

from transformers import *
from collections import Counter

import math
import csv
import tqdm

import re

In [6]:
# Check whether PyTorch can see a CUDA device.
# NOTE(review): no cell visible in this notebook moves the models or the input
# tensors to a GPU, so this result does not appear to influence the code shown
# here — confirm whether GPU execution was intended.
torch.cuda.is_available()


True

In [7]:
class bert_model():
    """Bundle a pretrained BERT checkpoint together with its tokenizer.

    Attributes:
        name: checkpoint identifier handed to ``from_pretrained``.
        config: a freshly built ``BertConfig(output_hidden_states=True)``.
            It is stored on the instance but is not passed to the model
            loaded below.
        tokenizer: ``BertTokenizer`` loaded for ``name``.
        object: ``BertModel`` loaded for ``name`` with
            ``output_hidden_states=True``.
    """

    def __init__(self, name):
        checkpoint = name
        self.name = checkpoint
        self.config = BertConfig(output_hidden_states=True)
        self.tokenizer = BertTokenizer.from_pretrained(checkpoint)
        # Request the hidden states of every layer, not just the final one.
        self.object = BertModel.from_pretrained(checkpoint, output_hidden_states=True)

In [8]:
# Load both cased checkpoints up front; each call loads a tokenizer and a BertModel.
# NOTE(review): only `bert_base_cased` is used by the cells shown in this
# notebook — confirm `bert_large_cased` is still needed before paying for the load.
bert_base_cased= bert_model('bert-base-cased')
bert_large_cased = bert_model('bert-large-cased')

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-large-c

In [9]:
def get_id(keyword, tokenizer):
    """Return the token ids that `tokenizer` produces for `keyword`.

    The keyword is wrapped in "[CLS]" / "[SEP]" before tokenizing, and the
    first and last ids of the result are dropped again before returning.
    """
    wrapped = "[CLS] " + keyword + " [SEP]"
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(wrapped))
    # Strip the leading and trailing marker ids.
    return token_ids[1:-1]

In [10]:
def average_last_4(token_embeddings):
    """Average the last four entries along the first axis of every token.

    `token_embeddings` is iterated item by item; each item is indexed with
    -1, -2, -3 and -4, those four vectors are stacked and their element-wise
    mean is taken. Returns a list holding one averaged tensor per item.
    """
    def _mean_of_top_four(layers):
        # Stack in the order last, second-to-last, ... to mirror the indexing above.
        stacked = torch.stack([layers[-depth] for depth in (1, 2, 3, 4)], dim=0)
        return stacked.mean(dim=0)

    return [_mean_of_top_four(layers) for layers in token_embeddings]

In [11]:
def extract_word_embeddings_average_first_token(model, text, ids):
    """Return ``(word_embedding, sentence_embedding)`` for one sentence.

    The sentence is wrapped in "[CLS]" / "[SEP]", tokenized with
    ``model.tokenizer`` and passed through ``model.object``; the third element
    of the model output (``outputs[2]``) is used as the per-layer hidden states.

    * word embedding: the vector of the FIRST sub-token of the first occurrence
      of ``ids`` in the sentence, averaged over the last four layers.
    * sentence embedding: the mean over all tokens of the second-to-last layer.

    Args:
        model: object exposing ``tokenizer`` and ``object`` (see ``bert_model``).
        text: the raw sentence.
        ids: token ids of the keyword, e.g. as returned by ``get_id``.

    Returns:
        A 2-tuple of tensors, or ``(None, None)`` (after printing
        "Skip sentence") when ``ids`` is empty/None or does not occur in the
        tokenized sentence.
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = model.tokenizer.tokenize(marked_text)
    indexed_tokens = model.tokenizer.convert_tokens_to_ids(tokenized_text)

    # Locate the keyword BEFORE running the model: the search only needs the
    # token ids, so sentences that get skipped never pay for a forward pass.
    # An empty keyword is treated as "not found"; otherwise the empty slice
    # would match at position 0 and the [CLS] vector would be returned.
    width = 0 if ids is None else len(ids)
    index = None
    if width:
        index = next(
            (start for start in range(len(indexed_tokens) - width + 1)
             if indexed_tokens[start:start + width] == ids),
            None,
        )
    if index is None:
        #dataset has Albanian instead of just Albania
        print("Skip sentence")
        return None, None

    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    model.object.eval()

    with torch.no_grad():
        # NOTE(review): in current Hugging Face transformers the second
        # positional argument of BertModel.forward is `attention_mask`, so this
        # all-ones tensor is presumably received as the attention mask rather
        # than as segment ids — confirm against the installed version.
        outputs = model.object(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    #     WORD EMBEDDING
    # (layers, 1, tokens, dim) -> (layers, tokens, dim) -> (tokens, layers, dim)
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)

    token_vecs_average_last_4 = average_last_4(token_embeddings)

    # take the embedding of the first sub-token of the keyword
    word_embedding = token_vecs_average_last_4[index]

    #SENTENCE EMBEDDING
    token_vecs = hidden_states[-2][0]

    # Average the second-to-last layer over every token of the sentence.
    sentence_embedding = torch.mean(token_vecs, dim=0)

    return word_embedding, sentence_embedding

In [12]:
def extract_word_embeddings_average_tokens(model, text, ids):
    """Return ``(word_embedding, sentence_embedding)`` for one sentence.

    The sentence is wrapped in "[CLS]" / "[SEP]", tokenized with
    ``model.tokenizer`` and passed through ``model.object``; the third element
    of the model output (``outputs[2]``) is used as the per-layer hidden states.

    * word embedding: for the first occurrence of ``ids`` in the sentence, the
      mean over ALL of its sub-tokens of their last-four-layer averages.
    * sentence embedding: the mean over all tokens of the second-to-last layer.

    Args:
        model: object exposing ``tokenizer`` and ``object`` (see ``bert_model``).
        text: the raw sentence.
        ids: token ids of the keyword, e.g. as returned by ``get_id``.

    Returns:
        A 2-tuple of tensors, or ``(None, None)`` (after printing
        "Skip sentence" and the text) when ``ids`` is empty/None or does not
        occur in the tokenized sentence.
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = model.tokenizer.tokenize(marked_text)
    indexed_tokens = model.tokenizer.convert_tokens_to_ids(tokenized_text)

    # Locate the keyword BEFORE running the model: the search only needs the
    # token ids, so sentences that get skipped never pay for a forward pass.
    # An empty keyword is treated as "not found"; otherwise it would match at
    # position 0 and torch.stack would later fail on an empty list.
    width = 0 if ids is None else len(ids)
    index = None
    if width:
        index = next(
            (start for start in range(len(indexed_tokens) - width + 1)
             if indexed_tokens[start:start + width] == ids),
            None,
        )
    if index is None:
        #dataset has Albanian instead of just Albania
        print("Skip sentence", text)
        return None, None

    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    model.object.eval()

    with torch.no_grad():
        # NOTE(review): in current Hugging Face transformers the second
        # positional argument of BertModel.forward is `attention_mask`, so this
        # all-ones tensor is presumably received as the attention mask rather
        # than as segment ids — confirm against the installed version.
        outputs = model.object(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    #     WORD EMBEDDING
    # (layers, 1, tokens, dim) -> (layers, tokens, dim) -> (tokens, layers, dim)
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)

    token_vecs_average_last_4 = average_last_4(token_embeddings)

    # Collect the vectors of every sub-token of the keyword, then average them.
    subtoken_vecs = [token_vecs_average_last_4[x] for x in range(index, index + width)]
    word_embedding = torch.mean(torch.stack(subtoken_vecs, dim=0), 0)

    #SENTENCE EMBEDDING
    token_vecs = hidden_states[-2][0]

    # Average the second-to-last layer over every token of the sentence.
    sentence_embedding = torch.mean(token_vecs, dim=0)

    return word_embedding, sentence_embedding

In [13]:
def run_analysis(model, data, indices, ids):
    """Embed every sentence in `data` and keep the ones containing the keyword.

    Each row's value under label 0 is passed, together with `ids`, to
    extract_word_embeddings_average_tokens. Rows for which that call returns
    a word embedding of None are dropped.

    Returns three parallel lists: word embeddings, sentence embeddings, and
    the matching entries of `indices`. `indices` is looked up with the row
    label yielded by `iterrows()`, so this assumes `data` carries a default
    0..n-1 index aligned with `indices`.
    """
    kept_words, kept_sentences, kept_indices = [], [], []

    for label, row in data.iterrows():
        word_vec, sentence_vec = extract_word_embeddings_average_tokens(model, row[0], ids)
        if word_vec is None:
            # keyword not found in this sentence
            continue
        kept_words.append(word_vec)
        kept_sentences.append(sentence_vec)
        kept_indices.append(indices[label])

    return kept_words, kept_sentences, kept_indices

# General Method of Creating Embeddings

In [14]:
def create_embeddings(filename, data, model):
    """Compute embeddings for every sentence in `data` and append them to a CSV.

    `data` must provide the columns 'keyword', 'sentence' and 'index'. The
    rows are grouped by keyword; for each group the keyword's token ids are
    obtained with get_id and the group's sentences are embedded with
    run_analysis. One CSV row is written per sentence that run_analysis kept:

        word embedding values..., sentence embedding values..., index, keyword, number of keyword tokens

    The file is opened in append mode, so calling this twice with the same
    `filename` adds the rows a second time. The keyword of each finished
    group is printed as progress output.
    """
    append_write = 'a' # creates the file if it does not exist, otherwise appends

    with open(filename, append_write, newline='') as f:
        writer = csv.writer(f, delimiter=',')
        #change to keyword tag at times
        # Group by the scalar key rather than ['keyword']: with a one-element
        # list pandas >= 2.0 yields 1-tuples such as ('Afghanistan',), which
        # would end up verbatim in the CSV and in the progress output.
        for name, group in data.groupby('keyword'):
            #when group is not the same as keyword
            name_match = group['keyword'].values[0]
            ids = get_id(name_match, model.tokenizer)
            n_tokens = len(ids)
            word_embeddings, sentence_embeddings, row_numbers = run_analysis(
                model, pd.DataFrame(group['sentence'].values), group['index'].values, ids
            )
            for word_embedding, sentence_embedding, row_number in zip(word_embeddings, sentence_embeddings, row_numbers):
                writer.writerow(word_embedding.tolist() + sentence_embedding.tolist() + [row_number] + [name] + [n_tokens])
            print(name)

# Artificial Context

In [15]:
import pandas as pd

# Template sentences, one per row; the placeholder "COUNTRY" is substituted below.
a_sentences = pd.read_csv("../artificial_context/artifical_data.csv", header=None)
a_sentences.columns=['sentence']

country_data = pd.read_csv("../country_metadata/un_countries_meta.csv")

# Build one filled-in copy of the templates per country and concatenate them
# once. DataFrame.append inside the loop re-copied the growing frame on every
# iteration and no longer exists in pandas >= 2.0.
per_country = [
    a_sentences.assign(
        sentence=a_sentences['sentence'].apply(lambda x, country=country: x.replace("COUNTRY", country)),
        keyword=country,
    )
    for country in country_data['Name'].values
]
data = pd.concat(per_country) if per_country else pd.DataFrame()
# reset_index() keeps each row's template number in a new 'index' column,
# which create_embeddings later reads via group['index'].
data = data.reset_index()
data

Unnamed: 0,index,sentence,keyword
0,0,I am from Afghanistan.,Afghanistan
1,1,I live in Afghanistan.,Afghanistan
2,2,I hope this January I will get to travel to Af...,Afghanistan
3,3,I am interesting in traveling to Afghanistan.,Afghanistan
4,4,My friend is from Afghanistan.,Afghanistan
...,...,...,...
3855,15,I never thought to visit Sao Tome and Principe...,Sao Tome and Principe
3856,16,The news says that Sao Tome and Principe is go...,Sao Tome and Principe
3857,17,The athlete from Sao Tome and Principe has jus...,Sao Tome and Principe
3858,18,The actress was born in Sao Tome and Principe ...,Sao Tome and Principe


In [16]:
# Embed the artificial-context sentences with bert-base-cased and write one CSV
# row per sentence in which the keyword was found.
# create_embeddings opens the file in append mode, so re-running this cell
# adds the same rows to the file again.
create_embeddings("../embeddings/artificial_embeddings_bert_base_cased.csv", data, bert_base_cased)


Afghanistan
Albania
Algeria
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Australia
Austria
Azerbaijan
Bahamas
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bhutan
Bolivia
Bosnia and Herzegovina
Botswana
Brazil
Brunei
Bulgaria
Burkina Faso
Burundi
Cambodia
Cameroon
Canada
Cape Verde
Central African Republic
Chad
Chile
China
Colombia
Comoros
Costa Rica
Cote d'Ivoire
Croatia
Cuba
Cyprus
Czechia
Democratic Republic of the Congo
Denmark
Djibouti
Dominica
Dominican Republic
East Timor
Ecuador
Egypt
El Salvador
Equatorial Guinea
Eritrea
Estonia
Eswatini
Ethiopia
Fiji
Finland
France
Gabon
Gambia
Georgia
Germany
Ghana
Greece
Grenada
Guatemala
Guinea
Guinea-Bissau
Guyana
Haiti
Honduras
Hungary
Iceland
India
Indonesia
Iran
Iraq
Ireland
Israel
Italy
Jamaica
Japan
Jordan
Kazakhstan
Kenya
Kiribati
Kuwait
Kyrgyzstan
Laos
Latvia
Lebanon
Lesotho
Liberia
Libya
Liechtenstein
Lithuania
Luxembourg
Madagascar
Malawi
Malaysia
Maldives
Mali
Malta
Marshall Islands
Mauritania
Mauritius
Mexico

# Random sentences 30K

In [17]:
#Read data
# NOTE: this rebinds `data`, which an earlier cell bound to the
# artificial-context frame. If the read below fails (e.g. the file is missing),
# `data` still holds that earlier frame and any later cell using `data` will
# silently run on it.
import pandas as pd
data = pd.read_csv("country_sentences.csv", header=None)
data.columns = ['keyword', 'sentence']
# reset_index() adds the 'index' column that create_embeddings reads via group['index'].
data = data.reset_index()
data

FileNotFoundError: [Errno 2] File b'country_sentences.csv' does not exist: b'country_sentences.csv'

In [None]:
# Write embeddings for whatever `data` currently holds, using bert-base-cased.
# NOTE(review): the file name says "first", but create_embeddings -> run_analysis
# calls extract_word_embeddings_average_tokens (mean over all keyword
# sub-tokens), not the first-token variant — confirm which one is intended.
create_embeddings("country_names_first_embeddings.csv", data, bert_base_cased)