In [2]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm


# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cpu
random seed: 1234


In [3]:
def read_data(filename):
    data_text = []
    file = open(filename)
    for line in file:
        line_list = line.strip()
        data_text.append(line_list)
    return data_text




In [4]:
df = read_data("dataset.txt")

In [5]:
from collections import defaultdict

# convert to dict for token storage
tokens_emb = defaultdict(list)

In [6]:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("microsoft/deberta-v3-base")
from transformers import DebertaTokenizerFast

tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base")  #using fast to speed up

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
#split the data into train and test
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.95, random_state=seed)


In [25]:
print(len(df))
print(len(train))

4468825
223441


In [None]:
batch_size = 24 #to make faster
for i in range(0,len(train),batch_size):
    batch = train[i:i+batch_size]

    #tokenize
    encoding = tokenizer(batch, padding = True, truncation = True, return_tensors='pt') #return as pt otherwise it throws an error about no size() for list
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    input_ids.to(device)
    attention_mask.to(device)

    with torch.no_grad():
        outputs = model(input_ids,attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

    #print(hidden_states)

    for i in range(hidden_states.size(0)):
        input_id_2 = input_ids[i]
        emb = hidden_states[i]
        tokens = tokenizer.convert_ids_to_tokens(input_id_2) #convert to words

    for j, word in enumerate(tokens):
        embedding = emb[j].cpu().numpy() #send back to cpu, otherwise will give error
        tokens_emb[word].append(embedding)
        

In [14]:
#average embeddings
avg_emb =  {}
for k,v in tokens_emb.items():
    emb_arr = np.array(v)
    mean_emb = np.mean(emb_arr)
    avg_emb[k] = mean_emb

In [15]:
print(avg_emb)

{'[CLS]': np.float32(8.155645e-05), 'American': np.float32(-0.006403081), 'Ġsilent': np.float32(-0.007398194), 'Ġfeature': np.float32(0.0034562114), 'Ġfilms': np.float32(0.0057159574), '[SEP]': np.float32(0.0015015276), '[PAD]': np.float32(-0.0021148787), 'It': np.float32(0.03697178), 'Ġcan': np.float32(-0.015007238), 'Ġalso': np.float32(-0.0063433372), 'Ġbe': np.float32(-0.0063217334), 'Ġclassified': np.float32(0.011430574), 'Ġby': np.float32(-0.015195114), 'Ġvegetation': np.float32(-7.701168e-05), 'Ġtype': np.float32(-0.0005515839), ',': np.float32(-0.020832261), 'Ġe': np.float32(-0.008898318), '.': np.float32(-0.001643011), 'g': np.float32(0.0030902065), 'Canadian': np.float32(0.04604734), 'ĠPra': np.float32(-0.021137645), 'iries': np.float32(0.009289382), 'Ste': np.float32(0.006514415), 'ppe': np.float32(-0.010073226), 'ĠRoute': np.float32(-0.04480545), 'Pl': np.float32(0.021244263), 'ains': np.float32(0.004047707), 'Roman': np.float32(0.024429658), 'ian': np.float32(-0.012628283),

Problem 2

In [16]:
glove_df = read_data('glove.6B.300d-vocabulary.txt')


In [20]:
def most_similar_words(word,emb,topn=10):
    if word not in emb:
        print("Word not in embeddings!")
        return
    else:
        word_emb = emb[word]
        similar = {}
        for k,v in emb.items():
            if  k != word:
                sim = np.dot(word_emb,v) / (norm(word_emb)*norm(v)) #normalize
                similar[k] = sim
        similar_sort = sorted(similar.items(), key=lambda item: item[1], reverse=True)
        similar_list = []
        for k,v in similar_sort.items():
            for i in range(0,topn):
                similar_list.append(tuple([k,similar_sort[k]]))
        return similar_list
                

In [None]:
print(most_similar_words('cactus',avg_emb,10))
print(most_similar_words('cake',avg_emb,10))
print(most_similar_words('angry',avg_emb,10))
print(most_similar_words('quickly',avg_emb,10))
print(most_similar_words('between',avg_emb,10))
print(most_similar_words('the',avg_emb,10))