In [1]:
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from itertools import islice
from tools import Dictionary
from glove import GloVeModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
stopwords = nltk.corpus.stopwords.words('english')

def clear_stopwords(tokenized_text: list[str], stopwords) -> list[str]:
    return [w for w in tokenized_text if w not in stopwords]

def clear_special_characters(tokenized_text: list[str]) -> list[str]:
    return [re.sub(r"[^A-Za-z]+",'', text) for text in tokenized_text]

def preprocess(text: str) -> list[str]:
    tokenized_text = word_tokenize(text)
    tokenized_text = clear_stopwords(tokenized_text, stopwords)
    tokenized_text = clear_special_characters(tokenized_text)
    return " ".join([letter for letter in tokenized_text if len(letter)>2]).strip().lower()

In [3]:
def show_n(n, iterable):
    return list(islice(iterable, n))

In [4]:
df = pd.read_csv('./data.csv')

In [5]:
df.head()

Unnamed: 0,title,label
0,20 BuzzFeed Articles It's Probably OK That I N...,clickbait
1,Pigskin A Blanket: NFL Conference Championship...,clickbait
2,When Relationship Introductions Get Awkward,clickbait
3,You Might Be Food Shopping Wrong,clickbait
4,27 Underrated Makeup Brands Everyone Should Kn...,clickbait


In [6]:
cleaned_sentences = df.apply(lambda x: preprocess(x["title"]), axis=1)

In [7]:
show_n(5, cleaned_sentences)

['buzzfeed articles probably that never finished',
 'pigskin blanket nfl conference championship picks',
 'when relationship introductions get awkward',
 'you might food shopping wrong',
 'underrated makeup brands everyone should know about']

In [8]:
tokenized_sentences = [word_tokenize(i) for i in cleaned_sentences]

In [9]:
show_n(5, tokenized_sentences)

[['buzzfeed', 'articles', 'probably', 'that', 'never', 'finished'],
 ['pigskin', 'blanket', 'nfl', 'conference', 'championship', 'picks'],
 ['when', 'relationship', 'introductions', 'get', 'awkward'],
 ['you', 'might', 'food', 'shopping', 'wrong'],
 ['underrated', 'makeup', 'brands', 'everyone', 'should', 'know', 'about']]

In [10]:
dct = Dictionary(tokenized_sentences)
corpus = dct.corpus(tokenized_sentences)

In [11]:
dct.vocab_size

305

In [12]:
show_n(10, dct.word2idx.items())

[('brands', 0),
 ('restore', 1),
 ('ted', 2),
 ('bell', 3),
 ('reasons', 4),
 ('why', 5),
 ('demonstrate', 6),
 ('dies', 7),
 ('united', 8),
 ('never', 9)]

In [13]:
EMBEDDING_SIZE = 128
CONTEXT_SIZE = 3
NUM_EPOCH = 10
LEARNING_RATE = 0.01


In [14]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = GloVeModel(EMBEDDING_SIZE, CONTEXT_SIZE, dct.vocab_size)
model.to(device)

GloVeModel(
  (_focal_embeddings): Embedding(305, 128)
  (_context_embeddings): Embedding(305, 128)
  (_focal_biases): Embedding(305, 1)
  (_context_biases): Embedding(305, 1)
)

In [15]:
model.fit(corpus)

In [16]:
cooccurance_matrix = model.get_coocurrance_matrix()
# model.train(NUM_EPOCH, device, learning_rate=LEARNING_RATE)

In [17]:
cooccurance_matrix[:10]

[(156, 89, 1.0),
 (156, 103, 0.5),
 (156, 259, 0.3333333333333333),
 (89, 156, 1.0),
 (89, 103, 1.0),
 (89, 259, 0.5),
 (89, 9, 0.3333333333333333),
 (103, 89, 1.0),
 (103, 156, 0.5),
 (103, 259, 1.0)]

In [18]:
def show_occurense(tple):
    print(f"{dct.idx2word[tple[0]]} - {dct.idx2word[tple[1]]} : {tple[2]}")

In [19]:
show_occurense(cooccurance_matrix[2])

buzzfeed - that : 0.3333333333333333


In [20]:
model.train(NUM_EPOCH, device, learning_rate=LEARNING_RATE)

epoch: 0, current step: 0, average loss: 0.03359546565690181
epoch: 1, current step: 0, average loss: 0.11625027091995302
epoch: 2, current step: 0, average loss: 0.0586626114840594
epoch: 3, current step: 0, average loss: 0.027321591854541964
epoch: 4, current step: 0, average loss: 0.013604911097377279
epoch: 5, current step: 0, average loss: 0.009198354894086585
epoch: 6, current step: 0, average loss: 0.008171694213980304
epoch: 7, current step: 0, average loss: 0.007540590663656986
epoch: 8, current step: 0, average loss: 0.006397906132683713
epoch: 9, current step: 0, average loss: 0.005010491542081308
finish glove vector training


In [21]:
embeddings = list(model._focal_embeddings.parameters())[0]

In [22]:
man_vec  = embeddings[dct.word2idx['man']]
woman_vec = embeddings[dct.word2idx['woman']]

In [23]:
torch.cosine_similarity(man_vec.view(1,-1), woman_vec.view(1,-1)).item()

0.07057671568306412