## BERT Embeddings: experimental

* Get 10,000 quotes with 'love' in the category column
* Compute a BERT embedding for each quote
* See how well the embeddings perform in getting quotes that are similar in MEANING to the input text

In [51]:
import pandas as pd
import numpy as np
import time
import re

from transformers import BertTokenizer, BertModel
import torch

In [2]:
# Load the cleaned quotes dataset; 'quotes_clean.csv' is resolved relative to the
# current working directory.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a DataFrame index
# that was written into the CSV — presumably it could be read with index_col=0; confirm.
quotes_df = pd.read_csv('quotes_clean.csv')
quotes_df.head()

Unnamed: 0,quote,author,category,author_len
0,"I'm selfish, impatient and a little insecure. ...",Marilyn Monroe,"attributed-no-source, best, life, love, mistak...",2
1,You've gotta dance like there's nobody watchin...,William W. Purkey,"dance, heaven, hurt, inspirational, life, love...",3
2,You know you're in love when you can't fall as...,Dr. Seuss,"attributed-no-source, dreams, love, reality, s...",2
3,A friend is someone who knows all about you an...,Elbert Hubbard,"friend, friendship, knowledge, love",2
4,Darkness cannot drive out darkness: only light...,Martin Luther King Jr.,"darkness, drive-out, hate, inspirational, ligh...",4


### Helper Functions

In [14]:
def print_quote_by_index(idx, df):
    """
    Print the quote found at positional row `idx` of `df`, then its author.

    The quote is wrapped in double quotes; the author follows on the next line
    as ' - <author>' and is followed by one blank line. `df` must expose
    `quote` and `author` columns.
    """
    row = df.iloc[idx]
    # Build both lines first so a missing column fails before anything is printed.
    lines = (f'"{row.quote}"', f' - {row.author}\n')
    for line in lines:
        print(line)


def build_tfidf_matrix(quotes_df):
    """
    Vectorize the 'quote' column with TF-IDF, producing one matrix row per quote.

    Returns a `(matrix, vectorizer)` pair: the densified TF-IDF matrix and the
    fitted vectorizer that produced it.

    NOTE(review): `TfidfVectorizer` and `TFIDF_MAX_FEATURES` are not imported or
    defined in the visible part of this notebook — they must be in scope before
    this function is called; confirm where they come from.
    """
    texts = quotes_df['quote'].tolist()
    tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
    dense_matrix = tfidf.fit_transform(texts).todense()

    # To inspect the matrix as a table:
    #   pd.DataFrame(dense_matrix.tolist(), columns=tfidf.get_feature_names_out())

    return dense_matrix, tfidf


def build_bert_matrix(quotes_df, tokenizer, model):
    """
    Embed every entry of the 'quote' column with BERT and stack the results.

    Each quote is encoded individually via `encode_document`; the per-quote
    arrays are stacked vertically, in column order, into the returned matrix.
    """
    embeddings = []
    for text in quotes_df['quote'].to_list():
        embeddings.append(encode_document(text, tokenizer, model))

    return np.vstack(embeddings)


def encode_document(doc, tokenizer, model):
    """
    Encode a single document (string) into a BERT embedding.

    The document is tokenized (truncated to at most 512 tokens), run through
    `model`, and the final-layer hidden states are averaged over dim 1.

    Args:
        doc: the text to embed.
        tokenizer: callable that turns `doc` into the keyword inputs for `model`
            (called with return_tensors="pt").
        model: callable whose output exposes a `last_hidden_state` tensor.

    Returns:
        A numpy array holding the mean of `last_hidden_state` over dim 1
        (presumably shape (1, hidden_size) for a single string — confirm).
    """
    inputs = tokenizer(doc, return_tensors="pt", max_length=512, truncation=True, padding=True)
    # Inference only: disabling autograd avoids recording a computation graph
    # (and keeping its activations) for every encoded document.
    with torch.no_grad():
        outputs = model(**inputs)
        pooled = outputs.last_hidden_state.mean(dim=1)
    return pooled.detach().numpy()


def find_closest_quotes(text_vect, matrix, n_closest, verbose=True):
    """
    Return the row indices of the `n_closest` rows of `matrix` that are most
    similar to `text_vect` under cosine similarity.

    Computes the cosine distance (1 - cosine similarity) between `text_vect`
    (the embedded user input) and each row of `matrix` (one embedded quote per
    row), then sorts the rows by ascending distance.

    Args:
        text_vect: embedded input; any array-like that flattens to the same
            length as a row of `matrix`.
        matrix: 2-D array-like with one embedding per row. `np.matrix` inputs
            (e.g. the result of `todense()`) are accepted.
        n_closest: number of indices to return.
        verbose: when True (the default), print a progress message after each
            stage.

    Returns:
        A flat list of at most `n_closest` integer row indices, closest first.
        Rows with zero norm get distance 1.
    """
    # np.asarray strips the np.matrix subclass: np.matrix is always 2-D, so
    # `.flatten()` below would otherwise yield a (1, n) result and the function
    # would return a nested list instead of a flat list of indices.
    A = np.asarray(matrix)
    x = np.asarray(text_vect).reshape((-1, 1))

    norms = np.sqrt(np.square(A).sum(axis=1)) * np.linalg.norm(x)
    # Avoid division by zero. Use the smallest normal value of the array's own
    # dtype: float64's `tiny` underflows to 0 when stored in a float32 array,
    # which would leave the zero in place and produce NaN distances.
    norms[norms == 0] = np.finfo(norms.dtype).tiny
    norms = norms.reshape((-1, 1))  # column vector so the division is row-wise
    if verbose:
        print('finished computing temp')

    distances = (1 - np.matmul(A, x) / norms).flatten()
    if verbose:
        print('finished computing distances')

    # A stable sort keeps tied rows in their original order.
    sorted_indices = np.argsort(distances, kind='stable').tolist()
    if verbose:
        print('finished sorting\n')

    return sorted_indices[:n_closest]

### Experiment on Love quotes

In [9]:
# Keep the first 10,000 rows whose category string contains ' love', then renumber
# them 0..N-1 so positional row numbers line up with rows of the embedding matrix
# (the search cell looks quotes up in love_quotes by matrix row index).
# NOTE(review): the pattern has a leading space, so a category string that *starts*
# with 'love' is not matched, while longer tags such as ' love-quotes' are.
# Changing this filter would require rebuilding the saved matrix so rows stay aligned.
love_quotes = quotes_df[quotes_df['category'].str.contains(' love')].iloc[:10000]
love_quotes = love_quotes.reset_index(drop=True)
love_quotes

Unnamed: 0,quote,author,category,author_len
0,"I'm selfish, impatient and a little insecure. ...",Marilyn Monroe,"attributed-no-source, best, life, love, mistak...",2
1,You've gotta dance like there's nobody watchin...,William W. Purkey,"dance, heaven, hurt, inspirational, life, love...",3
2,You know you're in love when you can't fall as...,Dr. Seuss,"attributed-no-source, dreams, love, reality, s...",2
3,A friend is someone who knows all about you an...,Elbert Hubbard,"friend, friendship, knowledge, love",2
4,Darkness cannot drive out darkness: only light...,Martin Luther King Jr.,"darkness, drive-out, hate, inspirational, ligh...",4
...,...,...,...,...
9995,The only time I hold my wife's hand tightly is...,Matshona Dhliwayo,"funny-quotations, funny-quote, funny-quotes, h...",2
9996,Never judge an author by her haters.,Tracy Millosovich,"aturhot, author, authors-quote, haters-quote, ...",2
9997,Lovers are food for each other to sustain thei...,Munia Khan,"each, food, foods, life, lives, love, love-quo...",2
9998,"The world can be a great big scary place,and f...",Rachel C. Weingarten,"faith, faith-quotes, love, love-quotes",3


In [10]:
start = time.time()

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# The embedding matrix is not rebuilt here: the build-and-time code below is left
# commented out, and a later cell loads a previously saved matrix from disk instead.
# With it disabled, `start` is not read again in this cell.
# matrix = build_bert_matrix(love_quotes, tokenizer, model)
# end = time.time() 

# print(f'Initialising the matrix for {matrix.shape[0]} quotes took {round(end - start, 2)} seconds.')

In [12]:
# Load the saved embedding matrix (the file written by `matrix.dump('love_quote_matrix')`).
# SECURITY: allow_pickle=True unpickles the file, and unpickling can execute
# arbitrary code — only load a file you produced yourself.
matrix = np.load('love_quote_matrix', allow_pickle=True)

# Row i of the matrix must be the embedding of love_quotes.iloc[i]. A stale or
# mismatched cache would otherwise silently return the wrong quotes later on.
assert matrix.shape[0] == len(love_quotes), (
    f'Cached matrix has {matrix.shape[0]} rows but love_quotes has {len(love_quotes)}; '
    'rebuild it with build_bert_matrix.'
)
matrix.shape

(10000, 768)

In [24]:
# Spot-check a single quote by its positional row number in love_quotes.
love_quotes.iloc[149]['quote']

'What she had realized was that love was that moment when your heart was about to burst.'

In [49]:
# Number of closest quotes to display.
NUM_QUOTES = 10

# Interactive alternative to the hard-coded query below:
# sentence = input("What would you like the quote to say?")
sentence = "love is love is love is love is love is"
#
# Observation: the length of the input makes a massive difference. Shorter inputs
# return shorter quotes that may or may not be close in meaning; longer inputs (even
# the same short input repeated to make it longer) return longer quotes, which are
# sometimes a better match for meaning.
#
# Open question: how do we make the ranking independent of quote length and driven
# purely by meaning?
#
print(f'Received input: "{sentence}"')
print('\n----------------\n')

# Time the whole lookup: embedding the query, ranking, and printing the results.
start = time.time()

text = sentence.lower()
text_vect = encode_document(text, tokenizer, model)
topN_indices = find_closest_quotes(text_vect, matrix, NUM_QUOTES)

# Matrix row indices are used as positional row numbers into love_quotes.
for idx in topN_indices:
    print_quote_by_index(idx, love_quotes)

end = time.time()
duration = round(end - start, 2)
print('=======================================================')
print(f'That - with matrix operations - took {duration} seconds')
print(f'> Number of quotes: {matrix.shape[0]}')
print('=======================================================')

Received input: "love is love is love is love is love is"

----------------

finished computing temp
finished computing distances
finished sorting

"Love is black. Love is white. Love is brown. Love is humanity."
 - Matshona Dhliwayo

"Love is kind. Kind is love."
 - Lailah Gifty Akita

"God is love, as we love we become like God"
 - Lailah Gifty Akita

"Love is cure, love is power,love is magicof changes,love is the mirror of divine beauty"
 - Altaf ul qadri

"Love hopes. Love helps. Love heals. Love hears."
 - Matshona Dhliwayo

"Know love. Sow love. Grow love!"
 - Israelmore Ayivor

"Hate is... It's too easy. Love. Love takes courage."
 - Hannah Harrington

"See what love sees. Hear what love hears. Say what love says. Do what love does."
 - Matshona Dhliwayo

"The opposite of grief is not laughter or happiness or joy. It is love. It is love. It is love."
 - Akif Kichloo

"LOVE IS NOMADICAND I'M A GYPSY SOULSO LOVE GOES WHEREVER I GO "
 - Qwana Reynolds-Frasier

That - with matrix o

In [20]:
# Dump matrix for later: writes `matrix` to the file 'love_quote_matrix', which the
# "Load saved matrix" cell reads back with np.load(..., allow_pickle=True).
# NOTE(review): this cell sits after the load cell, so on a fresh kernel `matrix`
# must first be built (see the commented-out build_bert_matrix call) before dumping.
matrix.dump('love_quote_matrix')