# Imports & Environment

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import sentence_transformers
import openai

In [3]:
def identify_tensor_device():
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    return device


In [4]:
device = identify_tensor_device()
device

'mps'

In [128]:
#model = sentence_transformers.SentenceTransformer('msmarco-distilbert-base-v4', device=device)
#model = sentence_transformers.SentenceTransformer('msmarco-distilbert-base-v4', device=device)

# Searching the corpus


In [5]:
mr_archive = pd.read_csv('../output/mr_archive.csv')

In [6]:
mr_archive.head()

Unnamed: 0,title,author,publish_date,content,outlinks,tags,link,local_path
0,Self-punishment and incentives,Tyler Cowen,2013-03-30 07:03:53,\n[LeBron] James told me that when he was work...,['http://www.grantland.com/story/_/id/9109245/...,[],https://marginalrevolution.com/marginalrevolut...,/Users/jvm/Development/web_crawling/marginal_r...
1,Assorted links,Tyler Cowen,2013-03-30 13:21:30,\n1. On nitrogen fertilizer.\n2. Roberta Smith...,['http://www.inexactchange.org/blog/2013/03/28...,[],https://marginalrevolution.com/marginalrevolut...,/Users/jvm/Development/web_crawling/marginal_r...
2,A Brilliant New Method of Price Discrimination...,Alex Tabarrok,2013-03-30 12:32:59,"\nTo maximize profit, airlines want to charge ...",['http://www.worthpublishers.com/catalog/stati...,[],https://marginalrevolution.com/marginalrevolut...,/Users/jvm/Development/web_crawling/marginal_r...
3,*Simpler: The Future of Government*,Tyler Cowen,2013-03-31 18:52:19,\nThat is from Cass Sunstein (always worth rea...,['https://www.amazon.com/Simpler-Government-Ca...,[],https://marginalrevolution.com/marginalrevolut...,/Users/jvm/Development/web_crawling/marginal_r...
4,Assorted links,Tyler Cowen,2013-03-31 07:48:35,"\n1. Velocity maps.\n2. Carmina Burana, sung b...",['http://www.fastcoexist.com/1681677/a-new-map...,[],https://marginalrevolution.com/marginalrevolut...,/Users/jvm/Development/web_crawling/marginal_r...


In [7]:
#model = sentence_transformers.SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)
model = sentence_transformers.SentenceTransformer('all-MiniLM-L12-v2', device=device)


In [8]:
embedding_test_unbatched = model.encode('alpha', show_progress_bar=True)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
embedding_test_batched = model.encode(['alpha', 'square', 'anonymity'], show_progress_bar=True)


In [176]:
print('Self-match:', sentence_transformers.util.dot_score(embedding_test_unbatched, embedding_test_batched[0]))
print('Other-match:', sentence_transformers.util.dot_score(embedding_test_unbatched, embedding_test_batched[1]))
print('Other-match:', sentence_transformers.util.dot_score(embedding_test_unbatched, embedding_test_batched[2]))

Self-match: tensor([[0.9914]])
Other-match: tensor([[0.1675]])
Other-match: tensor([[0.1760]])


In [11]:
corpus_embeddings = torch.tensor(torch.load('../output/mr_embeddings.pt')).to(device)

In [12]:
print(concatenate_title_text(mr_archive.iloc[test_index]))

Title: Self-punishment and incentives
    Tags: [, ]
    Content: 
[LeBron] James told me that when he was working on his 3s, he’d punish himself until he met a lofty set of self-enforced shooting milestones.
“It’s work,” James says. “It’s a lot of work. It’s being in workouts, and not accomplishing your goal, and paying for it. So, if I get to a spot in a workout and want to make eight out of 10, if I don’t make eight of 10, then I run. I push myself to the point of exhaustion until I make that goal. So you build up that mentality that you got to make that shot and then use that in a game situation — it’s the ultimate feeling, when you’re able to work on something and implement it.”
Here is more, all of it focused on how LeBron James improved his game.



In [100]:
model.encode(concatenate_title_text(mr_archive.iloc[test_index]))

array([ 4.62842137e-01,  4.17626470e-01, -2.56969243e-01,  6.38134703e-02,
        7.91084617e-02,  3.65754992e-01,  5.70676625e-01, -5.32728553e-01,
       -3.22708815e-01,  8.84878188e-02,  2.23417342e-01,  6.02718532e-01,
        5.37983030e-02, -1.25656009e-01,  1.65926144e-01, -2.63852686e-01,
        5.65725803e-01, -1.90581173e-01,  4.99700397e-01, -6.15747154e-01,
       -2.02796027e-01,  1.13386166e+00,  2.06236050e-01,  1.88513979e-01,
       -3.05607796e-01, -3.37160438e-01,  2.45144531e-01, -1.97676182e-01,
       -6.48346364e-01,  2.77979344e-01,  2.04377010e-01, -3.49896550e-01,
        2.18025595e-01, -3.41442049e-01, -3.91747534e-01, -5.32011151e-01,
        1.00993842e-01, -9.40842852e-02,  8.64315778e-02, -4.75268453e-01,
        2.14811727e-01,  1.93131611e-01,  7.42075890e-02,  1.89245433e-01,
        1.16290501e-03,  3.29866379e-01, -1.67530492e-01, -8.22877884e-02,
        3.44126858e-02, -1.43611237e-01, -4.16159958e-01,  1.64211646e-01,
       -5.71306884e-01, -

In [13]:
# Sanity check test
# This should be close to zero

def concatenate_title_text(row):
    return f'''Title: {row["title"]}
    Tags: {', '.join(row["tags"])}
    Content: {row["content"]}'''

test_index = 0
np.sum(np.abs(model.encode(concatenate_title_text(mr_archive.iloc[test_index])) - corpus_embeddings[test_index].cpu().numpy()))


0.0

In [10]:
corpus_embeddings.shape

(3974, 768)

In [11]:
query_embedding = model.encode('a', convert_to_tensor=True)

In [25]:
cos_scores = sentence_transformers.util.cos_sim(query_embedding, corpus_embeddings)[0]

In [14]:
def search_mr_for_query(query):
    query_embedding = model.encode(query, convert_to_tensor=True)

    cos_scores = sentence_transformers.util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=5)
    
    return top_results

### Reasonably good queries
* Why did Brexit happen? (2nd hit is Why Brexit happened and what it means)
* Should we believe in the recent reports about UFOs?
* Does the minimum wage cause disemployment?

### Bad-ish

* What are the best Austrian novels?

### Bad
* Why did Europe diverge from the rest of the world?

In [15]:
query = 'Russian beliefs about Ukraine'
results = search_mr_for_query(query)

In [16]:
results

  nonzero_finite_vals = torch.masked_select(tensor_view, torch.isfinite(tensor_view) & tensor_view.ne(0))


torch.return_types.topk(
values=tensor([0.6849, 0.6422, 0.5945, 0.5474, 0.5137], device='mps:0'),
indices=tensor([471, 486, 520, 430, 423], device='mps:0'))

In [17]:
results.indices[0].item()

471

In [18]:
for i in range(len(results[0])):
    print(f'Result #{i+1}')
    print(f'Score: {results.values[i]}')
    hit = mr_archive.iloc[results.indices[i].item()]
    print(f'Title: {hit["title"]}')
    print(f'Author: {hit["author"]}')
    print(f'Date: {hit["publish_date"]}')
    print(f'Content: {hit["content"]}')
    print(f'Link: {hit["link"]}')
    print()
    #result.indices[0]
    #mr_archive.iloc[1939]

Result #1
Score: 0.6849457025527954
Title: It isn’t just Putin — Russia vs. Ukraine
Author: Tyler Cowen
Date: 2022-04-18 13:57:55
Content: 
From Wikipedia, here is a description of the views of Nobel Laureate Joseph Brodsky on Ukraine:
According to many historians, despite the fact that Brodsky had anti-Soviet views, for which he was eventually forced to leave Soviet Russia and emigrate to the United States, he, with all that, had pronounced Russian-imperial views, which resulted in his rejection of the existence of Ukrainians as a nation separate from Russians. According to Russian literary critic and biographer and friend of Brodsky Lev Losev, Brodsky considered Ukraine “the only cultural space with Great Russia”, and the Polish historian Irena Grudzinska-Gross [pl] in her book “Milosz and Brodsky” (2007) Brodsky firmly believed that Ukraine and has always been “an integral part of Great Russia”. According to Grudzinskaya-Gross, “Brodsky’s Russian patriotism is also evidenced by … th

In [19]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [20]:
def get_gpt2_token_count(text):
    return np.sum(tokenizer(text)['attention_mask'])

In [46]:
def encode_hit_for_gpt3(hit):
    return f"""Title: {hit["title"]}
    Tags: {hit["tags"]}
    Content: {hit["content"]}
    """
def generate_query_for_gpt3(hits, query):
    hits_encoded = [encode_hit_for_gpt3(hit) for hit in hits]
    token_counts = [get_gpt2_token_count(hit) for hit in hits_encoded]
    if np.sum(token_counts) > 3400:
        assert False, f"Probably too many tokens: {np.sum(token_counts)}"
    article_prompt = f"""A user queried: '{query}'
    
    Please read the following articles and respond truthfully.
    
    """
    for i, hit in enumerate(hits_encoded):
        article_prompt += f"""
        Article number {i+1}:
        
        {hit}
        """
    
    article_prompt += f"""
    In detail, what insights do these articles have about the query, '{query}'?"""
    return article_prompt
    

In [47]:
hits = [mr_archive.iloc[ix.item()] for ix in results.indices]
gpt3_prompt = generate_query_for_gpt3(hits[:3], query)

In [48]:
get_gpt2_token_count(gpt3_prompt)

3482

In [49]:
gpt3_prompt

"A user queried: 'Russian beliefs about Ukraine'\n    \n    Please read the following articles and respond truthfully.\n    \n    \n        Article number 1:\n        \n        Title: It isn’t just Putin — Russia vs. Ukraine\n    Tags: []\n    Content: \nFrom Wikipedia, here is a description of the views of Nobel Laureate Joseph Brodsky on Ukraine:\nAccording to many historians, despite the fact that Brodsky had anti-Soviet views, for which he was eventually forced to leave Soviet Russia and emigrate to the United States, he, with all that, had pronounced Russian-imperial views, which resulted in his rejection of the existence of Ukrainians as a nation separate from Russians. According to Russian literary critic and biographer and friend of Brodsky Lev Losev, Brodsky considered Ukraine “the only cultural space with Great Russia”, and the Polish historian Irena Grudzinska-Gross [pl] in her book “Milosz and Brodsky” (2007) Brodsky firmly believed that Ukraine and has always been “an inte

In [50]:
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")


In [54]:
response = openai.Completion.create(model="text-davinci-002", prompt=gpt3_prompt, temperature=.5, max_tokens=500)


In [55]:
response

<OpenAIObject text_completion id=cmpl-5w34oXygSnqEcr6500604dnrqEKZF at 0x2b2e723b0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": "\n    \n    \n    Article 1:\n    \n    The article discusses the views of Nobel Laureate Joseph Brodsky on Ukraine. Brodsky believed that Ukraine was not a separate nation from Russia, and that the independence of Ukraine was a mistake. He also thought that Ukraine should be part of Russia.\n    \n    Article 2:\n    \n    The article discusses how the IR community got Russia/Ukraine so wrong. It argues that the community did not foresee the war in Ukraine because they did not understand the historical continuity and persistence of Russian involvement in Ukraine.\n    \n    Article 3:\n    \n    The article discusses how Putin's beliefs about history justify Russian expansionism. It argues that Putin's ideas about history are a continuation of a longstanding tradition in Russian leadersh

In [56]:
response['choices'][0]['text']

"\n    \n    \n    Article 1:\n    \n    The article discusses the views of Nobel Laureate Joseph Brodsky on Ukraine. Brodsky believed that Ukraine was not a separate nation from Russia, and that the independence of Ukraine was a mistake. He also thought that Ukraine should be part of Russia.\n    \n    Article 2:\n    \n    The article discusses how the IR community got Russia/Ukraine so wrong. It argues that the community did not foresee the war in Ukraine because they did not understand the historical continuity and persistence of Russian involvement in Ukraine.\n    \n    Article 3:\n    \n    The article discusses how Putin's beliefs about history justify Russian expansionism. It argues that Putin's ideas about history are a continuation of a longstanding tradition in Russian leadership."