In [2]:
import json
import re
import urllib.parse
from urllib.request import urlopen
import pickle
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import json
import html2text
from bs4 import BeautifulSoup
import requests

Code for Wikipedia exploration

In [89]:
def link_extractor(text):
    '''
    Pulls the link targets out of a section of text and strips its markup.

    Returns a (cleaned_text, links) pair:
      cleaned_text - the input with every HTML-escaped tag (&lt;...&gt;) removed
      links        - each href target found in the input, URL-decoded and lower-cased
    '''
    href_targets = re.findall(r'href=[\'"]?([^\'" >]+)', text)
    # the tags arrive HTML-escaped, so the match has to be on the escaped brackets
    without_tags = re.sub('(&lt;).*?(&gt;)', '', text)
    decoded_links = []
    for target in href_targets:
        decoded_links.append(urllib.parse.unquote(target).lower())
    return without_tags, decoded_links



In [87]:
# Input: the output of wiki_cleaner.sh.
# Workflow: download the wiki dump, run wiki_cleaner.sh, then run this cell on its output.
data_path = "data/enwiki-20220301-pages-articles-multistream1.txt"

# cleaned_data[title] is a list with one {'paragraph': str, 'links': [str]} dict
# per newline-separated chunk of that article's text.
cleaned_data = {}
# Explicit encoding so the read does not depend on the platform default.
# NOTE(review): assumes wiki_cleaner.sh emits UTF-8 - confirm.
with open(data_path, 'r', encoding='utf-8') as f:
    for line in f:
        json_line = json.loads(line)
        title = json_line['title'].lower()
        cleaned_data[title] = []
        for text in json_line['text'].split('\n'):
            paragraph_text, urls = link_extractor(text)
            cleaned_data[title].append({'paragraph': paragraph_text, 'links': urls})

# Context manager guarantees the pickle is flushed and the handle closed.
with open('data/paragraphs_and_links.pkl', 'wb') as out_file:
    pickle.dump(cleaned_data, out_file)


In [4]:
'''
coverage is a map from each title to where it appears in the text
    coverage[title] is a list of (linking title, paragraph index) tuples
cleaned_data is a map from each title to its article content and outgoing links
    cleaned_data[title] is a list of {paragraph: str, links: []} dicts
'''
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open('data/paragraphs_and_links.pkl', 'rb') as in_file:
    cleaned_data = pickle.load(in_file)

coverage = {x: [] for x in cleaned_data.keys()}
for title in cleaned_data:
    for i, paragraph in enumerate(cleaned_data[title]):
        for link in paragraph['links']:
            # links that do not resolve to a known article title are ignored
            if link in coverage:
                # append the pair as ONE element; `+= (title, i)` would extend the
                # list with `title` and `i` as two separate items
                coverage[link].append((title, i))

with open('data/coverage.pkl', 'wb') as out_file:
    pickle.dump(coverage, out_file)

Code for Reddit exploration

In [4]:
'''
Builds comments_by_post
'''
# Group every comment in the dump under the link_id of the post it belongs to.
comments_by_post = {}
with open('../data/reddit/RC_2009-05', 'r') as dump:
    for raw_line in dump:
        record = json.loads(raw_line)
        thread = comments_by_post.setdefault(record['link_id'], [])
        # html2text rewrites the comment body in place before it is stored
        record['body'] = html2text.html2text(record['body'])
        thread.append(record)

# Drop posts that ended up with fewer than five comments.
sparse_post_ids = [post_id for post_id, thread in comments_by_post.items() if len(thread) < 5]
for post_id in sparse_post_ids:
    del comments_by_post[post_id]

In [19]:
def collect_ancestors(posts, post_id, source_id):
    """
    Walk up the reply chain of a comment and return the ids on the path, oldest first.

    posts     - list of comment dicts from one thread; each needs 'id' and 'parent_id'
    post_id   - id of the comment to start from (bare id, or prefixed with 't1_')
    source_id - optional id of a comment that must lie on the path, or None

    Returns:
      * source_id is None: the ids from the top-level comment down to post_id.
      * source_id given:   the ids from source_id down to post_id, or [] when the
                           walk reaches the post itself ('t3_...') without meeting source_id.
      * [] whenever a comment on the path is missing from `posts`.
    """
    # Index once instead of rescanning the whole list for every hop.
    # setdefault keeps the first occurrence of an id, like a front-to-back scan would.
    posts_by_id = {}
    for post in posts:
        posts_by_id.setdefault(post['id'], post)

    ancestors = []
    while True:
        # Match the full 't3_' / 't1_' prefixes: a bare comment id that merely
        # starts with 't3' or 't1' must not be mistaken for a prefixed reference.
        if post_id.startswith('t3_'):  # refers to a link (we climbed past a top-level comment)
            # if there is a source, then we only want to return ancestors if a source is in the path
            if source_id is not None:
                return []
            return ancestors[::-1]

        if post_id.startswith('t1_'):  # refers to a comment; strip the type prefix
            post_id = post_id[3:]

        if post_id == source_id:
            ancestors.append(source_id)
            return ancestors[::-1]

        post = posts_by_id.get(post_id)
        if post is None:
            # the chain is broken: this comment is not part of `posts`
            return []
        ancestors.append(post_id)
        post_id = post['parent_id']

    

In [3]:
# Inverted index over the reddit comments: word -> list of postings, where a
# posting is {'post_id', 'comment_id'}. A comment gets one posting per occurrence
# of the word, and words of two characters or fewer are skipped.
reddit_inverted_index = {}
for post_key, thread in comments_by_post.items():
    for comment in thread:
        # lower-case, turn every non-alphanumeric character into a space, split on single spaces
        tokens = re.sub('[^A-Za-z0-9]', ' ', comment['body'].lower()).split(' ')
        for token in tokens:
            if len(token) <= 2:
                continue
            postings = reddit_inverted_index.setdefault(token, [])
            postings.append({'post_id': post_key, 'comment_id': comment['id']})


In [17]:
# Spot check: show the body of the comment behind the eighth posting for 'http'.
http_posting = reddit_inverted_index['http'][7]
matching_bodies = [
    candidate['body']
    for candidate in comments_by_post[http_posting['post_id']]
    if candidate['id'] == http_posting['comment_id']
]
for matched_body in matching_bodies:
    print(matched_body)

Thanatos is the ancient Greek personification of death (Mors is the Latin
equivalent) according to Wikipedia - http://en.wikipedia.org/wiki/Thanatos




In [49]:
def find_possible_candidate_chains(w1, w2, index=None):
    """
    Find pairs of comments in the same post where one mentions w1 and the other w2.

    w1, w2 - words to look up in the inverted index
    index  - optional inverted index (word -> list of {'post_id', 'comment_id'});
             defaults to the notebook-level reddit_inverted_index

    Returns a list of unique {'post_id', 'source', 'target'} dicts, where 'source'
    is a comment containing w1 and 'target' a comment containing w2. Order follows
    the w1 postings, then the w2 postings. A word missing from the index yields [].
    """
    if index is None:
        index = reddit_inverted_index

    w1_comments = index.get(w1, [])
    w2_comments = index.get(w2, [])

    # Group the w2 postings by post so each w1 posting is only compared with
    # comments from its own post, rather than with every w2 posting.
    w2_by_post = {}
    for c2 in w2_comments:
        w2_by_post.setdefault(c2['post_id'], []).append(c2['comment_id'])

    possible_candidates = []
    seen = set()  # constant-time duplicate check instead of scanning the result list
    for c1 in w1_comments:
        for target_id in w2_by_post.get(c1['post_id'], ()):
            signature = (c1['post_id'], c1['comment_id'], target_id)
            if signature in seen:
                continue
            seen.add(signature)
            possible_candidates.append({'post_id': c1['post_id'], 'source': c1['comment_id'], 'target': target_id})
    return possible_candidates

def valid_chain_check(candidate):
    """
    Return the reply chain leading from candidate['source'] down to candidate['target'].

    candidate - dict with 'post_id', 'source' and 'target' keys, as produced by
                find_possible_candidate_chains

    The result is whatever collect_ancestors returns for that thread: a list of
    comment ids, which is empty when the source is not on the target's path.
    """
    # collect_ancestors already yields [] unless the source is on the path, so the
    # extra membership test that used to sit after the return was unreachable.
    return collect_ancestors(comments_by_post[candidate['post_id']], candidate['target'], candidate['source'])

In [59]:
# Print every 'sackler' -> 'money' chain that is more than five comments long,
# followed by the body of each comment on the chain.
sackler_money_chains = find_possible_candidate_chains('sackler', 'money')
for chain in sackler_money_chains:
    lineage = valid_chain_check(chain)
    if len(lineage) <= 5:
        continue
    print(chain, lineage)
    thread = comments_by_post[chain['post_id']]
    for ancestor_id in lineage:
        for post in thread:
            if post['id'] == ancestor_id:
                print(post['body'])

In [98]:
'''
To make the data set:

three paragraphs start, target, hidden
f(start, target) --> encoding most similar to hidden

f(start paragraph, target paragraph) --> encoding most similar to the paragraphs above the target paragraph 
- should really be by section?

1. Pick a start paragraph (that is relatively well-covered)
2. Pick a target paragraph from one of the covered links' articles
3. Encode the start paragraph, the target paragraph, and a paragraph right above the target paragraph in the same article
4. Train model to minimize the difference of f(.,.) with the above-target paragraph encoding

Maybe first look at basic measures
1. Make progress towards target (controlled by threshold)
2. Minimize introduction of unnecessary information (compactness)
'''

"\nTo make the data set:\n\nf(start paragraph, target paragraph) --> encoding most similar to the first paragraph of the target paragraph \n- should really be by section?\n\n1. Pick a start paragraph (that is relatively well-covered)\n2. Pick a target paragraph from one of the covered links' articles\n3. Encode the start paragraph, the target paragraph, and the first paragraph of the target article\n4. Train model to minimize the difference of f(.,.) with the first paragraph encoding\n"

In [57]:
# Checkpoint names, kept in one place so the tokenizer and the masked-LM model
# are always loaded from the same checkpoint.
BERT_CHECKPOINT = "distilbert-base-cased"
SBERT_CHECKPOINT = 'all-MiniLM-L12-v2'

bert_tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = AutoModelForMaskedLM.from_pretrained(BERT_CHECKPOINT)
# sentence encoder used for the paragraph embeddings
sbert_model = SentenceTransformer(SBERT_CHECKPOINT)

In [124]:
# Gather every non-empty paragraph across all articles, keep the first 10000,
# and encode them with the sentence encoder.
nonempty_paragraphs = []
for article_title in cleaned_data:
    for entry in cleaned_data[article_title]:
        if entry['paragraph'] != '':
            nonempty_paragraphs.append(entry['paragraph'])
candidates = nonempty_paragraphs[:10000]
candidate_encodings = sbert_model.encode(candidates, convert_to_tensor=True)

In [None]:
"""
Basic paragraph interpolation does not seem to work. Suspect that the distance is too vast. Motivates a stronger constraint on individual words
"""
# NOTE(review): the observation above was made while the start weight was a
# hard-coded 5 (see the loop below) - worth re-checking after the fix.

START_IDX = 0
END_IDX = 100
CHAIN_LENGTH = 10

for i in range(CHAIN_LENGTH):
    # Slide the weights linearly from the start paragraph towards the end paragraph.
    # The start weight must be tied to CHAIN_LENGTH: with a hard-coded 5 it turned
    # negative for i > 5, pushing the query away from the start embedding.
    # No normalisation is needed because cosine similarity ignores vector scale.
    interpolated = candidate_encodings[START_IDX] * (CHAIN_LENGTH - i) + candidate_encodings[END_IDX] * i
    cosine_scores = util.cos_sim(interpolated, candidate_encodings)
    # index 1 of the top-2 is the runner-up match
    # (presumably to skip an exact match with the start paragraph itself - confirm)
    max_idx = torch.topk(cosine_scores, 2)[1][0][1]
    print(candidates[max_idx])
print(candidates[END_IDX])

In [None]:
'''
Can try looking for the most similar paragraphs in the direction of the target
E.g: source --> n most similar --> choose 1 that is most similar to target --> repeat
'''
START_IDX = 0
END_IDX = 100
NUM_HOPS = 2              # number of greedy steps to take
NEIGHBOURS_PER_HOP = 10   # size of the nearest-neighbour shortlist at each step

target = candidate_encodings[END_IDX]
cur_idx = START_IDX
# Paragraphs already on the path. Without this the current paragraph is always in
# its own shortlist (similarity 1.0) and can be picked again, stalling the walk.
visited = {START_IDX}
print(candidates[START_IDX])
for _ in range(NUM_HOPS):
    # similarity of the current paragraph to every candidate, as a 1-D tensor
    source_scores = util.cos_sim(candidate_encodings[cur_idx], candidate_encodings)[0]
    source_scores[list(visited)] = float('-inf')

    shortlist_size = min(NEIGHBOURS_PER_HOP, len(candidates) - len(visited))
    if shortlist_size <= 0:
        break  # every candidate has been visited
    shortlist = torch.topk(source_scores, shortlist_size)[1]

    # among the shortlist, move to the paragraph closest to the target
    target_scores = util.cos_sim(target, candidate_encodings[shortlist])[0]
    cur_idx = shortlist[torch.argmax(target_scores)].item()
    visited.add(cur_idx)

    print(candidates[cur_idx])
print(candidates[END_IDX])

Code for Reddit URL exploration

In [14]:
# Collect every URL posted in a comment that mentions one of the news keywords.
# Each entry is {'post_id', 'comment_id', 'url'}.
all_urls = []
keywords = ['cnn', 'fox', 'nytimes']

for key in comments_by_post:
    for post in comments_by_post[key]:
        # first check if post body contains URL
        urls = re.findall(r'(https?://\S+)', post['body'])
        for url in urls:
            # any() records the URL once even if it matches several keywords;
            # looping over the keywords appended one duplicate entry per match.
            # NOTE: this is a plain substring test on the whole URL, not on the domain.
            if any(keyword in url for keyword in keywords):
                all_urls.append({'post_id': key, 'comment_id': post['id'], 'url': url})

In [17]:
def scrape(url, timeout=10):
    """
    Download a page and return its visible text, one non-empty chunk per line.

    url     - address to fetch
    timeout - seconds to wait on the connection before giving up (default 10)

    Returns the extracted text, or False when the page could not be fetched
    (malformed URL, network or HTTP error, timeout).
    """
    try:
        # the context manager closes the response; the timeout stops a dead host
        # from hanging the notebook indefinitely
        with urlopen(url, timeout=timeout) as response:
            html = response.read()
    except Exception:
        # best-effort fetch: any failure means "no text", but unlike a bare
        # `except:` this still lets KeyboardInterrupt / SystemExit through
        return False
    soup = BeautifulSoup(html, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    return text

In [27]:
'''
load reddit comment data
loop through comments
    if url,
        get ancestor pointers

for all url blocks
    extract text from html
    
train retrieval cross encoder model to predict likelihood of a URL being posted in a comment (or the next comment)
    input: comment chain, URL text + metadata (domain, title)
    output: [0-1] match

can model be used to build big semantic graph? "where to go next"
        
'''




# Inspect the first collected URL: the entry itself, its whole thread, and the
# chain of comments leading down to the comment that posted it.
cur = all_urls[0]
print(cur)
print(comments_by_post[cur['post_id']])

#scrapped_text = scrape(cur['url'])
#print(scrapped_text)

ancestors = collect_ancestors(comments_by_post[cur['post_id']], cur['comment_id'], None)
print(ancestors)

for ancestor in ancestors:
    for post in comments_by_post[cur['post_id']]:
        # compare the comment's own id with the ancestor id; testing
        # cur['comment_id'] here printed either every body in the thread or none
        if post['id'] == ancestor:
            print(post['body'])

{'post_id': 't3_8guzj', 'comment_id': 'c098njp', 'url': 'http://select.nytimes.com/gst/abstract.html?res=F00A11FE345C177B93C0A8178AD85F4D8485F9'}
[{'author': '[deleted]', 'ups': -1, 'subreddit': 'worldnews', 'downs': 0, 'gilded': 0, 'score_hidden': False, 'author_flair_css_class': None, 'distinguished': None, 'author_flair_text': None, 'body': '\\+\nhttp://pqasb.pqarchiver.com/latimes/access/521670312.html?dids=521670312:521670312&FMT=CITE&FMTS=CITE:AI&date=May+20%2C+1967&author=&pub=Los+Angeles+Times&desc=Israel+Warns+on+U.N.+Pullout&pqatl=google\n\\+\nhttp://select.nytimes.com/gst/abstract.html?res=F00A11FE345C177B93C0A8178AD85F4D8485F9\n\\+\nhttp://select.nytimes.com/gst/abstract.html?res=F60F13FE355D13728DDDAC0894D0405B8889F1D3\n\\+\nhttp://pqasb.pqarchiver.com/chicagotribune/access/508111992.html?dids=508111992:508111992&FMT=CITE&FMTS=CITE:AI&date=Apr+07%2C+1955&author=&pub=Chicago+Tribune&desc=ISRAEL+WARNS+U.+N.+ARMISTICE+MAY+BE+BROKEN&pqatl=google\n\\+\nhttp://select.nytimes.com