In [14]:
from language_processing import *
import json
import pprint as pp
import pandas as pd
import re
import nltk
import inflect
from nltk.corpus import wordnet
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from lxml import etree
import requests

In [15]:
p = inflect.engine()
load_dotenv()
nltk.download('punkt');
nltk.download('averaged_perceptron_tagger');
nltk.download('wordnet');
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/vonbecker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vonbecker/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vonbecker/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/vonbecker/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# load article lists

with open('data/articles/advanced.json') as file:
    articles_adv = json.load(file)

with open('data/articles/intermediate.json') as file:
    articles_int = json.load(file)

with open('data/articles/concise.json') as file:
    articles_con = json.load(file)

In [60]:
# remove articles which have an empty 'title' field

for article in articles_adv:
    if article['title'] == '':
        articles_adv.remove(article)

for article in articles_int:
    if article['title'] == '':
        articles_int.remove(article)

for article in articles_con:
    if article['title'] == '':
        articles_con.remove(article)

pp.pprint(articles_adv)

[{'articleId': 8,
  'articleTypeId': 1,
  'lastUpdated': '2021-06-30',
  'title': 'Rosencrantz and Guildenstern'},
 {'articleId': 20,
  'articleTypeId': 1,
  'lastUpdated': '2021-06-30',
  'title': 'Othello'},
 {'articleId': 21,
  'articleTypeId': 1,
  'lastUpdated': '2021-06-30',
  'title': 'Othello'},
 {'articleId': 30,
  'articleTypeId': 1,
  'lastUpdated': '2021-06-30',
  'title': 'Much Ado About Nothing'},
 {'articleId': 37,
  'articleTypeId': 1,
  'lastUpdated': '2021-06-30',
  'title': 'Oberon'},
 {'articleId': 41,
  'articleTypeId': 1,
  'lastUpdated': '2021-06-30',
  'title': 'King John'},
 {'articleId': 42,
  'articleTypeId': 1,
  'lastUpdated': '2021-06-30',
  'title': 'Children of Pauls'},
 {'articleId': 43,
  'articleTypeId': 1,
  'lastUpdated': '2021-06-30',
  'title': 'Measure for Measure'},
 {'articleId': 44,
  'articleTypeId': 1,
  'lastUpdated': '2021-10-15',
  'title': 'Richard II'},
 {'articleId': 64,
  'articleTypeId': 1,
  'lastUpdated': '2021-06-30',
  'title': '

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [5]:
# load search terms

with open('data/search_terms/primary_and_related.json') as file:
    all_search_terms = json.load(file)

for search_term in all_search_terms:
    search_term['related'] = [[word for word in phrase] for phrase in search_term['related']]

primary = pd.read_csv('data/search_terms/primary.csv', header=0).squeeze('columns').tolist()

primary_clean = []
for item in primary:
    parsed = item.strip('[]')
    tuple_pattern = r"\('(\w+)',\s'([A-Z]+)'\)"
    tuples = re.findall(tuple_pattern, parsed)
    primary_clean.append(tuples)

related = pd.read_csv('data/search_terms/related.csv', header=0).squeeze('columns').tolist()

related_clean = []
for item in related:
    parsed = item.strip('[]')
    tuple_pattern = r"\('(\w+)',\s'([A-Z]+)'\)"
    tuples = re.findall(tuple_pattern, parsed)
    related_clean.append(tuples)

pp.pprint(all_search_terms)

[{'primary': [['natural', 'JJ'], ['frequency', 'NN']],
  'related': [[['frequency', 'NN']], [['natural', 'JJ']]],
  'text': 'natural frequency'},
 {'primary': [['electric', 'JJ'],
              ['fields', 'NNS'],
              ['and', 'CC'],
              ['forces', 'NNS']],
  'related': [[['field', 'NN']],
              [['force', 'NN']],
              [['electric', 'JJ']],
              [['electric', 'JJ'], ['field', 'NN']],
              [['electric', 'JJ'], ['force', 'NN']]],
  'text': 'electric fields and forces'},
 {'primary': [['accelerated', 'VBN'],
              ['gravitational', 'JJ'],
              ['motion', 'NN']],
  'related': [[['motion', 'NN']],
              [['accelerated', 'VBN']],
              [['gravitational', 'JJ']],
              [['gravitational', 'JJ'], ['motion', 'NN']],
              [['accelerated', 'VBN'], ['motion', 'NN']]],
  'text': 'accelerated gravitational motion'},
 {'primary': [['afterimage', 'NN']], 'related': [], 'text': 'afterimage'},
 {'primar

In [6]:
def get_text(phrase):
    text = ' '.join(word[0] for word in phrase)
    return text

def get_pos(word):
    tag = word[1]
    if tag.startswith('N'):
        label = 'n'
    elif tag.startswith('V'):
        label = 'v'
    elif tag.startswith('J'):
        label = 'a'
    elif tag.startswith('R'):
        label = 'r'
    else:
        label = 'z'
    return label

In [7]:
# get synonyms for a given tagged word

def get_syns(word):
    text = get_text([word])
    label = get_pos(word)

    syns = wordnet.synsets(text)

    these_syns = []
    for syn in syns:
        if syn in these_syns:
            continue
        if syn.pos() == label:
            these_syns.append(syn)

    syns_flat = [item for sublist in [syn.lemma_names() for syn in these_syns] for item in sublist]

    syns_flat_unique = []
    [syns_flat_unique.append(item) for item in syns_flat if item not in syns_flat_unique]

    all_syns = [syn.replace('_', ' ') for syn in syns_flat_unique]

    non_triv_syns = [syn for syn in all_syns if syn != text]
    return non_triv_syns

In [56]:
# get similar search phrases using synonyms

def get_similar(search_phrase):
    num_words = len(search_phrase)
    sim_phrases = []
    for i in range(num_words):
        phrase_text = get_text(search_phrase)
        phrase_text_split = phrase_text.split()
        syns = get_syns(search_phrase[i])
        # print(syns)
        syns_unique = []
        [syns_unique.append(x) for x in syns if x not in syns_unique]
        for j in range(len(syns_unique)):
            sim_phrase = phrase_text_split
            sim_phrase[i] = syns_unique[j]
            sim_phrase_text = ' '.join(sim_phrase)
            sim_phrases.append(sim_phrase_text)
    return sim_phrases

In [9]:
# get the britannica artitle titles listed under topic = search_text
# www.britannica.com/topic/search-text

def get_britannica_topic_titles(search_text):
    search_text = '-'.join(search_text.split())
    url = 'https://www.britannica.com/topic/{}'.format(search_text)
    webpage = requests.get(url)
    soup = BeautifulSoup(webpage.content, "html.parser")
    content = etree.HTML(str(soup))
    titles = content.xpath('//section[@class="index-entries"]//span[@class="index-xref"]/a/text()')
    titles = [title.split(': ')[0] for title in titles]
    return titles

# get the articleId of articles having title = text
# source = articles_adv, articles_con, or articles_int

def get_articles_by_title(text, source):
    matches = [article for article in source if article['title'] == text]
    return matches

In [1]:
# get metadata for article matches
# currently, matches are only from articles_adv

sources = [articles_adv, articles_con, articles_int]

article_matches = []

def get_articles(search_text, source, matches):
    new_matches = get_articles_by_title(search_text, source)
    new_matches = [match for match in new_matches if match not in matches]
    matches += new_matches
    if not new_matches and source == articles_adv:
        titles = get_britannica_topic_titles(search_text)
        unique_titles = []
        [unique_titles.append(x) for x in titles if x not in unique_titles]
        for title in [title for title in unique_titles if title not in [match['title'] for match in matches]]:
            new_matches += get_articles_by_title(title, source)
        matches += new_matches
    return matches

for item in all_search_terms:
    these_matches = {'title': item['text'],
                       'primary': [],
                       'related': []}
    matches = []
    search_text = item['text']
    for source in sources:
        matches += get_articles(search_text, source, matches)
    these_matches['primary'] += matches
    num_related = len(item['related'])
    if not matches and num_related > 0:
        for i in range(num_related):
            search_text = get_text(item['related'][i])
            for source in sources:
                matches += get_articles(search_text, articles_adv, matches)
            these_matches['related'] += matches
    article_matches.append(these_matches)

pp.pprint(article_matches)

NameError: name 'articles_adv' is not defined

In [765]:
get_articles_by_title('information processing', articles_adv)

[{'articleId': 106312,
  'articleTypeId': 1,
  'title': 'information processing',
  'lastUpdated': '2021-07-01'}]

In [757]:
# get_articles_by_title('natural frequency', articles_adv)
get_britannica_topic_titles('natural-frequency')

['mechanics', 'spectroscopy', 'analysis', 'wave']