In [356]:
from language_processing import *
import json
import pprint as pp
import pandas as pd
import re
import nltk
from nltk.corpus import wordnet
from dotenv import load_dotenv

In [None]:
load_dotenv()

# Download each NLTK resource listed below. Using one loop (rather than
# ending the cell on a bare nltk.download(...) expression) keeps the cell
# from echoing a stray `True` as its output and removes the repeated calls.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

In [353]:
# load article lists

def _load_json(path):
    """Parse and return the JSON document stored at ``path``.

    The file is opened explicitly as UTF-8 instead of with the platform's
    default text encoding, so the result does not depend on the locale of
    the machine running the notebook.
    """
    with open(path, encoding='utf-8') as file:
        return json.load(file)


articles_adv = _load_json('data/articles/advanced.json')
articles_int = _load_json('data/articles/intermediate.json')
articles_con = _load_json('data/articles/concise.json')

In [354]:
# remove articles which have an empty 'title' field
#
# Each list is rebuilt rather than calling .remove() while looping over it:
# removing an item during iteration shifts the remaining items left, so the
# entry right after a removed one is skipped and consecutive empty-title
# articles would survive. Slice assignment updates the existing list objects
# in place, so any other reference to them sees the filtered contents.

for article_list in (articles_adv, articles_int, articles_con):
    article_list[:] = [
        article for article in article_list if article['title'] != ''
    ]

In [355]:
# load search terms

# Matches one "('word', 'TAG')" pair inside a stringified list of tuples and
# captures the word and the upper-case tag.
tuple_pattern = r"\('(\w+)',\s'([A-Z]+)'\)"


def _parse_tagged_phrases(rows):
    """Turn stringified tuple lists into lists of (word, tag) string pairs.

    ``rows`` is an iterable of strings; the surrounding square brackets are
    stripped from each one and every match of ``tuple_pattern`` is collected.
    Returns one list of 2-tuples per input row, in the same order.
    """
    return [re.findall(tuple_pattern, row.strip('[]')) for row in rows]


with open('data/search_terms/primary_and_related.json', encoding='utf-8') as file:
    all_search_terms = json.load(file)

primary = pd.read_csv('data/search_terms/primary.csv', header=0).squeeze('columns').tolist()
primary_clean = _parse_tagged_phrases(primary)

related = pd.read_csv('data/search_terms/related.csv', header=0).squeeze('columns').tolist()
related_clean = _parse_tagged_phrases(related)

In [393]:
def get_text(words):
    """Return the first element of every item in ``words``, space-separated.

    ``words`` is an iterable of indexable items (e.g. ``(word, tag)`` pairs);
    an empty iterable yields the empty string.
    """
    tokens = [entry[0] for entry in words]
    return ' '.join(tokens)

def get_articles_by_title(text, source):
    """Return every article in ``source`` whose 'title' equals ``text``.

    The comparison is an exact ``==`` on ``article['title']``; matching
    articles are returned in their original order, and an empty list is
    returned when nothing matches.
    """
    found = []
    for candidate in source:
        if candidate['title'] == text:
            found.append(candidate)
    return found

def get_pos(word):
    """Map a tagged word's tag to a one-letter part-of-speech code.

    Only ``word[1]`` (the tag) is read. The tag is looked up, in this order,
    in ``noun_labels``, ``verb_labels``, ``adj_labels`` and ``adv_labels``,
    and the function returns 'n', 'v', 'a' or 'r' respectively.

    Raises:
        ValueError: if the tag is in none of the four label collections.
            (Previously this case fell through to ``return pos`` with ``pos``
            unbound and surfaced as an opaque UnboundLocalError.)
    """
    tag = word[1]
    if tag in noun_labels:
        return 'n'
    if tag in verb_labels:
        return 'v'
    if tag in adj_labels:
        return 'a'
    if tag in adv_labels:
        return 'r'
    raise ValueError(
        'no part-of-speech code for tag {!r} (word: {!r})'.format(tag, word)
    )

def get_syns(word, sense):
    """Look up one WordNet synset for a tagged word.

    Builds a synset name of the form ``<text>.<pos>.<NN>`` -- text from
    ``get_text``, pos from ``get_pos``, and the integer ``sense`` zero-padded
    to two digits -- and returns whatever ``wordnet.synset`` gives for it.

    NOTE(review): WordNet synset names are usually written starting at
    sense 01 (e.g. 'dog.n.01'); confirm which synset ``sense=0`` resolves to.
    """
    lemma = get_text([word])
    pos_code = get_pos(word)
    synset_name = f'{lemma}.{pos_code}.{sense:02d}'
    return wordnet.synset(synset_name)

# def get_similar(search_phrase):
#     max_depth = 3
#     num_words = len(search_phrase)
#     sim_phrases = []
#     for i in range(max_depth):
#         for j in range(num_words):
#             syns = get_syns(search_phrase[j], i)
#             words = []
#             for k in range(num_words):


In [402]:
# testing: fetch the synset for ('movement', 'NN') with sense=0 and show its definition

movement_synset = get_syns(('movement', 'NN'), 0)
movement_synset.definition()

'the act of changing the location of something'

In [304]:
# under construction

matches_adv = []

for search_phrase in primary_clean:
    # match_type records how the phrase was matched:
    #   0 = no match, 1 = exact title match,
    #   2 = title match after passing every word through get_singular
    match_type = 0
    has_noun = any(word[1] in noun_labels for word in search_phrase)

    search_text = get_text(search_phrase)
    matches = get_articles_by_title(search_text, articles_adv)
    if matches:
        match_type = 1
    elif has_noun:
        # retry with the output of get_singular for each word of the phrase
        search_text = ' '.join(get_singular(word) for word in search_phrase)
        matches = get_articles_by_title(search_text, articles_adv)
        if matches:
            match_type = 2
    # elif match_type == 0 (and other conditions):
        # check other types of similarity
        # crawl https://www.britannica.com/science/<search_phrase>
        # crawl https://www.britannica.com/search?query=<search_phrase>

    # print('{}: {}'.format(get_text(search_phrase), match_type))

    # keep only phrases that produced at least one article
    if matches:
        matches_adv.append(matches)

pp.pprint(matches_adv)

[[{'articleId': 35365,
   'articleTypeId': 1,
   'lastUpdated': '2021-06-30',
   'title': 'frequency'}],
 [{'articleId': 32278,
   'articleTypeId': 1,
   'lastUpdated': '2021-06-30',
   'title': 'electric field'}],
 [{'articleId': 472325,
   'articleTypeId': 1,
   'lastUpdated': '2022-03-16',
   'title': 'field'}],
 [{'articleId': 34834,
   'articleTypeId': 1,
   'lastUpdated': '2021-10-21',
   'title': 'force'}],
 [{'articleId': 53958,
   'articleTypeId': 1,
   'lastUpdated': '2021-06-30',
   'title': 'motion'},
  {'articleId': 53959,
   'articleTypeId': 1,
   'lastUpdated': '2021-06-30',
   'title': 'motion'}],
 [{'articleId': 3959,
   'articleTypeId': 1,
   'lastUpdated': '2021-06-30',
   'title': 'afterimage'}],
 [{'articleId': 44086,
   'articleTypeId': 1,
   'lastUpdated': '2021-06-30',
   'title': 'judgment'}],
 [{'articleId': 110698,
   'articleTypeId': 1,
   'lastUpdated': '2021-10-15',
   'title': 'film'}],
 [{'articleId': 63015,
   'articleTypeId': 1,
   'lastUpdated': '2021