In [17]:
import json
import pprint as pp
import pandas as pd
import re
import inflect
from spellchecker import SpellChecker
import time
import requests
from dotenv import load_dotenv
import os

In [18]:
load_dotenv()
spell = SpellChecker(language='en')
p = inflect.engine()

In [19]:
# read unique phenomena into a list

phenomena = pd.read_csv('data/phenomena/phenomena.csv').squeeze('columns').to_list()
pp.pprint(phenomena)

['Natural Frequency',
 'Electric Fields and Forces',
 'Motion: Accelerated Gravitational Motion',
 'Afterimage',
 'Judgment',
 'Thin Films',
 'Frame of Reference',
 'Nodes and Antinodes',
 'Sublimation',
 'Bernoulli Effect',
 'Color Mixing: Subtractive',
 'Size-Distance',
 'Upwelling',
 'Nonlinear Behavior',
 'Information Processing and Encoding',
 'Resonance',
 'Perception: Spatial',
 'Attention',
 'Melting',
 'Visual Edge Effects',
 'Superposition',
 'Exponentials',
 'Surface Tension of Liquids',
 'Numbers',
 'Color: Complimentary',
 'Binocular Vision',
 'Crystallization',
 'Fatigue',
 'Color Vision',
 'Oscillation',
 'Iridescence',
 'Conservation of Angular Momentum',
 'Decomposition',
 'Polarization',
 'Evaporation',
 'Close Packing',
 'Ferromagnetism',
 'Parabolas',
 'Metacognition',
 'Albedo',
 'Depth Perception',
 'Waves: Transverse',
 'Probability',
 'Shadows',
 'Rhythm',
 'Motion: Composition of Perpendicular Motion',
 'Motion: Simple Harmonic Motion',
 'Social Loafing',
 'Gra

In [20]:
# expand each phenomenon into search phrases to compare against article titles

phenomena_expanded = []

for phenomenon in phenomena:
    this_phenom = [phenomenon.lower()]
    phenom_parsed = re.split(': ', phenomenon)
    if len(phenom_parsed) > 1:
        for i in range(len(phenom_parsed)):
            this_phenom.append(phenom_parsed[i].lower())
        phenom_alt = ' '.join(phenom_parsed[::-1])
        double_word = r'\b(\w+)\s+\1\b'
        if not re.search(double_word, phenom_alt):
            this_phenom.append(phenom_alt.lower())
    for phrase in phenom_parsed:
        phrase_parsed_1 = re.split(' or | of | in ', phrase)
        if len(phrase_parsed_1) > 1:
            for j in range(len(phrase_parsed_1)):
                this_phenom.append(phrase_parsed_1[j].lower())
        phrase_parsed_2 = re.split(' and ', phrase)
        if len(phrase_parsed_2) > 1:
            this_phenom.append(phrase_parsed_2[0].lower())
            phrase_parsed_2_split = phrase_parsed_2[0].split()
            if len(phrase_parsed_2_split) > 1:
                this_phenom.append(' '.join([phrase_parsed_2_split[0].lower(), phrase_parsed_2[1].lower()]))
            else:
                this_phenom.append(phrase_parsed_2[1].lower())

    phenomena_expanded.append(this_phenom)

pp.pprint(phenomena_expanded)

[['natural frequency'],
 ['electric fields and forces', 'electric fields', 'electric forces'],
 ['motion: accelerated gravitational motion',
  'motion',
  'accelerated gravitational motion'],
 ['afterimage'],
 ['judgment'],
 ['thin films'],
 ['frame of reference', 'frame', 'reference'],
 ['nodes and antinodes', 'nodes', 'antinodes'],
 ['sublimation'],
 ['bernoulli effect'],
 ['color mixing: subtractive',
  'color mixing',
  'subtractive',
  'subtractive color mixing'],
 ['size-distance'],
 ['upwelling'],
 ['nonlinear behavior'],
 ['information processing and encoding',
  'information processing',
  'information encoding'],
 ['resonance'],
 ['perception: spatial', 'perception', 'spatial', 'spatial perception'],
 ['attention'],
 ['melting'],
 ['visual edge effects'],
 ['superposition'],
 ['exponentials'],
 ['surface tension of liquids', 'surface tension', 'liquids'],
 ['numbers'],
 ['color: complimentary', 'color', 'complimentary', 'complimentary color'],
 ['binocular vision'],
 ['crysta

In [21]:
# read unique keywords into a list

keywords = pd.read_csv('data/keywords/keywords.csv').squeeze('columns').to_list()
pp.pprint(keywords)

['lenses',
 'center of gravity',
 'vibration',
 'weather',
 'identity',
 'cornsweet illusion',
 'acoustics',
 'evaporation',
 'chicken wire',
 'Ocean',
 'Sharks',
 'Neon',
 'Migrations',
 "Huygens' principle",
 'mirrors',
 'turbulence',
 'carbon dioxide',
 'parallel',
 'phase angle',
 'parabolas',
 'organisms',
 'cosmic rays',
 'simultaneous contrast',
 'series',
 'chaos',
 'amplification',
 'genetics',
 'lasers',
 'artwork',
 'membrane',
 "Newton's Laws",
 'reflections',
 'ratio',
 'circuit',
 'ping pong balls',
 'DNA',
 'levitation',
 'navigation',
 'curves',
 'illusions',
 'model organisms',
 'Whales',
 'gases',
 'portraits',
 'sun',
 'bicycle wheel',
 'speakers',
 'identical twins',
 'pendulums',
 'dissection',
 'eyes',
 'nerves',
 'wind',
 'zebrafish',
 'heat transfer',
 'volcanoes',
 'Marine',
 'Pacific',
 'complexity',
 'Turtles',
 'condensation',
 'appearance',
 'unicellular organisms',
 'tides',
 'ganzfeld effect',
 'afterimage',
 'fragmentation',
 'CMY retina',
 'balance',
 '

In [22]:
# load article lists

with open('data/articles/advanced.json') as file:
    articles_adv = json.load(file)

with open('data/articles/intermediate.json') as file:
    articles_int = json.load(file)

with open('data/articles/concise.json') as file:
    articles_con = json.load(file)

In [23]:
# remove articles which have an empty 'title' field

for article in articles_adv:
    if article['title'] == '':
        articles_adv.remove(article)

for article in articles_int:
    if article['title'] == '':
        articles_int.remove(article)

for article in articles_con:
    if article['title'] == '':
        articles_con.remove(article)

In [27]:
articles_adv_phenom = []

for phenomenon in phenomena_expanded:
    this_phenom = [phenomenon[0]]
    for i in range(len(phenomenon)):
        for article in articles_adv:
            if p.compare(phenomenon[i], article['title'].lower()):
                this_phenom += [article['title'], article['articleId']]
                break
    articles_adv_phenom.append(this_phenom)

pp.pprint(articles_adv_phenom)

# IndexError (list index out of range) -- why??

IndexError: list index out of range

In [25]:
articles_int_phenom = []

for phenomenon in phenomena_expanded:
    this_phenom = [phenomenon[0]]
    for i in range(len(phenomenon)):
        for article in articles_int:
            if p.compare(phenomenon[i], article['title'].lower()):
                this_phenom += [article['title'], article['articleId']]
                break
    articles_int_phenom.append(this_phenom)

pp.pprint(articles_int_phenom)

[['natural frequency'],
 ['electric fields and forces'],
 ['motion: accelerated gravitational motion', 'motion', 623232],
 ['afterimage'],
 ['judgment'],
 ['thin films'],
 ['frame of reference'],
 ['nodes and antinodes'],
 ['sublimation'],
 ['bernoulli effect'],
 ['color mixing: subtractive'],
 ['size-distance'],
 ['upwelling'],
 ['nonlinear behavior'],
 ['information processing and encoding'],
 ['resonance'],
 ['perception: spatial'],
 ['attention'],
 ['melting'],
 ['visual edge effects'],
 ['superposition'],
 ['exponentials'],
 ['surface tension of liquids', 'liquid', 603530],
 ['numbers'],
 ['color: complimentary', 'color', 273736],
 ['binocular vision'],
 ['crystallization'],
 ['fatigue', 'fatigue', 274278],
 ['color vision'],
 ['oscillation'],
 ['iridescence', 'Isis and Osiris', 275096],
 ['conservation of angular momentum', 'conservation', 273782],
 ['decomposition'],
 ['polarization'],
 ['evaporation', 'evaporation', 339359],
 ['close packing'],
 ['ferromagnetism'],
 ['parabolas

In [26]:
articles_con_phenom = []

for phenomenon in phenomena_expanded:
    this_phenom = [phenomenon[0]]
    for i in range(len(phenomenon)):
        for article in articles_con:
            if p.compare(phenomenon[i], article['title'].lower()):
                this_phenom += [article['title'], article['articleId']]
                break
    articles_con_phenom.append(this_phenom)

pp.pprint(articles_con_phenom)

[['natural frequency'],
 ['electric fields and forces', 'electric field', 363544],
 ['motion: accelerated gravitational motion', 'motion', 372696],
 ['afterimage'],
 ['judgment', 'judgment', 368738],
 ['thin films'],
 ['frame of reference'],
 ['nodes and antinodes'],
 ['sublimation', 'sublimation', 379779],
 ['bernoulli effect'],
 ['color mixing: subtractive'],
 ['size-distance'],
 ['upwelling'],
 ['nonlinear behavior'],
 ['information processing and encoding', 'information processing', 367972],
 ['resonance', 'resonance', 376766],
 ['perception: spatial', 'perception', 374885],
 ['attention', 'attention', 356099],
 ['melting'],
 ['visual edge effects'],
 ['superposition'],
 ['exponentials'],
 ['surface tension of liquids', 'surface tension', 379928, 'liquid', 370336],
 ['numbers', 'number', 373752],
 ['color: complimentary'],
 ['binocular vision'],
 ['crystallization'],
 ['fatigue', 'fatigue', 364249],
 ['color vision'],
 ['oscillation'],
 ['iridescence', 'Osiris', 374204],
 ['conserv