# Summary

Make the `wiki_summary` function better at handling missing or ambiguous pages. Also check whether image retrieval is easy (or even possible).

In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [125]:
from bs4 import BeautifulSoup as BS
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
import requests
from tldextract import extract
import wikipedia as wiki

from jabberwocky.config import C
from jabberwocky.openai_utils import load_prompt, load_openai_api_key
from jabberwocky.external_data import W
from htools import *

In [3]:
cd_root()

Current directory: /Users/hmamin/jabberwocky


In [134]:
def _wiki_search(name, *tags):
    """Google a name (plus optional tags) and return the result links.

    Parameters
    ----------
    name: str
        Search term, e.g. a person's name. Split on whitespace into terms.
    *tags: str
        Extra search terms appended after the name (e.g. 'politician').

    Returns
    -------
    list[str]: The href of every anchor in the results page that starts with
    '/url?q=', with that prefix removed. Anything that follows the target
    URL inside the href (e.g. '&sa=...') is left in place.
    """
    prefix = '/url?q='
    terms = ['wikipedia'] + name.split() + list(tags)
    # Let requests build the query string so characters such as '&' or '#'
    # inside a term are percent-encoded instead of corrupting the URL.
    r = requests.get('http://www.google.com/search',
                     params={'q': ' '.join(terms)}, timeout=10)
    # href=True skips anchors that have no href attribute; indexing those
    # with ['href'] would raise a KeyError.
    links = BS(r.text, 'lxml').find_all('a', href=True)
    # Slice off the leading prefix only, rather than replacing every
    # occurrence of it inside the link.
    return [link['href'][len(prefix):] for link in links
            if link['href'].startswith(prefix)]

In [15]:
def wiki_summary(name, *tags):
    """Return the first line of a wikipedia page's summary.

    Parameters
    ----------
    name: str
        Page to look up. It is title-cased and its spaces are replaced with
        underscores before being passed to `W.page`.
    *tags: str
        Accepted but currently unused.

    Returns
    -------
    str: First line of the page summary.

    Raises
    ------
    RuntimeError: If the page doesn't exist, if its summary is empty, or if
    the first line of the summary ends with 'may refer to:' (treated as an
    ambiguous search term).
    """
    page = W.page(name.title().replace(' ', '_'))
    if not page.exists():
        raise RuntimeError('Wikipedia page not found. Provide a URL instead.')
    # splitlines() on an empty summary returns [], so indexing [0] directly
    # would raise an IndexError rather than the RuntimeError used elsewhere.
    lines = page.summary.splitlines()
    if not lines:
        raise RuntimeError('Wikipedia page has no summary. Provide a URL '
                           'instead.')
    summary = lines[0]
    if summary.endswith('may refer to:'):
        raise RuntimeError('Ambiguous search term. Provide a URL instead.')
    return summary

In [9]:
wiki_summary('j.k. rowling')

'Joanne Rowling  ( ROH-ling; born 31 July 1965), better known by her pen name J. K. Rowling, is a British author, philanthropist, film producer, television producer, and screenwriter. She is best known for writing the Harry Potter fantasy series, which has won multiple awards and sold more than 500 million copies, becoming the best-selling book series in history. The books are the basis of a popular film series, over which Rowling had overall approval on the scripts and was a producer on the final films. She also writes crime fiction under the pen name Robert Galbraith.'

In [31]:
wiki_summary('jk rowling')

RuntimeError: Wikipedia page not found. Provide a URL instead.

In [13]:
wiki_summary('chris lee')

'Christopher Lee (1922–2015) was an English actor and singer.'

In [57]:
r = _wiki_search('chris lee', 'politician')

In [59]:
eprint(r[:5])

 0: /url?q=https://en.wikipedia.org/wiki/Chris_Lee_(New_York_politician)&sa=U&ved=2ahUKEwibvu6Ch4fxAhX1IDQIHXM-AQgQFjAAegQIBBAB&usg=AOvVaw3enElqWa8R0y2pOkEEO5tz
 1: /url?q=https://en.wikipedia.org/wiki/Chris_Lee_(New_York_politician)%23Biography&sa=U&ved=2ahUKEwibvu6Ch4fxAhX1IDQIHXM-AQgQ0gIwAHoECAQQAg&usg=AOvVaw12Agt6zY_XL1Xa9tjlYGzz
 2: /url?q=https://en.wikipedia.org/wiki/Chris_Lee_(New_York_politician)%23Political_campaigns&sa=U&ved=2ahUKEwibvu6Ch4fxAhX1IDQIHXM-AQgQ0gIwAHoECAQQAw&usg=AOvVaw1ZKZmRVWBsqKA8zcN8V8H-
 3: /url?q=https://en.wikipedia.org/wiki/Chris_Lee_(New_York_politician)%23U.S._House_of_Representatives&sa=U&ved=2ahUKEwibvu6Ch4fxAhX1IDQIHXM-AQgQ0gIwAHoECAQQBA&usg=AOvVaw1pKb3uxYvCyhZLp3jc0lCe
 4: /url?q=https://en.wikipedia.org/wiki/Chris_Lee_(New_York_politician)%23First_term&sa=U&ved=2ahUKEwibvu6Ch4fxAhX1IDQIHXM-AQgQ0gIwAHoECAQQBQ&usg=AOvVaw0Zu3oRyNvkz5EArWYoddOE


In [66]:
extract(r[0].replace('/url?q=', ''))

ExtractResult(subdomain='en', domain='wikipedia', suffix='org')

In [88]:
# Try DuckDuckGo instead of Google: the query uses the '!ducky' bang and is
# restricted to wikipedia.org. Judging by the response text shown in the
# following cells, the reply is an HTML page that redirects to the top hit
# rather than a list of results - TODO confirm this is reliable.
duck_url_fmt = 'https://duckduckgo.com/?q=!ducky+{}+site%3Awikipedia.org'
terms = 'chris lee politician'
# Search terms are joined with '+' to form the query string.
r = requests.get(duck_url_fmt.format('+'.join(terms.split())),
                 headers={'user-agent': 'jabberwocky'})
r

<Response [200]>

In [89]:
r

<Response [200]>

In [90]:
r.text

"<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><meta name='referrer' content='origin'><meta name='robots' content='noindex, nofollow'><meta http-equiv='refresh' content='0; url=/l/?uddg=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FChris_Lee_(New_York_politician)&rut=301a9a08127652533377bab64eb867db64987c1add90ea92d3bafbccfeecbcde'></head><body><script language='JavaScript'>function ffredirect(){window.location.replace('/l/?uddg=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FChris_Lee_(New_York_politician)&rut=301a9a08127652533377bab64eb867db64987c1add90ea92d3bafbccfeecbcde');}setTimeout('ffredirect()',100);</script></body></html>"

In [91]:
r.url

'https://duckduckgo.com/?q=!ducky+chris+lee+politician+site%3Awikipedia.org'

In [None]:
'https://www.google.com/search?btnI=1&q=%s site:developer.mozilla.org'

In [136]:
wiki.PageError

wikipedia.exceptions.PageError

In [138]:
# Check what happens for a title that doesn't exist. wiki.summary returns the
# summary text itself (a str), so there is no `.content` attribute to read:
# only the page object returned by wiki.page has one. The stray `.content`
# would have raised an AttributeError for any title that does exist.
try:
    print(wiki.summary('Derek Jetero',
                       auto_suggest=False).splitlines()[0])
except wiki.PageError:
    print('Not found')

Not found


In [111]:
print(wiki.page('Derek Jeter', auto_suggest=False).content.splitlines()[0])

Derek Sanderson Jeter ( JEE-tər; born June 26, 1974) is an American former professional baseball shortstop, businessman, and baseball executive. He has been the chief executive officer (CEO) and part owner of the Miami Marlins of Major League Baseball (MLB) since September 2017. As a player, Jeter spent his entire 20-year MLB career with the New York Yankees. He was elected to the Baseball Hall of Fame in his first year of eligibility in 2020; he received 396 of 397 possible votes (99.75%), the second-highest percentage in MLB history and the highest by a position player.


In [139]:
print(wiki.summary('Derek Jeter', auto_suggest=False).splitlines()[0])

Derek Sanderson Jeter ( JEE-tər; born June 26, 1974) is an American former professional baseball shortstop, businessman, and baseball executive. He has been the chief executive officer (CEO) and part owner of the Miami Marlins of Major League Baseball (MLB) since September 2017. As a player, Jeter spent his entire 20-year MLB career with the New York Yankees. He was elected to the Baseball Hall of Fame in his first year of eligibility in 2020; he received 396 of 397 possible votes (99.75%), the second-highest percentage in MLB history and the highest by a position player.


In [122]:
# Search with the disambiguating tag appended to the name, then print the
# first line of the summary for the first hit whose title does not contain
# '(disambiguation)'.
matches = wiki.search('Chris Lee politician')
for match in matches:
    if '(disambiguation)' in match: continue
    summary = wiki.summary(match, auto_suggest=False)
    print(summary.splitlines()[0])
    # Only the first usable hit is needed.
    break

Christopher John Lee (born April 1, 1964) is a former Republican member of the United States House of Representatives for New York's 26th congressional district. He served from January 2009 until he resigned on February 9, 2011, after it was revealed that he had solicited a woman on Craigslist.


In [127]:
# Same search-then-summarize approach, this time starting from a name that
# is not the page title. `summary` stays in scope afterwards and is reused
# by the whitespace cleanup cells.
matches = wiki.search('Joanne Rowling author')
for match in matches:
    if '(disambiguation)' in match: continue
    summary = wiki.summary(match, auto_suggest=False)
    print(summary.splitlines()[0])
    # Only the first usable hit is needed.
    break

Joanne Rowling  ( ROH-ling; born 31 July 1965), better known by her pen name J. K. Rowling, is a British author, philanthropist, film producer, television producer, and screenwriter. She is best known for writing the Harry Potter fantasy series, which has won multiple awards and sold more than 500 million copies, becoming the best-selling book series in history. The books are the basis of a popular film series, over which Rowling had overall approval on the scripts and was a producer on the final films. She also writes crime fiction under the pen name Robert Galbraith.


In [128]:
# Collapse runs of 2+ whitespace characters into a single space. The pattern
# is a raw string so that '\s' is not treated as an (invalid) string escape.
re.sub(r'\s{2,}', ' ', summary.splitlines()[0])

'Joanne Rowling ( ROH-ling; born 31 July 1965), better known by her pen name J. K. Rowling, is a British author, philanthropist, film producer, television producer, and screenwriter. She is best known for writing the Harry Potter fantasy series, which has won multiple awards and sold more than 500 million copies, becoming the best-selling book series in history. The books are the basis of a popular film series, over which Rowling had overall approval on the scripts and was a producer on the final films. She also writes crime fiction under the pen name Robert Galbraith.'

In [133]:
_wiki_text_cleanup(summary.splitlines()[0])

'Joanne Rowling (ROH-ling; born 31 July 1965), better known by her pen name J. K. Rowling, is a British author, philanthropist, film producer, television producer, and screenwriter. She is best known for writing the Harry Potter fantasy series, which has won multiple awards and sold more than 500 million copies, becoming the best-selling book series in history. The books are the basis of a popular film series, over which Rowling had overall approval on the scripts and was a producer on the final films. She also writes crime fiction under the pen name Robert Galbraith.'

In [143]:
from wikipedia import PageError

In [132]:
def _wiki_text_cleanup(text):
    text = text.replace('( ', '(').replace(' )', ')')
    return re.sub('\s{2,}', ' ', text)

In [187]:
def wiki_page(name, *tags, retry=True, debug=False):
    """Load a wikipedia page by title, falling back to a search.

    Parameters
    ----------
    name: str
        Page title to look up. auto_suggest is disabled, so it is used as is.
    *tags: str
        Extra search terms (e.g. 'author'). Only used by the fallback search.
    retry: bool
        If True, a title that is missing or ambiguous triggers a search for
        `name` plus `tags`, and the first result that loads is returned.
    debug: bool
        If True, print the search results considered by the fallback.

    Returns
    -------
    The page object returned by `wiki.page`.

    Raises
    ------
    ValueError: If no page could be found.
    wiki.DisambiguationError: If `name` is ambiguous and retry is False.
    """
    try:
        return wiki.page(name, auto_suggest=False)
    except PageError:
        if not retry:
            raise ValueError(f'Couldn\'t find wikipedia page for {name}.') \
                from None
    except wiki.DisambiguationError:
        # An ambiguous title is exactly the case the tags are meant to
        # resolve, so fall through to the search unless retries are disabled.
        if not retry:
            raise
    warnings.warn('Page not found. Trying to auto-select correct match.')

    terms = ' '.join(name.split() + list(tags))
    matches = wiki.search(terms)
    if debug: print('matches:', matches)
    for match in matches:
        if '(disambiguation)' in match: continue
        # Keep going when a hit can't be loaded instead of giving up on the
        # first failure.
        try:
            return wiki.page(match, auto_suggest=False)
        except (PageError, wiki.DisambiguationError):
            continue
    # Previously this fell off the end and returned None, which only
    # surfaced later as an AttributeError in the caller.
    raise ValueError(f'Couldn\'t find wikipedia page for {name}.')

In [198]:
def download_image(url, out_path, verbose=False):
    """Ported from spellotape. Given a URL, fetch an image and download it to
    the specified path.

    Missing parent directories of `out_path` are created. If the download
    fails part way through, the partially written file is removed so that a
    False return value never leaves a truncated image behind.

    Parameters
    ----------
    url: str
        Location of image online.
    out_path: str
        Path to download the image to.
    verbose: bool
        If True, prints a message alerting the user when the image could not
        be retrieved.

    Returns
    -------
    bool: Specifies whether image was successfully retrieved.
    """
    started_writing = False
    try:
        with requests.get(url, stream=True, timeout=10) as r:
            if r.status_code != 200:
                if verbose: print(f'STATUS CODE ERROR: {url}')
                return False

            # Without this, a missing directory would just look like a failed
            # download (silent False when verbose is off).
            parent = os.path.dirname(out_path)
            if parent:
                os.makedirs(parent, exist_ok=True)

            # Write bytes to file chunk by chunk.
            with open(out_path, 'wb') as f:
                started_writing = True
                for chunk in r.iter_content(256):
                    f.write(chunk)

    # Any time url cannot be accessed, don't care about exact error.
    except Exception as e:
        if verbose: print(e)
        if started_writing:
            # Best-effort cleanup of the truncated file; the download has
            # already failed so a cleanup error is not worth raising.
            try:
                os.remove(out_path)
            except OSError:
                pass
        return False

    return True

In [206]:
def wiki_data(name, img_path=None, *tags, **kwargs):
    """Fetch a cleaned-up summary (and optionally a photo) for a wikipedia
    page.

    Parameters
    ----------
    name: str
        Page title or search term, passed to `wiki_page`.
    img_path: str or None
        If provided, try to download an image from the page to this path.
        Note this is the second positional argument, so tags can only be
        passed positionally after it.
    *tags: str
        Extra search terms forwarded to `wiki_page`.
    **kwargs
        Forwarded to `wiki_page` (e.g. retry, debug).

    Returns
    -------
    Results: `summary` holds the cleaned first line of the page summary
    (empty string if the page has no summary), `has_img` says whether an
    image was downloaded to `img_path`.

    Raises
    ------
    ValueError: If `wiki_page` did not return a page.
    """
    page = wiki_page(name, *tags, **kwargs)
    if page is None:
        raise ValueError(f'Couldn\'t find wikipedia page for {name}.')

    # splitlines() returns [] for an empty summary, so don't index blindly.
    lines = page.summary.splitlines()
    summary = lines[0] if lines else ''

    has_img = False
    # page.images is only touched when an image was actually requested.
    if img_path:
        # The image list may include non-photo media (e.g. SVG icons), so
        # only consider URLs that end in a raster image extension.
        photos = [url for url in page.images
                  if url.lower().endswith(('.jpg', '.jpeg', '.png', '.gif'))]
        has_img = bool(photos) and download_image(photos[0], img_path)
    return Results(summary=_wiki_text_cleanup(summary),
                   has_img=has_img)

In [178]:
wiki_page('jo rowling', 'author')

  import sys


<WikipediaPage 'J. K. Rowling'>

In [181]:
wiki_page('mike schur')

  import sys


<WikipediaPage 'Michael Schur'>

In [182]:
# Seems to struggle with typos.
wiki_page('yan lecun', 'machine learning', debug=True)

  import sys


matches: ['Deep learning', 'History of artificial intelligence', 'Glossary of artificial intelligence', 'Darkforest', 'Synthetic media']


<WikipediaPage 'Deep learning'>

In [207]:
res = wiki_data('Usain Bolt', 'data/tmp/wiki_photo.png')

In [208]:
res.summary

'Usain St Leo Bolt, (; born 21 August 1986) is a Jamaican retired sprinter, widely considered to be the greatest sprinter of all time. He is a world record holder in the 100 metres, 200 metres and 4 × 100 metres relay. '

In [211]:
W.page('Usain_Bolt').summary

'Usain St Leo Bolt,  (; born 21 August 1986) is a Jamaican retired sprinter, widely considered to be the greatest sprinter of all time. He is a world record holder in the 100 metres, 200 metres and 4 × 100 metres relay. \nAn eight-time Olympic gold medallist, Bolt is the only sprinter to win Olympic 100 m and 200 m titles at three consecutive Olympics (2008, 2012 and 2016). He also won two 4 × 100 relay gold medals. He gained worldwide fame for his double sprint victory in world record times at the 2008 Beijing Olympics, which made him the first person to hold both records since fully automatic time became mandatory. \nAn eleven-time World Champion, he won consecutive World Championship 100 m, 200 m and 4 × 100 metres relay gold medals from 2009 to 2015, with the exception of a 100 m false start in 2011. He is the most successful male athlete of the World Championships. Bolt is the first athlete to win four World Championship titles in the 200 m and is one of the most successful in the