In [1]:
import sys
import re
import json
import urllib.request
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
sys.path.insert(0, '../scripts')
from WikiPage import WikiPage

# Load Data

In [2]:
dog_dat = pd.read_csv('../data/Wikipedia-dog.csv', index_col=0)
fish_dat = pd.read_csv('../data/Wikipedia-fish.csv', index_col=0)
dat = pd.concat([dog_dat, fish_dat])
dat.head()

Unnamed: 0,website,title,page_type,text_raw
970284,https://en.wikipedia.org/wiki/Category:Dog_sho...,Category:Dog shows and showing,category,{{Cat main|Conformation show|Show dog}}\n{{por...
972913,https://en.wikipedia.org/wiki/Category:Dog_health,Category:Dog health,category,This is a collection of articles about the hea...
970251,https://en.wikipedia.org/wiki/Category:Dog_org...,Category:Dog organizations,category,This is an automatically collected list of art...
729436,https://en.wikipedia.org/wiki/Category:Dog_sports,Category:Dog sports,category,This is an automatically accumulated list of a...
978163,https://en.wikipedia.org/wiki/Category:Dogs_as...,Category:Dogs as pets,category,[[Category:Dogs|Pets]]\n[[Category:Mammals as ...


In [3]:
dog_xmlfile = '../data/Wikipedia-dog.xml'
dog_tree = ET.parse(dog_xmlfile)
dog_root = dog_tree.getroot()

fish_xmlfile = '../data/Wikipedia-fish.xml'
fish_tree = ET.parse(fish_xmlfile)
fish_root = fish_tree.getroot()

# Code

In [4]:
def get_url_from_wiki_id(wiki_id):
    wiki_id = str(wiki_id)
    request = "https://en.wikipedia.org/w/api.php?action=query&prop=info&pageids=__pageids__&inprop=url&format=json"
    id_request = request.replace("__pageids__", wiki_id)
    s = urllib.request.urlopen(id_request).read()
    article_dict = json.loads(s)
    return str(article_dict['query']['pages'][wiki_id]['fullurl'])

In [5]:
def find_wiki_links(text, remove_non_articles=True):
    searchstr = '(\[[^\[\]]*\[.*?\][^\]\[]*\])'
    link_urls = []
    link_titles = []
    link_titles_raw = re.findall(searchstr, text)
    for link_title_raw in link_titles_raw:
        link_title = re.sub('\|.+', '', link_title_raw)#remove junk at end
        link_title = re.sub('\#.+', '', link_title)#remove junk at end
        link_title = re.sub('\[\[', '', link_title)#remove brackets
        link_title = re.sub('\]\]', '', link_title)#remove brackets
        if remove_non_articles:
            if link_type_from_title(link_title) != 'article':
                continue
        link_titles.append(link_title)
        link_title = re.sub(' ', '_',  link_title)
        link_url = 'https://en.wikipedia.org/wiki/%s' % (link_title)
        link_urls.append(link_url)
    return list(set(zip(link_titles, link_urls)))

def link_type_from_title(title):
    if 'Category:' in title:
        return 'category'
    elif 'Portal:' in title:
        return 'portal'
    elif 'List of' in title:
        return 'list'
    elif 'File:' in title:
        return 'file'
    elif 'Image:' in title:
        return 'image'
    else:
        return 'article'

In [6]:
def query_wiki_api(title):
    base_query = 'https://en.wikipedia.org/w/api.php?action=query&titles= &prop=revisions&rvprop=content&format=json'
    query = re.sub(' ', title, base_query)
    query = re.sub(' ', '%20', query)
    result = urllib.request.urlopen(query).read()
    article_dict = json.loads(result)
    return article_dict

def get_text_from_wiki_api(title):
    article_dict = query_wiki_api(title)
    wiki_id = list(article_dict['query']['pages'].keys())[0]
    try:
        return article_dict['query']['pages'][wiki_id]['revisions'][0]['*']
    except:
        return ''

In [7]:
def generate_dist_1_links_from_df(df):
    '''
    For each article, get a list of the titles of all internally-linked Wiki articles
    INPUT
    df : pd.DataFrame
      rows: wiki ids, columns: titles, text_raw
      
    OUTPUT
    pd.DataFrame
      rows: wiki ids, columns: titles, dist1_titles (titles of 1-distance links)
    '''
    out_df = pd.DataFrame(columns = ['title', 'dist1_titles'])
    for idx in df.index:
        text = df.text_raw.loc[idx]
        title = df.title.loc[idx]
        dist1_links = np.array(find_wiki_links(text))
        try:
            dist1_titles = list(dist1_links[:,0])
            out_df.loc[idx] = [title, dist1_titles]
        except:
            out_df.loc[idx] = [title, np.nan]
    return out_df

# Test Scripts

In [8]:
wp = WikiPage(dog_root[45])
print(wp.text_parentid)
#dat.loc[wp.text_parentid]

790810978


In [9]:
wiki_id = str(wp.text_parentid)
request = "https://en.wikipedia.org/w/api.php?action=query&prop=info&pageids=__pageids__&inprop=url&format=json"
id_request = request.replace("__pageids__", wiki_id)
s = urllib.request.urlopen(id_request).read()
article_dict = json.loads(s)
#print(str(article_dict['query']['pages'][wiki_id]['fullurl']))

In [10]:
article_dict

{'batchcomplete': '',
 'query': {'pages': {'790810978': {'missing': '', 'pageid': 790810978}}}}

In [11]:
print(wp.website)

https://en.wikipedia.org/wiki/Origin_of_the_domestic_dog


In [12]:
text = wp.text_raw
print(text)

{{pp-semi-indef}}
[[File:Canis lupus howling on glacial erratic.jpg|thumb|400px|The dog diverged from a now-extinct population of wolves immediately before the [[Last Glacial Maximum]], when much of Eurasia was a cold, dry [[mammoth steppe]] [[biome]].]]
The origin of the domestic dog is not clear. The [[domestic dog]] is a member of genus ''[[Canis]]'' (canines) that forms part of the [[Evolution of the wolf#Wolf-like canids|wolf-like canids]], and is the most widely abundant [[carnivore]].<ref name=fan2016/><ref name=thalmann2013/><ref name=vila1999/> The closest living relative of the dog is the [[gray wolf]] and there is no evidence of any other canine contributing to its genetic lineage.<ref name=fan2016/><ref name=thalmann2013/><ref name=vonholdt2010/><ref name=vila1997/> The dog and the extant gray wolf form two sister [[clades]],<ref name=vila1997/><ref name=freedman2014/><ref name=larson2014/> with modern wolves not closely related to the wolves that were first domesticated.<r

In [13]:
print(find_wiki_links(text))

[('Forensic Science International: Genetics', 'https://en.wikipedia.org/wiki/Forensic_Science_International:_Genetics'), ('agriculturalists', 'https://en.wikipedia.org/wiki/agriculturalists'), ('yDNA', 'https://en.wikipedia.org/wiki/yDNA'), ('Quaternary glaciation', 'https://en.wikipedia.org/wiki/Quaternary_glaciation'), ('Honshu', 'https://en.wikipedia.org/wiki/Honshu'), ('carnivorous', 'https://en.wikipedia.org/wiki/carnivorous'), ('glycogen', 'https://en.wikipedia.org/wiki/glycogen'), ('Lapponian Herder', 'https://en.wikipedia.org/wiki/Lapponian_Herder'), ('Pleistocene megafauna', 'https://en.wikipedia.org/wiki/Pleistocene_megafauna'), ('starch', 'https://en.wikipedia.org/wiki/starch'), ('wild boar', 'https://en.wikipedia.org/wiki/wild_boar'), ('On the Origin of Species', 'https://en.wikipedia.org/wiki/On_the_Origin_of_Species'), ('whole genome sequencing', 'https://en.wikipedia.org/wiki/whole_genome_sequencing'), ('Taymyr Peninsula', 'https://en.wikipedia.org/wiki/Taymyr_Peninsula'

In [14]:
test_title = find_wiki_links(text)[0][0]
print(test_title)

Forensic Science International: Genetics


In [15]:
test_text = get_text_from_wiki_api(test_title)
print(test_text)

{{Infobox journal
| title         = Forensic Science International: Genetics
| cover         = [[File:Forensic_Science_International,_Genetics_cover.jpg|150px]]
| caption       = 
| former_name   = <!-- or |former_names= -->
| abbreviation  = Forensic Sci. Int. Genet.
| discipline    = [[Forensic science]]<!-- or |subject= -->
| peer-reviewed = 
| language      = 
| editor        = [[A. Carracedo]]
| publisher     = [[Elsevier]]
| country       = 
| history       = 2007&ndash;present
| frequency     = Bimonthly
| openaccess    = 
| license       = 
| impact        = 4.604
| impact-year   = 2014
| ISSNlabel     = Forensic Science International: Genetics
| ISSN         = 1872-4973
| eISSN        =
| CODEN         =
| JSTOR         = 
| LCCN          = 
| OCLC          = 901016687
| ISSN2label    = Forensic Science International: Genetics Supplement Series
| ISSN2         = 1875-1768
| eISNN2        = 
| CODEN2        =
| JSTOR2        =
| LCCN2         =
| OCLC2         = 605195741
| web

In [16]:
test_links = np.array(find_wiki_links(test_text))
test_links[:,0]

array(['Scopus', 'Excerpta Medica', 'MEDLINE', 'peer-review',
       'forensic science', 'Elsevier', 'EMBASE',
       'University of Santiago de Compostela', 'Forensic science',
       'academic journal', 'A. Carracedo', 'Forensic Science International'], 
      dtype='<U66')

In [17]:
find_wiki_links(test_text)

[('Scopus', 'https://en.wikipedia.org/wiki/Scopus'),
 ('Excerpta Medica', 'https://en.wikipedia.org/wiki/Excerpta_Medica'),
 ('MEDLINE', 'https://en.wikipedia.org/wiki/MEDLINE'),
 ('peer-review', 'https://en.wikipedia.org/wiki/peer-review'),
 ('forensic science', 'https://en.wikipedia.org/wiki/forensic_science'),
 ('Elsevier', 'https://en.wikipedia.org/wiki/Elsevier'),
 ('EMBASE', 'https://en.wikipedia.org/wiki/EMBASE'),
 ('University of Santiago de Compostela',
  'https://en.wikipedia.org/wiki/University_of_Santiago_de_Compostela'),
 ('Forensic science', 'https://en.wikipedia.org/wiki/Forensic_science'),
 ('academic journal', 'https://en.wikipedia.org/wiki/academic_journal'),
 ('A. Carracedo', 'https://en.wikipedia.org/wiki/A._Carracedo'),
 ('Forensic Science International',
  'https://en.wikipedia.org/wiki/Forensic_Science_International')]

In [18]:
dog_link_df = generate_dist_1_links_from_df(dog_dat)
#dog_link_df.to_csv('../data/Wikipedia-dog-links.csv')
dog_link_df.head()

Unnamed: 0,title,dist1_titles
970284,Category:Dog shows and showing,[conformation show]
972913,Category:Dog health,
970251,Category:Dog organizations,[dog]
729436,Category:Dog sports,
978163,Category:Dogs as pets,


In [19]:
fish_link_df = generate_dist_1_links_from_df(fish_dat)
#fish_link_df.to_csv('../data/Wikipedia-fish-links.csv')
fish_link_df.head()

Unnamed: 0,title,dist1_titles
12161313,Category:Lists of fishes,
4699587,Fish,"[freshwater, Embiotocidae, arthropod, koi, Chi..."
1834140,Category:Fish stubs,[fish]
11037163,Category:Fish health,
7998150,Category:Fish anatomy,


# In Progress

In [21]:
def wiki_article_path_length(title_a, title_b, path_length=0, max_path_length=5):
    '''
    Finds path length between title_a and title_b
    (i.e. number of internal wiki links that must be traversed)
    
    INPUT
    title_a, title_b : str
      title of a Wiki article
    
    OUTPUT
    path_length : int
      number of links that must be traversed to get from title_a to title_b
    '''
    if title_a == title_b:
        return path_length
    while path_length <= 5:
        path_length += 1
        tmp_links = find_wiki_links(get_text_from_wiki_api(title_a))
        if title_b in tmp_links:
            return path_length
        else:
            for tmp_title, _ in tmp_links:
                wiki_article_path_length(tmp_title, title_b, path_length)
    return max_path_length

In [22]:
#wiki_article_path_length('therapy dog', 'Service animal')
title_a = 'therapy dog'
title_b = 'Service animal'
if title_b in find_wiki_links(title_a):
    print('1')
#wiki_article_path_length(title_a, title_b)