### IMPORTS

#### Search Wikidata

In [None]:
from qwikidata.sparql import (get_subclasses_of_item, return_sparql_query_results)
import pandas as pd
import functions as fn

#### Beautiful Soup Scrape for Tables

In [None]:
from IPython.display import display, HTML
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from urllib.request import urlopen
import re
import random
import json
import csv

from itertools import chain
import json

In [None]:
# Display options:
# Show every row when a DataFrame is rendered (None removes the row limit).
# The fully qualified key is required: the abbreviated "max_rows" pattern also
# matches "styler.render.max_rows" on newer pandas releases, which makes
# set_option raise an OptionError for an ambiguous key.
pd.set_option("display.max_rows", None)

### Wikidata Search

#### Get list of Ethiopian urls to scrape for tables

In [None]:
# Column headers wanted in the cleaned results; they mirror the variables named
# on the "SELECT" line of the SPARQL query.
headers = [
    'item',
    'itemLabel',
    'article_url',
    'instanceOfLabel',
]

In [None]:
# Generate SPARQL Query
#
# The query selects every item whose P17 value is Q115 (Ethiopia), together
# with its "instance of" (P31) value and, when one exists, the URL of its
# English Wikipedia article. The SELECT variables match the `headers` list.
# NOTE(review): the `bd:` and `rdfs:` prefixes are used without a PREFIX line;
# this presumably relies on the endpoint predefining them -- confirm if the
# query is ever pointed at a different SPARQL service.
# NOTE(review): ?sub is only bound by a line that is commented out inside the
# query, so ?subLabel is never populated.

sparql_query = """

PREFIX schema: <http://schema.org/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT DISTINCT ?item ?itemLabel ?article_url ?instanceOfLabel #?subLabel

WHERE {
  
  # item = sovereign state = Ethiopia
  ?item wdt:P17 wd:Q115 .
  
  # the item's property "instance of" 
  ?item wdt:P31 ?instanceOf . 

  OPTIONAL {
    
    # the item's "subclass of" category: commented out. filters out too many results
    #?item wdt:P279 ?sub .
    
    ?article_url schema:about ?item .
    ?article_url schema:inLanguage "en" .
    ?article_url schema:isPartOf <https://en.wikipedia.org/> . }
  
  SERVICE wikibase:label { 
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".
    ?item rdfs:label ?itemLabel . 
    ?article_url rdfs:label ?article_url_label . 
    ?instanceOf rdfs:label ?instanceOfLabel . 
    ?sub rdfs:label ?subLabel .
  }
}

"""

# Run the query through the qwikidata helper; the raw results are cleaned in a
# later cell by fn.clean_wikidata.
results = return_sparql_query_results(sparql_query)

### CLEAN WIKIDATA 

In [None]:
# Turn the raw SPARQL results into a table using the chosen column headers.
# The result is later indexed by column name ("article_url", "instanceOfLabel"),
# so it is presumably a pandas DataFrame -- confirm in functions.py.
df_clean = fn.clean_wikidata(results, headers)

In [None]:
# Build the wikidata scrape list: one [article_url, instanceOfLabel] pair per row
url_columns = ["article_url", "instanceOfLabel"]
wikidata_urls = df_clean[url_columns].values.tolist()

### SCRAPE and CLEAN Wikipedia: 
https://en.wikipedia.org/wiki/Special:AllPages?from=ethiopia&to=&namespace=0

In [None]:
# Scrape the manually discovered wikipedia pages (related to Ethiopia) for tagged categories
# The result is later sliced and concatenated with the wikidata url list, so it
# is expected to be a list.
# NOTE(review): how df_clean is used by the scraper is not visible here --
# confirm in functions.py.
wikipedia_urls = fn.scrape_wikipedia(df_clean)

### Join wikidata and wikipedia search lists

In [None]:
# Number of leading entries of the wikipedia scrape that are kept. Everything
# after this position was judged unrelated to Ethiopia via manual inspection,
# so the value must be re-checked whenever the scrape is re-run.
N_RELEVANT_WIKIPEDIA_URLS = 958

# Clean out trailing urls not associated with Ethiopia.
# Slicing is idempotent, so re-running this cell gives the same list.
wikipedia_urls = wikipedia_urls[:N_RELEVANT_WIKIPEDIA_URLS]

# Add the wikidata and wikipedia url lists together with property tags
url_list_full = wikidata_urls + wikipedia_urls
print(f'Total urls to scrape for tables: {len(url_list_full)}')

### Scrape wikipedia pages' urls and return any tables that are on the page

In [None]:
# table_scrapes('list of urls to scrape', 'probability of delay', 'max delay')
# delay prevents search from being blocked
# Here: probability of delay = .1, max delay = 5
# (units are not visible in this notebook -- presumably seconds; confirm in functions.py)

# THIS SCRAPE TAKES ~1 HOUR TO COMPLETE

# Later cells index the result as super_list[position][key][table_number].
super_list = fn.table_scrapes(url_list_full, .1, 5)

In [None]:
# Display articles with number of relevant tables:
df_all = fn.display_all_results(super_list)

# clean up list structure of properties
# NOTE(review): the lambda below returns each value unchanged, so as written
# this line does not restructure the column -- confirm what cleanup was intended.
df_all['properties'] = df_all.properties.apply(lambda x: x)
# Show the first 50 rows as the cell output.
df_all.head(50)

### Explore some of the tables:

In [None]:
# Show the first table stored under the 'Gilgel_Gibe_I_Dam' key of entry 45.
# NOTE(review): the position 45 is hard-coded and presumably depends on the
# order of the scrape -- confirm it still points at this article after a re-run.
super_list[45]['Gilgel_Gibe_I_Dam'][0]

## Exploring Some Search Options (work in progress; not yet refined)

In [None]:
# GEOSPATIAL search for a keyword
keyword = "cattle"

# coord, point, loc and t all hold the result of the same search for `keyword`,
# so the search is run once and the four names share that single result.
# Treat it as read-only: changing it through one name changes it for all.
# NOTE(review): the names suggest separate "coord" / "point" / "loc" searches
# may have been intended -- confirm before relying on the per-name counts.
keyword_hits = fn.keyword_search(super_list, keyword)
coord = point = loc = t = keyword_hits

# Print the fourth field of every hit.
for tt in loc:
    print(tt[3])


In [None]:
# Collect the first field of every hit, separately for each search result.
pt_list = [hit[0] for hit in point]
coord_list = [hit[0] for hit in coord]
loc_list = [hit[0] for hit in loc]

# All collected values in one list, plus their de-duplicated set.
ind_list = [*pt_list, *coord_list, *loc_list]
geo_de = set(ind_list)

# Report the size of each collection.
for label, values in (
    ('point', pt_list),
    ('coord', coord_list),
    ('loc', loc_list),
    ('set', geo_de),
):
    print(f'{label}: {len(values)}')

In [None]:
# GEOSPATIAL TABLES
# Each hit carries (position in super_list, table number, dict key) in its
# first three fields; render the table every hit points at.
count = 0
for hit in coord:
    table = super_list[hit[0]][hit[2]][hit[1]]
    display(HTML(table.to_html()))
    count += 1  # running total of rendered tables

In [None]:
# TEMPORAL
# Keywords searched for, in search order. The list drives the searches below so
# the two cannot drift apart (e.g. 'months' listed while "month" is searched).
keywords = ['time', 'date', 'day', 'month', 'year', 'founded']

# One search per keyword, keyed by the keyword itself.
temporal_hits = {word: fn.keyword_search(super_list, word) for word in keywords}

# Individual names used by the cells that follow.
timer = temporal_hits['time']
dater = temporal_hits['date']
dayer = temporal_hits['day']
monther = temporal_hits['month']
yearer = temporal_hits['year']
founder = temporal_hits['founded']

In [None]:
# Render every table found by the "month" search. The first three fields of a
# hit are the position in super_list, the table number and the dict key.
for hit in monther:
    table_number, article_key = hit[1], hit[2]
    tab = super_list[hit[0]][article_key][table_number]
    display(HTML(tab.to_html()))


In [None]:
# Report how many hits each temporal search produced.
for label, hits in (
    ('time', timer),
    ('date', dater),
    ('day', dayer),
    ('month', monther),
    ('year', yearer),
):
    print(f'{label}: {len(hits)}')

In [None]:
# Search "Properties ONLY" for Keywords
keywords = ["election"]

# Row positions of df_all['properties'] whose value contains at least one
# keyword; any() records each position once even if several keywords match.
# NOTE(review): these positions are used to index super_list below, which
# assumes df_all rows are in the same order as super_list -- confirm.
ind_holder = [
    ind
    for ind, prop in enumerate(df_all['properties'])
    if any(word in prop for word in keywords)
]

# Show the first table stored under every table key of each matching entry.
for ind in ind_holder:
    for key, tables in super_list[ind].items():
        # "url" and "properties" are not table keys, so they are skipped.
        # Skipping here (instead of remembering the last table key seen) avoids
        # reading a key that was never set, or one left over from an earlier entry.
        if key in ("url", "properties"):
            continue
        # Nothing to render when no table is stored under this key.
        if len(tables) == 0:
            continue
        display(HTML(tables[0].to_html()))