In [1]:
import json
import re
from urllib.parse import quote_plus

import pandas as pd
from pyquery import PyQuery as pq

In [2]:
def flatten(arr):
    """Concatenate the items of every inner iterable into one flat list.

    Only one level of nesting is removed; the order of the items is kept.
    """
    flat = []
    for sublist in arr:
        flat.extend(sublist)
    return flat
def clean_term(term):
    """Reduce the inner HTML of a table cell to plain, newline-separated text.

    Parameters
    ----------
    term : str or None
        Raw HTML of a cell; ``None`` is accepted and treated as an empty cell.

    Returns
    -------
    str
        The cell text with non-breaking spaces, asterisks, ``<div>``/``<p>``
        tags, whole ``<span>``/``<b>`` elements and ``"- "`` markers removed,
        ``<br>`` turned into a newline, and surrounding whitespace stripped.
        Returns ``""`` when ``term`` is ``None``.
    """
    if term is None:
        return ""
    term = term.replace("\xa0", "").replace("*", "")
    # Tag patterns stop at the first ">" ([^>]*) instead of running greedily to
    # the last ">" on the line, which used to delete the text between an
    # opening and a closing tag. Only the tags go; a newline separating two
    # adjacent <div> blocks stays, so each block ends up on its own line.
    term = re.sub(r"</div>|<div[^>]*>", "", term)
    # <span> and <b> elements are dropped together with their content; the
    # lazy .*? keeps text that sits between two such elements.
    term = re.sub(r"<span[^>]*>.*?</span>", "", term)
    term = re.sub(r"</*p/*>", "", term)
    term = re.sub(r"<b>.*?</b>", "", term)
    # Accept "<br>", "<br/>" and "<br />".
    term = re.sub(r"<br\s*/*>", "\n", term)
    term = term.replace("- ", "")
    return term.strip()
def fetch_terms(filename, multicol=0):
    """Collect the cleaned terms found in the table rows of ``data/<filename>``.

    Every ``<tr>`` except the first one is read (presumably the first is a
    header row -- confirm against the data files). The first ``<td>`` of each
    row is always used; when ``multicol`` is truthy, the ``<td>`` at that index
    is collected as well, after all first-column cells. Each cell goes through
    ``clean_term``, is split on newlines, and empty strings are discarded.

    Returns a flat list of term strings.
    """
    with open("data/"+filename) as handle:
        doc = pq(handle.read())

    body_rows = list(doc("tr").items())[1:]

    # Column 0 first, then the optional extra column, so the resulting order
    # is "all first-column terms, then all extra-column terms".
    wanted_columns = [0]
    if multicol:
        wanted_columns.append(multicol)

    cells = [
        clean_term(row("td").eq(column).html())
        for column in wanted_columns
        for row in body_rows
    ]
    return [
        piece
        for cell in cells
        if cell != ""
        for piece in cell.split("\n")
        if piece != ""
    ]
def gen_frame(filename, lesson, multicol=0):
    """Build a DataFrame with one row per term fetched from ``filename``.

    The frame has a ``term`` column holding the result of ``fetch_terms`` and
    a ``lesson`` column that repeats ``lesson`` on every row.
    """
    fetched = fetch_terms(filename, multicol)
    lesson_column = [lesson for _ in fetched]
    return pd.DataFrame({"term": fetched, "lesson": lesson_column})
def get_imgurls(term):
    """Look up image URLs for a term through a Google image search.

    The term is prefixed with ``"human anatomy "``, whitespace-normalised and
    URL-encoded before it is placed in the query string, so characters such as
    ``&``, ``#`` or ``/`` can no longer corrupt the request.

    Parameters
    ----------
    term : str
        Term to search for.

    Returns
    -------
    str
        The ``"ou"`` value of up to four ``.rg_meta`` elements of the result
        page, joined with ``";"``; an empty string when there are none.

    Notes
    -----
    Performs one HTTP request per call.
    NOTE(review): relies on the result page embedding JSON inside ``.rg_meta``
    elements -- confirm that markup is still being served.
    """
    uagent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36'
    # split()/join collapses runs of whitespace; quote_plus then turns the
    # remaining single spaces into "+" and percent-encodes everything else.
    query = quote_plus(" ".join(("human anatomy " + term).split()))
    url = "https://www.google.com/search?tbm=isch&source=lnms&q=" + query
    hits = pq(url, headers={'user-agent': uagent})(".rg_meta")
    # Only the first four hits are kept, so only those are JSON-decoded.
    top_hits = list(hits.items())[:4]
    return ";".join(json.loads(hit.html())["ou"] for hit in top_hits)

In [3]:
# One (file name, lesson title, extra term column) triple per scraped page;
# an extra column of 0 means only the first table column holds terms.
output = pd.concat([
    gen_frame(page, title, extra_column)
    for page, title, extra_column in [
        ("spine.html", "Spine", 0),
        ("thoracic_wall.html", "Thoracic Wall", 0),
        ("lungs.html", "Lungs", 3),
        ("heart.html", "Heart", 2),
        ("mediastinum.html", "Superior and Posterior Mediastinum", 3),
    ]
]).reset_index(drop=True)

# One image search per term -- this is the slow, network-bound step.
output["imgurls"] = [get_imgurls(entry) for entry in output["term"]]
output["id"] = output.index

# Persist the table as a JSON array with one object per row.
output.to_json("terms.json", orient="records")

In [4]:
# Display the assembled frame (columns: term, lesson, imgurls, id) as the cell output.
output

Unnamed: 0,lesson,term,imgurls,id
0,Spine,arachnoid mater,https://i.ytimg.com/vi/cakxjODx3q0/maxresdefau...,0
1,Spine,atlas,https://lh3.googleusercontent.com/wuh5ogaPq-PV...,1
2,Spine,axis,http://www.quinticsports.com/wp-content/upload...,2
3,Spine,body of vertebra,http://www.medicalook.com/systems_images/Verte...,3
4,Spine,cauda equina,https://ittcs.files.wordpress.com/2010/06/img_...,4
5,Spine,cervical vertebra,https://i.pinimg.com/736x/36/07/c2/3607c20eae8...,5
6,Spine,thoracic vertebra,http://www.likefigures.com/media/catalog/produ...,6
7,Spine,lumbar vertebra,https://img.medscapestatic.com/pi/meds/ckb/57/...,7
8,Spine,sacrum,http://www.medicalook.com/systems_images/Sacru...,8
9,Spine,conus medullaris,https://i.ytimg.com/vi/WXRjx_xX68w/maxresdefau...,9
