# Datascraping

Collect image URLs for the experiments using Bing.

## Concepts from paper

Concepts from Table 1 in [Measuring Semantic Similarity between Concepts in Visual Domain](http://ieeexplore.ieee.org/document/4665152/).  
Concepts correspond to classes in our paper.

In [1]:
# Concept names; each one is used both as the Bing search term and as the
# base name of the file its URLs are saved to.
concepts = [
    "bay",
    "beach",
    "birds",
    "boeing",
    "buildings",
    "city",
    "clouds",
    "face",
    "f-16",
    "helicopter",
    "mountain",
    "sky",
    "ships",
    "sunset",
    "sunrise",
    "ocean"
]

In [2]:
# Sanity check: number of concepts defined above.
len(concepts)

16

## Collect urls of images using Bing

In [3]:
import json
import time

In [4]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

In [5]:
# Launch options for the Chrome instance driven by Selenium.
chrome_options = Options()
# Location of the Chrome binary to launch (specific to this machine).
chrome_options.binary_location = '/usr/bin/google-chrome-stable'
# Run Chrome without opening a visible browser window.
chrome_options.add_argument("--headless")

# chrome_options.add_argument('--disable-gpu')

# fix for chrome not reachable
chrome_options.add_argument('--no-sandbox')

In [6]:
# Bing image-search URL; the `{}` placeholder is filled with the search term.
URL_TEMPLATE = "http://www.bing.com/images/search?q={}&FORM=IGRE"

In [7]:
# Start a Chrome session through the chromedriver binary at the given path,
# using the options configured in `chrome_options`. The functions in this
# notebook all operate on this single shared `driver`.
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path="/usr/local/bin/chromedriver")
# Enlarge the browser window to its maximum size.
driver.maximize_window()

Define several functions.

In [8]:
def find_index(atags, hval):
    """Return the index of the first element whose ``h`` attribute equals ``hval``.

    ``atags`` is a sequence of objects exposing ``get_attribute``.
    Returns -1 when no element matches.
    """
    # Lazy generator: attributes are only read up to the first match.
    matching_positions = (
        position
        for position, tag in enumerate(atags)
        if tag.get_attribute('h') == hval
    )
    return next(matching_positions, -1)

In [9]:
def get_url(atag):
    """Return the ``murl`` value from the JSON stored in the element's ``m`` attribute."""
    metadata = json.loads(atag.get_attribute("m"))
    return metadata['murl']

def scroll_to(elem):
    """Move the pointer of the shared ``driver`` onto ``elem``.

    The collection loop calls this on the last result element after each
    batch of URLs has been read.
    """
    pointer_actions = ActionChains(driver)
    pointer_actions.move_to_element(elem)
    pointer_actions.perform()
    
def find_all_image_atags():
    """Return all elements of the current page that carry the CSS class ``iusc``.

    The rest of this notebook treats these elements as the image results:
    their ``m`` attribute is parsed for the image URL and their ``h``
    attribute is used to remember the last element already processed.

    Returns:
        A list of Selenium web elements (empty when nothing matches).
    """
    # Imported locally so this cell stays self-contained.
    from selenium.webdriver.common.by import By

    # The generic ``find_elements(by, value)`` form is used instead of the
    # ``find_elements_by_class_name`` helper, which newer Selenium releases
    # no longer provide; the generic form works on old and new releases.
    return driver.find_elements(By.CLASS_NAME, "iusc")

In [10]:
def get_urls(atags, after_hval=None):
    """Extract image URLs from ``atags``, optionally skipping already-seen ones.

    Args:
        atags: Sequence of result elements (objects exposing
            ``get_attribute``), in page order.
        after_hval: ``h`` attribute value of the last element handled by a
            previous call. When given, only elements located after it are
            read. ``None`` means "start from the first element".

    Returns:
        A tuple ``(lasthval, urls)`` where ``lasthval`` is the ``h``
        attribute of the last element in ``atags`` (``None`` when ``atags``
        is empty) and ``urls`` is the list of URLs of the elements read.

    Raises:
        ValueError: If ``after_hval`` is given but no element in ``atags``
            carries that value.
    """
    begin = 0
    if after_hval is not None:
        # find_index returns -1 when missing, so a result of 0 here means
        # "not found" rather than "found at the first position".
        begin = find_index(atags, after_hval) + 1
        if begin == 0:
            raise ValueError("No hval found: {}".format(after_hval))
    res = [get_url(atag) for atag in atags[begin:]]
    # An empty page has no last element; report None instead of failing
    # with an IndexError so the caller can treat it as "no new URLs".
    lasthval = atags[-1].get_attribute('h') if atags else None
    return (lasthval, res)

In [11]:
from selenium.common.exceptions import NoSuchElementException

def collect_all_urls(limit=100):
    """Collect image URLs from the results page currently loaded in ``driver``.

    Repeatedly reads the result elements, appends the URLs that were not
    seen in the previous pass, and moves to the last element so the page can
    load more. When a pass yields nothing new, the 'See more images' link is
    clicked once and the page is read again.

    Args:
        limit: Stop once at least this many URLs have been gathered. Whole
            batches are appended, so the result may hold more than ``limit``
            entries. The same number also bounds the number of passes.

    Returns:
        The list of URLs gathered so far, in page order. It may be shorter
        than ``limit`` when the page runs out of results or no progress is
        made.
    """
    # Imported locally so this cell stays self-contained.
    from selenium.webdriver.common.by import By

    res_list = []
    last_hval = None
    loop_count = 0
    while len(res_list) < limit:
        atags = find_all_image_atags()
        last_hval_cand, urls = get_urls(atags, last_hval)
        if not urls:
            # Nothing new was loaded by moving to the last element: ask the
            # page for more results explicitly.
            try:
                # Generic locator form; the ``find_element_by_*`` helpers are
                # not available in newer Selenium releases.
                driver.find_element(By.PARTIAL_LINK_TEXT, 'See more images').click()
            except NoSuchElementException:
                print("reach to last image")
                return res_list
            time.sleep(3)
            atags = find_all_image_atags()
            last_hval_cand, urls = get_urls(atags, last_hval)
            if not urls:
                print("No new image urls. Somehing wrong happens")
                return res_list
        # Only advance the marker once new URLs were actually obtained.
        last_hval = last_hval_cand
        res_list.extend(urls)
        scroll_to(atags[-1])
        # Safety net against looping forever on a page that keeps yielding
        # tiny batches.
        if loop_count > limit:
            print("Too much loop, something wrong happens.")
            return res_list
        loop_count += 1
        print(len(res_list))
        time.sleep(1)
    return res_list

In [12]:
def collect_1100_images(cat):
    """Open the Bing image search for ``cat`` and collect about 1100 image URLs.

    Args:
        cat: Search term (concept name).

    Returns:
        The list of URLs returned by ``collect_all_urls(1100)``; it can be
        shorter when Bing offers fewer results.
    """
    # Imported locally so this cell stays self-contained.
    from urllib.parse import quote_plus

    # URL-encode the term so that spaces, '&', '#', etc. cannot corrupt the
    # query string. Plain terms such as "bay" or "f-16" are left unchanged.
    top_url = URL_TEMPLATE.format(quote_plus(cat))
    driver.get(top_url)
    # Short pause before reading the page (presumably to let it render).
    time.sleep(1)
    return collect_all_urls(1100)

In [13]:
def save_urls(urls, category):
    """Write ``urls`` to ``urls/<category>.txt``, one URL per line (UTF-8).

    The ``urls`` directory is created when it does not exist yet, so a long
    scraping run is not lost to a missing output directory. An existing file
    for the same category is overwritten.

    Args:
        urls: Iterable of URL strings.
        category: Base name of the output file.
    """
    # Imported locally so this cell stays self-contained.
    import os

    os.makedirs("urls", exist_ok=True)
    with open("urls/{}.txt".format(category), "w", encoding='utf8') as f:
        f.writelines(url + "\n" for url in urls)

### Collect image urls

In [14]:
# For every concept: scrape its image URLs and write them to urls/<concept>.txt.
# collect_all_urls prints the running URL count after each pass.
for cat in concepts:
    urls = collect_1100_images(cat)
    save_urls(urls, cat)

35
70
105
140
175
210
245
280
315
350
385
420
455
490
525
560
595
630
665
700
735
770
805
840
875
910
945
970
reach to last image
35
70
105
140
175
210
245
280
315
350
385
420
455
490
525
560
595
630
665
700
735
770
805
840
875
910
945
980
1000
reach to last image
35
70
105
140
175
210
245
280
315
350
385
420
455
490
525
560
595
630
665
700
735
770
805
840
875
910
945
980
992
reach to last image
35
70
105
140
175
210
245
280
315
350
385
420
455
490
525
560
595
630
665
700
735
770
792
reach to last image
35
70
105
140
175
210
245
280
315
350
385
420
455
490
525
560
595
630
665
700
735
770
805
840
875
886
reach to last image
35
70
105
140
175
210
245
280
315
350
385
420
455
490
525
560
595
630
665
700
735
770
805
840
875
910
945
980
996
reach to last image
35
70
105
140
175
210
245
280
315
350
385
420
455
490
525
560
595
630
665
700
735
770
805
839
reach to last image
35
70
105
140
175
210
245
280
315
350
385
420
455
490
525
560
595
630
665
700
735
770
805
840
875
910
945
980
989
reach t

### Remove duplicates

In [15]:
def read_urls(cat):
    """Load ``urls/<cat>.txt`` and return its lines without the trailing newline."""
    path = "urls/{}.txt".format(cat)
    with open(path, encoding='utf8') as url_file:
        raw_lines = url_file.readlines()
    stripped = []
    for raw in raw_lines:
        # Each line carries at most one trailing newline.
        stripped.append(raw.rstrip('\n'))
    return stripped

In [16]:
def uniq_list(urllist):
    """Return the items of ``urllist`` with duplicates dropped, keeping first-seen order."""
    seen = set()
    unique = []
    for candidate in urllist:
        if candidate in seen:
            continue
        seen.add(candidate)
        unique.append(candidate)
    return unique

In [17]:
# Rewrite every saved URL file in place with duplicate entries removed
# (the first occurrence of each URL is kept, order is preserved).
for cat in concepts:
    urls = read_urls(cat)
    new_urls = uniq_list(urls)
    save_urls(new_urls, cat)

Check using a single class.

In [18]:
# Reload one class's saved URL file to verify the de-duplication.
urls = read_urls("birds")

In [19]:
# Number of URLs stored in the file.
len(urls)

992

In [20]:
# Number of distinct URLs; equal to len(urls) when the file holds no duplicates.
len(uniq_list(urls))

992