In [11]:
import pandas as pd
from tqdm import tqdm

In [12]:
tsv_file = 'data/101381_meta.tsv.gz'
df = pd.read_csv(
        tsv_file,
        compression="gzip",
        header=None,  # no header in tsv
        sep="\t",
        quotechar='"',
    )
df.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,サライ の 贈り物 movelot リン ベル 30800円 コース 紅梅 母 の 日 父 ...,0/101381/566732,[ のし 対応 包装 メッセージ カード 無料 ] のし 包装 に つい て メッセージ カ...,251187,10000346,566732,90011001000011011275,,http://thumbnail.image.rakuten.co.jp/@0_mall/r...,http://item.rakuten.co.jp/ringbellshop/y895-003,...,0,1,0,2,0,20110421134213,20190202220807,730770,66528,7086


In [13]:
img_urls = df.iloc[:,8]
print(img_urls[:5])
print(len(img_urls))

0    http://thumbnail.image.rakuten.co.jp/@0_mall/r...
1    http://thumbnail.image.rakuten.co.jp/@0_mall/o...
2    http://thumbnail.image.rakuten.co.jp/@0_mall/p...
3    http://thumbnail.image.rakuten.co.jp/@0_mall/s...
4    http://thumbnail.image.rakuten.co.jp/@0_mall/p...
Name: 8, dtype: object
9726


In [14]:
import os
import requests
import base64
from time import time as timer

In [15]:
def url2path(url):
    byte_str = base64.b64encode(url.encode())
    _str = byte_str.decode('utf-8')
    return _str

url2path(img_urls[0])

'aHR0cDovL3RodW1ibmFpbC5pbWFnZS5yYWt1dGVuLmNvLmpwL0AwX21hbGwvcmluZ2JlbGxzaG9wL2NhYmluZXQvY2F0ZS9zZXJhaS95ODk1LTAwMy5qcGc='

In [16]:
def fetch_url(url):
    path = 'data/imgs/' + url2path(url) + '.jpg' 
    if not os.path.exists(path):
        r = requests.get(url, stream=True)
        if r.status_code == 200:
            with open(path, 'wb') as f:
                for chunk in r:
                    f.write(chunk)
    return path

## Single process

In [17]:

start = timer()
for entry in tqdm(img_urls[:500]):
    fetch_url(entry)

print(f"\nElapsed Time: {timer() - start}")

100%|██████████| 500/500 [01:25<00:00,  5.88it/s]
Elapsed Time: 85.00655007362366



## ThreadPool to use 16 concurrent processes

In [18]:
from multiprocessing.pool import ThreadPool, Pool

# pool = ThreadPool(16)
# it looks that there has no difference of using Pool or ThreadPool
pool = Pool(16)

start = timer()
result_list = []
for res in tqdm(pool.imap_unordered(fetch_url, img_urls[501:1000]), total=500):
    result_list.append(res)

print(f"\nElapsed Time: {timer() - start}")

100%|█████████▉| 499/500 [00:10<00:00, 46.95it/s]
Elapsed Time: 10.645941019058228

