In [None]:
from sqlalchemy import create_engine
from dotenv import load_dotenv
import psycopg2
import sys, os
import numpy as np
import pandas as pd
import pandas.io.sql as psql
from pathlib import Path
from urllib.request import urlopen
from urllib.request import urlretrieve
from tqdm import tnrange, tqdm, tqdm_notebook

In [None]:
# Read the analytics database settings from the environment (a local .env file
# is merged in by load_dotenv) so that no credential is hard-coded here.
load_dotenv()
user = os.getenv('ANALYTIC_DB_USERNAME')
password = os.getenv('ANALYTIC_DB_PASSWORD')
host = os.getenv('ANALYTIC_DB_HOSTNAME')
db = os.getenv('ANALYTIC_DB_DATABASE')

# Fail here with a readable message: an unset variable would otherwise end up
# as the literal text "None" inside the connection URL.
missing_settings = [
    name
    for name, value in (
        ('ANALYTIC_DB_USERNAME', user),
        ('ANALYTIC_DB_PASSWORD', password),
        ('ANALYTIC_DB_HOSTNAME', host),
        ('ANALYTIC_DB_DATABASE', db),
    )
    if value is None
]
if missing_settings:
    raise RuntimeError('Missing environment variables: ' + ', '.join(missing_settings))

In [None]:
from urllib.parse import quote_plus

# User name and password are percent-encoded because characters such as
# '@', '/' or ':' would otherwise be parsed as URL delimiters.
connectionString = 'postgresql+psycopg2://{}:{}@{}:5432/{}'.format(
    quote_plus(user), quote_plus(password), host, db)

In [None]:
# create_engine only builds the engine object; no connection is opened until
# the first query runs. Only host and database name are echoed so the
# password never lands in the saved notebook output.
engine = create_engine(connectionString)
print(f'Engine created for database {db!r} on host {host!r}')

In [None]:
# One output row per element of details->'images': its caption and its
# thumbnail URL. Adjacent string literals are concatenated by Python, so the
# SQL text sent to the server is a single line.
query = (
    "select json_array_elements(details->'images')->>'caption' as caption, "
    "json_array_elements(details->'images')->>'thumbnailUrl' as image "
    "from eps_properties "
    "where details is not null and details->'images' is not null "
    "limit 200000"
)

In [None]:
# With chunksize set, read_sql returns an iterator of DataFrames
# (up to 2000 rows each) instead of one big frame.
sql_reader = pd.read_sql(sql=query, con=engine, chunksize=2000)

In [None]:
# Append the query result to images.csv chunk by chunk, lower-casing captions.
# The header is written once, and only when the file is new or empty; writing
# it with every chunk would scatter "caption,image" rows through the data.
# NOTE: the file is opened in append mode, so re-running this cell against an
# existing images.csv adds the rows again.
csv_path = 'images.csv'
write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
# The encoding is set on open(): to_csv ignores its own `encoding` argument
# when it is handed an already-open file object.
with open(csv_path, 'a', newline='\n', encoding='utf-8') as f:
    for chunk in sql_reader:
        chunk['caption'] = chunk['caption'].str.lower()
        chunk.to_csv(f, header=write_header, index=False)
        write_header = False

In [None]:
# Load the exported caption/URL pairs back into memory if the export exists.
# Both columns are read as plain strings; `str` replaces np.character, whose
# use as a dtype is deprecated in NumPy. UTF-8 matches the encoding used for
# every other CSV this notebook reads or writes.
preprocessed_data = False
if os.path.exists('images.csv'):
    csv_data = pd.read_csv(
        'images.csv',
        usecols=['caption', 'image'],
        dtype={'caption': str, 'image': str},
        encoding='utf-8',
    )
    preprocessed_data = True

In [None]:
# Keep only the rows that actually carry a caption, then preview the result.
has_caption = csv_data['caption'].notna()
csv_data = csv_data[has_caption]
csv_data.head()

In [None]:
# Captions become directory names further down, so '/' and ' ' are both
# mapped to '_' before the distinct captions are collected.
replaces = dict.fromkeys('/ ', '_')
caption_table = str.maketrans(replaces)
csv_data['caption'] = csv_data['caption'].str.translate(caption_table)
# unique() already removes duplicates; sorted() returns them as an ordered list.
captions = sorted(csv_data['caption'].unique())
print(captions)

In [None]:
# Show the (rows, columns) of csv_data at this point in the notebook.
csv_data.shape

In [None]:
# Persist the cleaned caption/URL table. The output directory is created
# first: to_csv does not create missing parent directories.
Path('./hotel_images').mkdir(parents=True, exist_ok=True)
csv_data.to_csv('./hotel_images/images.csv', index=False, encoding='utf-8')

In [None]:
# Create one directory per caption under each of the three dataset splits.
for idx, caption in enumerate(captions):
    print("index is %d and value is %s" % (idx, caption))
    for split in ('train', 'test', 'valid'):
        Path(f'./hotel_images/{split}/{caption}').mkdir(parents=True, exist_ok=True)

In [None]:
from numpy.random import RandomState

In [None]:
# Show the (rows, columns) of csv_data right before it is split.
csv_data.shape

In [None]:
# A fixed seed makes the train/validation/test split reproducible across
# kernel restarts; change SPLIT_SEED to draw a different split.
SPLIT_SEED = 42
randomState = RandomState(SPLIT_SEED)
# 60% of all rows go to the training set.
train_data = csv_data.sample(frac=0.6, random_state=randomState)
train_data.shape

In [None]:
# Everything that was not sampled into the training set.
other_than_train_data = csv_data.drop(index=train_data.index)
other_than_train_data.shape

In [None]:
# Half of the rows left over after the training sample become the validation
# set; the draw uses the same randomState object as the training sample.
validation_data = other_than_train_data.sample(frac=0.5, random_state=randomState)
validation_data.shape

In [None]:
# The rows that are in neither the training nor the validation set.
test_data = other_than_train_data.drop(index=validation_data.index)
test_data.shape

In [None]:
# Write each split's caption/URL table next to its image directories.
for split_name, split_frame in (
    ('train', train_data),
    ('test', test_data),
    ('valid', validation_data),
):
    split_frame.to_csv('./hotel_images/' + split_name + '/images.csv', index=False, encoding='utf-8')

### Please clean up all the code below this point: download_images.py is now the latest tool for downloading images

In [None]:
def load_data(group_type):
    """Read the caption/URL table of one dataset split.

    Parameters
    ----------
    group_type : str
        Name of the split directory under ``./hotel_images``
        (the notebook uses 'train', 'test' and 'valid').

    Returns
    -------
    pandas.DataFrame
        The ``caption`` and ``image`` columns of
        ``./hotel_images/<group_type>/images.csv``, both read as strings.
    """
    # `str` replaces np.character, whose use as a dtype is deprecated in NumPy.
    return pd.read_csv(
        './hotel_images/' + group_type + '/images.csv',
        usecols=['caption', 'image'],
        dtype={'caption': str, 'image': str},
        encoding='utf-8',
    )

In [None]:
# Reload the three splits from the CSV files written earlier in the notebook.
train_data, test_data, validation_data = map(load_data, ('train', 'test', 'valid'))

In [None]:
def download_photo(img_url, group_type, category, filename):
    """Download one image into ``./hotel_images/<group_type>/<category>/``.

    Parameters
    ----------
    img_url : str
        URL of the image to fetch.
    group_type : str
        Dataset split directory (the notebook uses 'train', 'test', 'valid').
    category : str
        Caption directory inside the split.
    filename : str
        Name of the file to write.

    Returns
    -------
    bool
        True when the file was saved, False when the download or the write
        failed (the URL and the reason are printed).
    """
    # Same root as the directories created earlier in the notebook
    # ('./hotel_images', not './hotelImages').
    file_path = "./hotel_images/%s/%s/%s" % (group_type, category, filename)
    print(file_path)
    try:
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)
        urlretrieve(img_url, file_path)
    except Exception as exc:
        # Best effort: one bad URL must not abort a bulk download. Catching
        # Exception (not a bare except) lets Ctrl-C still interrupt the run.
        print("%s (%s)" % (img_url, exc))
        return False
    return True

In [None]:
# Remove rows whose image value is the literal text 'image', i.e. CSV header
# lines that ended up among the data rows.
train_data = train_data.loc[train_data['image'].ne('image')]
test_data = test_data.loc[test_data['image'].ne('image')]
validation_data = validation_data.loc[validation_data['image'].ne('image')]

In [None]:
# Sanity check: print every training image value that has no '/' in it, since
# the download code derives the file name from the part after the last '/'.
# Missing values count as "no '/'" (na=False) and are printed instead of
# raising an AttributeError.
image_has_slash = train_data['image'].str.contains('/', regex=False, na=False)
for suspicious_url in train_data.loc[~image_has_slash, 'image']:
    print(suspicious_url)

In [None]:
from tqdm import tnrange, tqdm, tqdm_notebook
import multiprocessing as mp
from multiprocessing import Pool
# Display the CPU count reported by multiprocessing; the pool cells below
# derive their worker count from it.
mp.cpu_count()

In [None]:
# def download_image_of(row, group):
#     image_name = row['image'].rsplit('/', 1)
#     download_photo(img_url='http:' + row['image'], group_type=group, category=row['caption'], filename= image_name[1])  

In [None]:
# chunk_size = int(test_data.shape[0]/4)
# chunks = [test_data.ix[test_data.index[i:i + chunk_size]] for i in range(0, test_data.shape[0], chunk_size)]
# pool = Pool(4)
# result = pool.map(func, chunks)

In [None]:
def download_photo_row(index, row, group_type='valid'):
    """Download the image referenced by one DataFrame row.

    Parameters
    ----------
    index : int
        Row label; it prefixes the saved file name.
    row : mapping
        Must provide ``row['image']`` (URL without the ``http:`` scheme
        prefix) and ``row['caption']`` (used as the category directory).
    group_type : str, optional
        Dataset split to save into. Defaults to 'valid', which was
        previously hard-coded.

    Returns
    -------
    bool
        Whatever ``download_photo`` returns for this image.
    """
    image_url = row['image']
    # [-1] rather than [1]: a URL without any '/' then yields the whole string
    # as base name instead of raising IndexError.
    base_name = image_url.rsplit('/', 1)[-1]
    return download_photo(
        img_url='http:' + image_url,
        group_type=group_type,
        category=row['caption'],
        filename="%d.%s" % (index, base_name),
    )

def progressive_download_photo(pbar, index, row):
    """Download one row's image and advance the progress bar by one step.

    Parameters
    ----------
    pbar : object with ``update(n)``
        Progress bar to advance (a tqdm instance in this notebook).
    index, row
        Passed through to ``download_photo_row``.

    Returns
    -------
    The result of ``download_photo_row`` so that callers can count failures
    (it used to be discarded).
    """
    succeeded = download_photo_row(index, row)
    pbar.update(1)
    return succeeded
    
def progressive_download_test2(index, row, p):
    """Dry-run worker: print ``index``, advance the bar, return ``index``.

    Parameters
    ----------
    index
        Value that is printed and returned.
    row
        Accepted for signature compatibility; not used.
    p : object with ``update(n)``
        Progress bar to advance.
    """
    print(index)
    # Advance the bar that was passed in; the body used to reference a
    # global name `pbar` and ignored this parameter.
    p.update(1)
    return index

def progressive_download_test(pbar, index, row):
    """Advance ``pbar`` by one step and return ``index`` unchanged.

    ``row`` is accepted but not used; nothing is downloaded.
    """
    pbar.update(1)
    return index

https://leimao.github.io/blog/Python-tqdm-Multiprocessing/

In [None]:
from functools import partial
from multiprocessing.pool import ThreadPool

# Dry run of the parallel loop on the first 10 validation rows.
# * A thread pool is used instead of a process pool: worker threads share the
#   progress bar with this cell, whereas a bar handed to worker processes
#   cannot update the one displayed here.
# * starmap unpacks each (index, row) tuple into two arguments; map would
#   pass the tuple as a single argument and the worker call would fail.
# * The bar's total matches the number of rows actually submitted.
# * The `with` block shuts the pool down when the work is done.
worker_count = max(1, mp.cpu_count() - 2)
tqdm.pandas(desc="processing in progress ... %d" % (mp.cpu_count()))
sample_rows = list(validation_data.head(10).iterrows())

with ThreadPool(processes=worker_count) as pool, tqdm(total=len(sample_rows)) as pbar:
    download_row = partial(progressive_download_test2, p=pbar)
    pool.starmap(download_row, sample_rows)

In [None]:
from functools import partial
from multiprocessing.pool import ThreadPool

# Download every validation image in parallel.
# * A thread pool is used instead of a process pool: worker threads share the
#   progress bar with this cell, whereas a bar handed to worker processes
#   cannot update the one displayed here.
# * starmap unpacks each (index, row) tuple into two arguments; map would
#   pass the tuple as a single argument and the worker call would fail.
# * The `with` block shuts the pool down when the work is done.
worker_count = max(1, mp.cpu_count() - 2)
tqdm.pandas(desc="processing in progress ... %d" % (mp.cpu_count()))
validation_rows = list(validation_data.iterrows())

with ThreadPool(processes=worker_count) as pool, tqdm(total=len(validation_rows)) as pbar:
    download_row = partial(progressive_download_photo, pbar)
    pool.starmap(download_row, validation_rows)

In [None]:
tqdm.pandas(desc="processing in progress ...")

# Sequential download of every validation image with a progress bar.
# len(validation_data) gives the row count directly; there is no need to
# materialise all rows in a list just to count them.
with tqdm(total=len(validation_data)) as pbar:
    for index, row in validation_data.iterrows():
        # [-1] rather than [1]: a URL without any '/' then yields the whole
        # string as base name instead of raising IndexError.
        base_name = row['image'].rsplit('/', 1)[-1]
        download_photo(img_url='http:' + row['image'], group_type='valid', category=row['caption'], filename="%d.%s" % (index, base_name))
        pbar.update(1)

In [None]:
# tqdm.pandas(desc="processing in progress ...")

# train_data.progress_apply(lambda row: download_photo(img_url='http:' + row['image'], group_type='test', category=row['caption'], filename= row['image'].rsplit('/', 1)[1]))


In [None]:
# train_data.progress_apply(lambda x: download_image_of(x, 'train'))

# Download every training image. The call now has its closing parenthesis
# (the cell was a SyntaxError) and saves into the 'train' split; training
# rows were previously sent to 'test'.
for index, row in train_data.iterrows():
    # [-1] rather than [1]: a URL without any '/' then yields the whole
    # string as base name instead of raising IndexError.
    base_name = row['image'].rsplit('/', 1)[-1]
    download_photo(img_url='http:' + row['image'], group_type='train', category=row['caption'], filename="%d.%s" % (index, base_name))

https://www.tensorflow.org/tutorials/images/classification