In [None]:
from sqlalchemy import create_engine
from dotenv import load_dotenv
import psycopg2
import sys, os
import numpy as np
import pandas as pd
import pandas.io.sql as psql
from pathlib import Path
from urllib.request import urlopen
from urllib.request import urlretrieve
from tqdm import tnrange, tqdm, tqdm_notebook

In [None]:
# !pip install sqlalchemy
# !pip install python-dotenv
# !pip install psycopg2
# !pip install pandas

In [None]:
# Read the database connection settings from environment variables
# (load_dotenv() is called first so values from a .env file are available).
load_dotenv()
user, password, host, db = (
    os.environ.get(name)
    for name in ('HS_DB_USERNAME', 'HS_DB_PASSWORD', 'HS_DB_HOSTNAME', 'HS_DB_DATABASE')
)

In [None]:
# Percent-encode the credentials before embedding them in the URL; otherwise a
# password containing '@', ':' or '/' changes how the URL is parsed.
from urllib.parse import quote_plus

connectionString = 'postgresql+psycopg2://{}:{}@{}:5432/{}'.format(
    quote_plus(str(user)), quote_plus(str(password)), host, db)

In [None]:
engine = create_engine(connectionString)
# Never print connectionString itself: it contains the database password and
# notebook outputs are saved with the file. Only the host/port/database are shown.
# create_engine() does not open a connection yet, so the message says "created".
print(f'Engine created for {engine.url.host}:{engine.url.port}/{engine.url.database}')

The next cell selects the image URLs from our database. 'thumbnailUrl' is the small 70x70 image; here we use the full-size 'url' instead, to see whether bigger images help improve the training accuracy.

In [None]:
# Alternative query that selects the small thumbnails instead of the full-size images:
# query = "select json_array_elements(details->'images')->>'caption' as caption, json_array_elements(details->'images')->>'thumbnailUrl' as image from eps_properties where details is not null and details->'images' is not null limit 200000"

# One output row per element of details->'images': its caption and its full-size url.
query = (
    "select json_array_elements(details->'images')->>'caption' as caption, "
    "json_array_elements(details->'images')->>'url' as image "
    "from eps_properties "
    "where details is not null and details->'images' is not null "
    "limit 20000"
)

In [None]:
# df_models = pd.read_sql(query, con=conn)
# With chunksize set, read_sql returns an iterator of DataFrames (2000 rows each)
# instead of loading the whole result at once.
sql_reader = pd.read_sql(sql=query, con=engine, chunksize=2000)

In [None]:
# File name of the caption/url CSV and the root folder for the downloaded images.
image_csv_out = 'images.csv'
image_base_path = './m_images/'
# exist_ok=True makes this cell safe to re-run without a separate existence check.
os.makedirs(image_base_path, exist_ok=True)

In [None]:

# image_base_path = './hotel_images/'
# Append the query result to image_csv_out chunk by chunk.
# `exist` is True when the CSV is new (or empty), i.e. when a header row is needed.
exist = not os.path.exists(image_csv_out) or os.path.getsize(image_csv_out) == 0
# The header must be written once, before the first chunk only; passing the same
# flag for every chunk would repeat "caption,image" in the middle of the data.
write_header = exist
# NOTE(review): the file is opened with the platform default encoding; the
# encoding argument of to_csv is not applied when a text handle is passed.
# NOTE(review): mode 'a' means re-running this cell appends the same rows again.
with open(image_csv_out, 'a', newline='\n') as f:
    for chunk in sql_reader:
        chunk['caption'] = chunk['caption'].str.lower()
        chunk.to_csv(f, header=write_header, index=False, encoding='utf-8')
        write_header = False

In [None]:
# Load the caption/url CSV if it exists; preprocessed_data records whether it did.
preprocessed_data = False
if os.path.exists(image_csv_out):
    # Both columns are read as plain strings. `str` replaces np.character, an
    # abstract NumPy scalar type whose use as a dtype is deprecated.
    # NOTE(review): 'latin1' may not match the encoding the file was written with - confirm.
    csv_data = pd.read_csv(
        image_csv_out,
        usecols=['caption', 'image'],
        dtype={'caption': str, 'image': str},
        encoding='latin1',
    )
    preprocessed_data = True

In [None]:
# Keep only the rows that have a caption, then preview the first rows.
csv_data = csv_data[csv_data['caption'].notna()]
csv_data.head()

In [None]:
# Replace '/' and ' ' in captions with '_' (captions are used as directory names later).
replaces = {'/' : '_', ' ' : '_'}
csv_data['caption'] = csv_data['caption'].str.translate(str.maketrans(replaces))
# Sorted list of the distinct captions after the replacement.
captions = sorted(set(csv_data['caption'].unique()))
print(captions)

In [None]:
# Display (rows, columns) of the caption table.
csv_data.shape

In [None]:
# Save the cleaned caption/url table inside the image root folder.
csv_data.to_csv(f'{image_base_path}{image_csv_out}', index=False, encoding='utf-8')

Set up the train, test and valid directories (one sub-directory per caption) that will store the corresponding images.

In [None]:
# Create one sub-directory per caption under each of the train/test/valid folders.
for idx, caption in enumerate(captions):
    print("index is %d and value is %s" % (idx, caption))
    for split_dir in ('train/', 'test/', 'valid/'):
        Path(image_base_path + split_dir + caption).mkdir(parents=True, exist_ok=True)

In [None]:
from numpy.random import RandomState

In [None]:
# Display (rows, columns) of the caption table before it is split.
csv_data.shape

In [None]:
# Seed the generator so the train/valid/test split is the same after every
# kernel restart; an unseeded RandomState() gives a different split each run.
randomState = RandomState(42)
# 60% of the rows go to the training set.
train_data = csv_data.sample(frac=0.6, random_state=randomState)
train_data.shape

In [None]:
# Rows whose index label was not drawn into the training sample.
other_than_train_data = csv_data[~csv_data.index.isin(train_data.index)]
other_than_train_data.shape

In [None]:
# Half of the non-training rows become the validation set; the sample is drawn
# with the same randomState object that was used for the training sample.
validation_data = other_than_train_data.sample(frac=0.5, random_state=randomState)
validation_data.shape

In [None]:
# The test set is whatever remains after removing the validation rows.
test_data = other_than_train_data[~other_than_train_data.index.isin(validation_data.index)]
test_data.shape

In [None]:
# Write each split's caption/url table to images.csv inside that split's folder.
for split_dir, split_frame in (('train', train_data), ('test', test_data), ('valid', validation_data)):
    split_frame.to_csv(image_base_path + split_dir + '/images.csv', index=False, encoding='utf-8')

### download_images.py is the latest tool to download images

Now we need to use download_images.py to download all the images listed in the images.csv under each prepared directory.
The script has to be run from the command prompt because its async processing can't be run inside a notebook.

https://leimao.github.io/blog/Python-tqdm-Multiprocessing/

https://www.tensorflow.org/tutorials/images/classification

### Reorganize folders

Collect all folders with more than 100 images into the category_images directory; the rest of the images are put under the unknown category.



In [None]:
import shutil
import os, csv
from pathlib import Path

try:
    from distutils.dir_util import copy_tree
except ImportError:
    # distutils was removed from the standard library in Python 3.12 (PEP 632);
    # provide the same name on top of shutil so the cells below keep working.
    def copy_tree(src, dst):
        """Copy the directory tree `src` into `dst`, merging into `dst` if it already exists."""
        return shutil.copytree(src, dst, dirs_exist_ok=True)

# Root of the downloaded images (one folder per split, then one per caption).
image_base_path = './m_images/'
# Root of the re-organised copy that only holds the selected categories.
image_cat_path = './category_m_images/'
# Split folder names that exist under both roots.
path_keys = ['valid','train','test']
# Threshold used to decide whether a category has enough images.
VALID_COUNT = 100

def scan_file_count(path_key, image_dict):
    """Count the files in every category folder of one split.

    For each immediate sub-directory of ``image_base_path + path_key`` the
    number of regular files directly inside it is stored as
    ``image_dict[<dir name>][path_key]``. The entry ``{'name': <dir name>}``
    is created the first time a directory name is seen.

    Args:
        path_key: Name of the split folder under ``image_base_path``.
        image_dict: Dict keyed by directory name; updated in place.

    Returns:
        None. A split folder that does not exist contributes nothing.
    """
    split_dir = os.path.join(image_base_path, path_key)
    if not os.path.isdir(split_dir):
        return
    # Only the first directory level is scanned. Walking recursively while
    # building paths as "<split>/<dir name>" fails for any nested directory.
    # Sorting makes the insertion order into image_dict deterministic.
    for name in sorted(os.listdir(split_dir)):
        category_dir = os.path.join(split_dir, name)
        if not os.path.isdir(category_dir):
            continue
        count = sum(
            1 for entry in os.listdir(category_dir)
            if os.path.isfile(os.path.join(category_dir, entry))
        )
        image_dict.setdefault(name, {'name': name})[path_key] = count
            
def valid_category(item, path_keys):
    """Return the image count that decides whether a category is usable.

    The splits are checked in the order given by ``path_keys``. The count of
    the first split that does not have more than ``VALID_COUNT`` images is
    returned; if every split has more than ``VALID_COUNT`` images, the count
    of the last split is returned. Callers treat a result greater than
    ``VALID_COUNT`` as "category is valid", so the comparison here uses the
    same strict threshold.

    Args:
        item: Per-category dict holding one count per split key.
        path_keys: Split keys to check.

    Returns:
        int: The deciding count. A split key missing from ``item`` counts as
        0, and an empty ``path_keys`` yields 0.
    """
    count = 0
    for path_key in path_keys:
        # A category that is absent from a split has no count recorded for it.
        count = int(item.get(path_key) or 0)
        if count <= VALID_COUNT:
            return count
    return count

def new_file_name(src_file):
    """Build the flat name '<parent directory>_<file name>' for a path.

    Both '\\' and '/' are accepted as separators, so the function works for
    Windows and POSIX style paths alike.

    Args:
        src_file: Path (str or path-like object) of the source file.

    Returns:
        str | None: The combined name, or None when the path has fewer than
        two components.
    """
    pieces = str(src_file).replace('\\', '/').split('/')
    if len(pieces) < 2:
        return None
    return pieces[-2] + '_' + pieces[-1]
   
def destination_file(src_file):
    """Return the file name (last component) of a path.

    Both '\\' and '/' are accepted as separators, so the function works for
    Windows and POSIX style paths alike.

    Args:
        src_file: Path (str or path-like object) of the source file.

    Returns:
        str | None: The last path component, or None when the path has fewer
        than two components.
    """
    pieces = str(src_file).replace('\\', '/').split('/')
    if len(pieces) < 2:
        return None
    return pieces[-1]
    
def copy_category(item, path_keys, count):
    """Copy one category's folders from the image root to the category root.

    For every split in ``path_keys``:
      * ``count > VALID_COUNT``: the folder is copied under its own name;
      * ``0 < count <= VALID_COUNT``: the folder is copied into 'unknown';
      * otherwise nothing is copied.

    Args:
        item: Per-category dict; only its 'name' entry is used.
        path_keys: Split folder names to process.
        count: Deciding image count for the category.

    Returns:
        None. Progress is reported with print().
    """
    name = item.get('name')
    for path_key in path_keys:
        fromDirectory = image_base_path + path_key + '/' + name
        if count > VALID_COUNT:
            toDirectory = image_cat_path + path_key + '/' + name
        elif count > 0:
            # NOTE: several categories share this folder, so files with the same
            # name overwrite each other; new_file_name() builds names prefixed
            # with the source folder if collisions must be avoided.
            toDirectory = image_cat_path + path_key + '/unknown'
        else:
            # fromDirectory already contains the split and category name.
            print(fromDirectory, 'no copy')
            continue
        if not os.path.isdir(fromDirectory):
            # The category does not exist in this split; skip it instead of
            # letting copy_tree fail and abort the whole run.
            print(fromDirectory, 'missing, no copy')
            continue
        print(fromDirectory, toDirectory)
        copy_tree(fromDirectory, toDirectory)
        
        
def copy_category_with_same_image_count(item, path_keys, count, limit):
    """Copy at most ``limit`` .jpg files per split for a valid category.

    Nothing happens unless ``count > VALID_COUNT``. For a valid category the
    destination folder ``image_cat_path/<split>/<name>/`` is created for every
    split and up to ``limit`` '*.jpg' files are copied into it from
    ``image_base_path/<split>/<name>``.

    Args:
        item: Per-category dict; only its 'name' entry is used.
        path_keys: Split folder names to process.
        count: Deciding image count for the category.
        limit: Maximum number of files copied per split.

    Returns:
        None. Source and destination folders are reported with print().
    """
    if count <= VALID_COUNT:
        return
    name = item.get('name')
    for path_key in path_keys:
        fromDirectory = image_base_path + path_key + '/' + name
        toDirectory = image_cat_path + path_key + '/' + name + '/'
        print(fromDirectory, toDirectory)
        os.makedirs(toDirectory, exist_ok=True)
        copied = 0
        # Sorted so the same files are selected on every run.
        for src_file in sorted(Path(fromDirectory).glob('*.jpg')):
            # Check before copying so exactly `limit` files are copied, not limit + 1.
            if copied >= limit:
                break
            if not src_file.is_file():
                continue
            # Path.name is separator-independent, unlike splitting on a backslash.
            shutil.copy(src_file, toDirectory + src_file.name)
            copied += 1

In [None]:
# Gather the per-category file counts of every split into one dict keyed by category name.
image_dict = {}
for path_key in path_keys:
    scan_file_count(path_key, image_dict)

# View over the per-category dicts; used below for the CSV report and the copy step.
values = image_dict.values()

If an `unknown` directory is needed, use the commented-out `unknown` lines below.

In [None]:
# os.makedirs(os.path.dirname(image_cat_path+'/valid/unknown/'), exist_ok=True)
# os.makedirs(os.path.dirname(image_cat_path+'/category_images/train/unknown/'), exist_ok=True)
# os.makedirs(os.path.dirname(image_cat_path+'/test/unknown/'), exist_ok=True)

# Make sure the three split folders exist under the category root.
for split_dir in ('/valid/', '/train/', '/test/'):
    os.makedirs(os.path.dirname(image_cat_path + split_dir), exist_ok=True)

In [None]:
# Write the per-category image counts (one row per category, one column per split)
# to ./m_image_num.csv.
csv_columns = ['name','valid','train','test']
print('./m_image_num.csv')
with open('./m_image_num.csv', 'w', encoding='utf-8', newline='')  as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=csv_columns)
    dict_writer.writeheader()
    dict_writer.writerows(values)      


In [None]:
# For every category, copy a capped number of images per split (cap passed as 100)
# when valid_category() reports a count above VALID_COUNT.
for idx, value in enumerate(values):
#     copy_category(value, path_keys, valid_category(value, path_keys))
    copy_category_with_same_image_count(value, path_keys, valid_category(value, path_keys), 100)
#     print(idx)