In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import joblib
import numpy as np
import pandas as pd
import re
import os
import time

from PIL import Image, ImageFilter
import requests

# Widen pandas' display limits: up to 100 characters per cell and up to
# 500 rows / 500 columns before the rendered output is truncated.
for option_name, option_value in (
    ('display.max_colwidth', 100),
    ('display.max_rows', 500),
    ('display.max_columns', 500),
):
    pd.set_option(option_name, option_value)

In [2]:
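# References used in this notebook:
# Loading and manipulating images with PIL/Pillow:
# https://machinelearningmastery.com/how-to-load-and-manipulate-images-for-deep-learning-in-python-with-pil-pillow/
#
# Timeouts in the requests library:
# https://realpython.com/python-requests/#timeouts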

In [3]:
#Import cleaned dataframe.
#The object is used below as a pandas DataFrame (.head, .loc, column assignment).
#NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary code;
#only load 'vehicleClean.joblib' when it comes from a trusted source.
vehicle=joblib.load('vehicleClean.joblib')

In [4]:
#Preview the first two rows of the loaded frame to check its columns.
vehicle.head(2)

Unnamed: 0,URL_Vehicle,Title,Location,Year,Year_in_Title,Odometer,RawMake,Make,Model,Trim,Seller,Price,SubLoc,Body,AttribDictionary,ImageDictionary,VehicleID,VIN,LN_Price,Model_Trim,Model_Seller,Trim_Seller
1,https://austin.craigslist.org/cto/d/red-rock-2006-ford-focus-se/7022331083.html,2006 Ford Focus SE,austin,2006,,,2006 ford focus se,ford,focus,se,owner,3000.0,(Cedar Creek),"[\n , \n2006 Ford Focus in great mechanical condition-AC/Heater work great-Tires have som...","{'0': '<span><b>2006 Ford Focus SE</b></span>', '1': '<span>fuel: <b>gas</b></span>', '2': '<spa...","{'0': 'https://images.craigslist.org/00n0n_dXTq6r46TL4_600x450.jpg', '1': 'https://images.craigs...",7022331083,,8.006701,focus_se,focus_owner,se_owner
2,https://austin.craigslist.org/cto/d/marble-falls-04-ford-f450-service-truck/7022347414.html,‘04 FORD F450 SERVICE TRUCK,austin,2004,,100000.0,2004 ford f450,ford,f-450,,owner,8950.0,(Marble Falls),"[\n , \nThis is a really nice dependable truck that we’ve been using daily for over a yea...","{'0': '<span><b>2004 FORD F450</b></span>', '1': '<span>condition: <b>excellent</b></span>', '2'...","{'0': 'https://images.craigslist.org/01717_30HqNTmEVww_600x450.jpg', '1': 'https://images.craigs...",7022347414,,9.099521,f-450_None,f-450_owner,None_owner


In [5]:
#Show only the ImageDictionary column for the first two rows; each cell is a dict
#mapping an entry key ('0', '1', ...) to an image URL.
vehicle[['ImageDictionary']].head(2)

Unnamed: 0,ImageDictionary
1,"{'0': 'https://images.craigslist.org/00n0n_dXTq6r46TL4_600x450.jpg', '1': 'https://images.craigs..."
2,"{'0': 'https://images.craigslist.org/01717_30HqNTmEVww_600x450.jpg', '1': 'https://images.craigs..."


In [6]:
#Insert VehicleID from URL_Vehicle
#The id is the run of digits immediately before '.html' in the listing URL,
#e.g. '.../red-rock-2006-ford-focus-se/7022331083.html' -> '7022331083'.
#Ids are kept as strings; rows whose URL has no such id get the string 'None'
#and their index label is printed so they can be inspected.
vehicle_id_pattern = re.compile(r'/(\d+)\.html')   #raw string: '\d' is not a valid str escape

idList=[]
#Iterating the column (instead of vehicle.loc[idx, ...]) always yields one scalar
#per row, even if the index contains duplicate labels.
for idx, url in vehicle['URL_Vehicle'].items():
    #str() lets a missing URL (NaN/None) fall through to the "no match" branch
    #instead of raising TypeError inside the regex engine.
    m = vehicle_id_pattern.search(str(url))   #search() returns the first match or None
    if m is not None:
        idList.append(m.group(1))
    else:
        print('Missing VehicleID index:', idx)
        idList.append('None')

vehicle['VehicleID'] = idList

In [7]:
#Re-display the first two rows to check the VehicleID column that was just assigned.
vehicle.head(2)

Unnamed: 0,URL_Vehicle,Title,Location,Year,Year_in_Title,Odometer,RawMake,Make,Model,Trim,Seller,Price,SubLoc,Body,AttribDictionary,ImageDictionary,VehicleID,VIN,LN_Price,Model_Trim,Model_Seller,Trim_Seller
1,https://austin.craigslist.org/cto/d/red-rock-2006-ford-focus-se/7022331083.html,2006 Ford Focus SE,austin,2006,,,2006 ford focus se,ford,focus,se,owner,3000.0,(Cedar Creek),"[\n , \n2006 Ford Focus in great mechanical condition-AC/Heater work great-Tires have som...","{'0': '<span><b>2006 Ford Focus SE</b></span>', '1': '<span>fuel: <b>gas</b></span>', '2': '<spa...","{'0': 'https://images.craigslist.org/00n0n_dXTq6r46TL4_600x450.jpg', '1': 'https://images.craigs...",7022331083,,8.006701,focus_se,focus_owner,se_owner
2,https://austin.craigslist.org/cto/d/marble-falls-04-ford-f450-service-truck/7022347414.html,‘04 FORD F450 SERVICE TRUCK,austin,2004,,100000.0,2004 ford f450,ford,f-450,,owner,8950.0,(Marble Falls),"[\n , \nThis is a really nice dependable truck that we’ve been using daily for over a yea...","{'0': '<span><b>2004 FORD F450</b></span>', '1': '<span>condition: <b>excellent</b></span>', '2'...","{'0': 'https://images.craigslist.org/01717_30HqNTmEVww_600x450.jpg', '1': 'https://images.craigs...",7022347414,,9.099521,f-450_None,f-450_owner,None_owner


In [8]:
#Extract images to imDict: {'VehicleID': {'EntryNum' : 'imFName'}
#imFName is the part of the image URL between 'org/' and '.jpg'
#(e.g. '00n0n_dXTq6r46TL4_600x450'), i.e. the file name without its extension.
image_stem_pattern = re.compile(r'org/(\S+)\.jpg')   #compiled once instead of per image

imDict={}
for vehID, images in zip(vehicle['VehicleID'], vehicle['ImageDictionary']):
    imDict[vehID]={}
    if not isinstance(images, dict):
        #A listing without a usable image dictionary keeps an empty entry
        #rather than aborting the whole extraction.
        print('No image dictionary for vehicle:', vehID)
        continue
    for k, url in images.items():
        m = image_stem_pattern.search(str(url))   #search() returns the first match or None
        if m is None:
            #Skip URLs that do not look like '<...>org/<name>.jpg' instead of
            #crashing on m.group(1).
            print('Unrecognised image URL:', vehID, k, url)
            continue
        imDict[vehID][k] = m.group(1)

In [9]:
#Persist the filename lookup uncompressed, then print the row count of the frame
#followed by the number of vehicles in imDict (one number per line) as a sanity check.
joblib.dump(imDict, 'imDict.joblib', compress=0)
for collection in (vehicle, imDict):
    print(len(collection))

7343
7343


###  Download images using imDict

In [None]:
#Fetch URLs from imDict.  Open URL, get image filename, and save to disk.
#Save to: Z:\DATA_SCIENCE\car_images\{VehicleID}\{imFName}
#
#Behaviour:
#  * One folder per VehicleID is created under IMAGE_ROOT.  A folder that already
#    exists marks a vehicle handled by an earlier run, so that vehicle is skipped;
#    this is what makes the cell resumable.
#  * Requests are throttled: a 1-2 s pause per vehicle plus a 61-139 s pause every
#    LONG_PAUSE_EVERY image requests.
#  * A failed download is recorded in errorDict (persisted to 'errorDict.joblib'
#    immediately) and the image is skipped -- nothing is written for it.
#    NOTE(review): errorDict keeps one (imFName, error kind) tuple per vehicle, so
#    only the most recent failure of a vehicle is retained.

IMAGE_ROOT = 'Z:/DATA_SCIENCE/car_images'   #parent folder of the per-vehicle folders
REQUEST_TIMEOUT = (5.15, 9.5)               #(connect timeout, read timeout) in seconds
LONG_PAUSE_EVERY = 450                      #long pause after this many image requests

vehCount = 0
imCount = 0
errorDict={}

#Fail fast if the target drive/folder is unavailable instead of reporting a
#misleading directory error for every vehicle.
os.makedirs(IMAGE_ROOT, exist_ok=True)

for vehID, images in imDict.items():  #top-level key is the vehID
    vehCount += 1
    vehDir = f'{IMAGE_ROOT}/{vehID}'
    try:
        os.mkdir(vehDir)     #make vehicleID folder
    except FileExistsError:
        print(f'Error making directory.  Directory {vehID} exists.')
        continue
    except OSError as err:
        #Any other failure (permissions, invalid name, ...) is reported as what it is.
        print(f'Error making directory {vehID}: {err}')
        continue
    short_sleep_sec = np.random.randint(low=1, high=3)   #high is exclusive: 1 or 2
    time.sleep(short_sleep_sec)

    for val in images.values():  #second-level value is the image name
        imCount += 1
        URL = f'https://images.craigslist.org/{val}.jpg'

        #Sleep every LONG_PAUSE_EVERY images
        if imCount % LONG_PAUSE_EVERY == 0:
            sleep_sec = np.random.randint(low=61, high=140)
            print(f'Sleeping...  Current imCount:', imCount)
            time.sleep(sleep_sec)

        failure = None
        try:
            #The with-block closes the connection even when decoding fails.
            with requests.get(URL, stream=True, timeout=REQUEST_TIMEOUT) as resp:
                resp.raise_for_status()          #treat HTTP 4xx/5xx as a failed download
                resp.raw.decode_content = True   #undo gzip/deflate transfer encoding, if any
                im = Image.open(resp.raw)
                im.load()                        #read all pixel data before the response closes
        except requests.exceptions.Timeout:
            print(f'Timout error.', vehID, val)
            failure = 'timeout_error'
        except Exception:
            #'Exception' (not a bare except) so Ctrl-C / kernel interrupt still stops the loop.
            print(f'Non-timeout error.', vehID, val)
            failure = 'non-timeout_error'

        if failure is not None:
            errorDict[vehID] = (val, failure)
            joblib.dump(errorDict, 'errorDict.joblib', compress=0)
            #Skip saving: otherwise 'im' would be undefined or still hold the previous
            #image, which would be written under this image's file name.
            continue

        imFName = f'{val}.png'    #same name as in the URL, with a png extension
        try:
            im.save(f'{vehDir}/{imFName}', format='PNG')  #save im to path
        except Exception:
            print('Saving error.', vehID, val)

print('VehicleID vehCount:', vehCount)

Error making directory.  Directory 7022331083 exists.
Error making directory.  Directory 7022347414 exists.
Error making directory.  Directory 7022290197 exists.
Error making directory.  Directory 7022393859 exists.
Error making directory.  Directory 7022376582 exists.
Error making directory.  Directory 7021342410 exists.
Error making directory.  Directory 7021388137 exists.
Error making directory.  Directory 7021390767 exists.
Error making directory.  Directory 7021413592 exists.
Error making directory.  Directory 7021416075 exists.
Error making directory.  Directory 7021443320 exists.
Error making directory.  Directory 7021454505 exists.
Error making directory.  Directory 7021477786 exists.
Error making directory.  Directory 7021498834 exists.
Error making directory.  Directory 7021535769 exists.
Error making directory.  Directory 7021643460 exists.
Error making directory.  Directory 7021672366 exists.
Error making directory.  Directory 7021684777 exists.
Error making directory.  Dir

Error making directory.  Directory 7014590092 exists.
Error making directory.  Directory 7014640661 exists.
Error making directory.  Directory 7014608746 exists.
Error making directory.  Directory 7014640762 exists.
Error making directory.  Directory 7014687059 exists.
Error making directory.  Directory 7014693611 exists.
Error making directory.  Directory 7014697824 exists.
Error making directory.  Directory 7014711045 exists.
Error making directory.  Directory 7014727797 exists.
Error making directory.  Directory 7014741611 exists.
Error making directory.  Directory 7014748521 exists.
Error making directory.  Directory 7014759233 exists.
Error making directory.  Directory 7014753661 exists.
Error making directory.  Directory 7014769054 exists.
Error making directory.  Directory 7014005805 exists.
Error making directory.  Directory 7014005509 exists.
Error making directory.  Directory 7014011391 exists.
Error making directory.  Directory 7014011584 exists.
Error making directory.  Dir

Error making directory.  Directory 7004894075 exists.
Error making directory.  Directory 7005014356 exists.
Error making directory.  Directory 7005051927 exists.
Error making directory.  Directory 7005073977 exists.
Error making directory.  Directory 7003794575 exists.
Error making directory.  Directory 7003882550 exists.
Error making directory.  Directory 7003886871 exists.
Error making directory.  Directory 7003886907 exists.
Error making directory.  Directory 7003031920 exists.
Error making directory.  Directory 7003200633 exists.
Error making directory.  Directory 7003251758 exists.
Error making directory.  Directory 7003249078 exists.
Error making directory.  Directory 7003252671 exists.
Error making directory.  Directory 7003293506 exists.
Error making directory.  Directory 7003413735 exists.
Error making directory.  Directory 7003487305 exists.
Error making directory.  Directory 7003576477 exists.
Error making directory.  Directory 7003544016 exists.
Error making directory.  Dir

Error making directory.  Directory 7006163068 exists.
Error making directory.  Directory 7006199088 exists.
Error making directory.  Directory 7006212489 exists.
Error making directory.  Directory 7006278059 exists.
Error making directory.  Directory 7006618142 exists.
Error making directory.  Directory 7006628624 exists.
Error making directory.  Directory 7006640723 exists.
Error making directory.  Directory 7006666448 exists.
Error making directory.  Directory 7006673765 exists.
Error making directory.  Directory 7006681590 exists.
Error making directory.  Directory 7006697239 exists.
Error making directory.  Directory 7006673737 exists.
Error making directory.  Directory 7006673932 exists.
Error making directory.  Directory 7006697761 exists.
Error making directory.  Directory 7006681352 exists.
Error making directory.  Directory 7006723920 exists.
Error making directory.  Directory 7006726011 exists.
Error making directory.  Directory 7006727637 exists.
Error making directory.  Dir

Error making directory.  Directory 7011677496 exists.
Error making directory.  Directory 7011695877 exists.
Error making directory.  Directory 7011697775 exists.
Error making directory.  Directory 7011742907 exists.
Error making directory.  Directory 7011815101 exists.
Error making directory.  Directory 7011818640 exists.
Error making directory.  Directory 7011819763 exists.
Error making directory.  Directory 7011853480 exists.
Error making directory.  Directory 7011913842 exists.
Error making directory.  Directory 7011922919 exists.
Error making directory.  Directory 7011939575 exists.
Error making directory.  Directory 7011950908 exists.
Error making directory.  Directory 7011952924 exists.
Error making directory.  Directory 7011984571 exists.
Error making directory.  Directory 7011984706 exists.
Error making directory.  Directory 7011985009 exists.
Error making directory.  Directory 7011985156 exists.
Error making directory.  Directory 7011985313 exists.
Error making directory.  Dir

Error making directory.  Directory 7016651728 exists.
Error making directory.  Directory 7016651832 exists.
Error making directory.  Directory 7016651757 exists.
Error making directory.  Directory 7016643496 exists.
Error making directory.  Directory 7016671648 exists.
Error making directory.  Directory 7016672469 exists.
Error making directory.  Directory 7016688494 exists.
Error making directory.  Directory 7016683138 exists.
Error making directory.  Directory 7016692598 exists.
Error making directory.  Directory 7016697216 exists.
Error making directory.  Directory 7016698134 exists.
Error making directory.  Directory 7016699136 exists.
Error making directory.  Directory 7016699396 exists.
Error making directory.  Directory 7016701967 exists.
Error making directory.  Directory 7016723725 exists.
Error making directory.  Directory 7016746486 exists.
Error making directory.  Directory 7016748041 exists.
Error making directory.  Directory 7016768337 exists.
Error making directory.  Dir

Error making directory.  Directory 7019893723 exists.
Error making directory.  Directory 7019910194 exists.
Error making directory.  Directory 7019941798 exists.
Error making directory.  Directory 7019950279 exists.
Error making directory.  Directory 7019968135 exists.
Error making directory.  Directory 7019973494 exists.
Error making directory.  Directory 7019990030 exists.
Error making directory.  Directory 7019990091 exists.
Error making directory.  Directory 7020008767 exists.
Error making directory.  Directory 7020012626 exists.
Error making directory.  Directory 7020403049 exists.
Error making directory.  Directory 7020439572 exists.
Error making directory.  Directory 7020452645 exists.
Error making directory.  Directory 7020460238 exists.
Error making directory.  Directory 7020461028 exists.
Error making directory.  Directory 7020498785 exists.
Error making directory.  Directory 7020498863 exists.
Error making directory.  Directory 7020556334 exists.
Error making directory.  Dir

Error making directory.  Directory 7022010851 exists.
Error making directory.  Directory 7021742016 exists.
Error making directory.  Directory 7021746353 exists.
Error making directory.  Directory 7021763549 exists.
Error making directory.  Directory 7021771939 exists.
Error making directory.  Directory 7021534947 exists.
Error making directory.  Directory 7021759570 exists.
Error making directory.  Directory 7021495594 exists.
Error making directory.  Directory 7021476193 exists.
Error making directory.  Directory 7021478046 exists.
Error making directory.  Directory 7021495886 exists.
Error making directory.  Directory 7021490479 exists.
Error making directory.  Directory 7021493060 exists.
Error making directory.  Directory 7021510137 exists.
Error making directory.  Directory 7021513905 exists.
Sleeping...  Current imCount: 450
Non-timeout error. 7021246115 00s0s_8eL1wC819zh_600x450
Non-timeout error. 7021271652 00909_cDDunpKdPGE_600x450
Sleeping...  Current imCount: 900
Non-timeou

In [None]:
# count=0
# for idx in vehicle.index:
#     count+=len(vehicle.loc[idx, 'ImageDictionary'])
# count 
   

In [None]:
# 126617/6990

In [None]:
# count = 0
# for i in range(0,52):
#     print(count)
#     count += 1
#     if count % 10 == 0:
#         sleep_sec = np.random.randint(low=5, high=12)
#         time.sleep(sleep_sec)
#         print('time slept:', sleep_sec)

In [None]:
# imConverted = Image.open(f'Z:\DATA_SCIENCE\car_images\{imFName}')
# print('imConverted format:', imConverted.format_description, '\nimConvertedage mode:', imConverted.mode, '\nimConverted size:', imConverted.size)

In [None]:
# #Open image from URL. Save as PNG.
# URL = 'https://images.craigslist.org/00t0t_79tmoKPwmNV_600x450.jpg'
# im = Image.open(requests.get(URL, stream=True).raw)
# imFName = re.search('org/(\S+)\.jpg', URL).group(1) + '.png' #re.search(pattern, string) gets first match
# im.save(f'Z:\DATA_SCIENCE\car_images\{imFName}', format='PNG')

In [None]:
# myurl = 'https://images.craigslist.org/00t0t_79tmoKPwmNV_600x450.jpg'
# im = Image.open(urllib.request.urlopen(myurl))