In [2]:
# A few details about the platform and the machine on which the computation is done.
# The __future__ import has to be the first statement: Python rejects a future statement
# that follows any other statement when this cell is compiled as one unit (e.g. after
# exporting the notebook to a .py script).
from __future__ import print_function

import platform

print('OS Details: ', platform.platform())
print('Version: ', platform.version())
print('Platform: ', platform.machine())

OS Details:  Linux-4.4.8-040408-generic-x86_64-with-Ubuntu-16.04-xenial
Version:  #201604200335 SMP Wed Apr 20 07:37:30 UTC 2016
Platform:  x86_64


In [2]:
import os
import sys
import numpy as np
import tarfile
from six.moves.urllib.request import urlretrieve

In [3]:
# Base URL; maybe_download appends the requested filename to it.
url = 'http://ufldl.stanford.edu/housenumbers/'
# Last percentage written by download_progress_hook (None until the first progress callback).
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
    """
    A hook to report the progress of a download. This is mostly intended for users with
    slow internet connections.

    Each time the integer percentage changes, writes the percentage itself (e.g. "10%")
    when it is a multiple of 10 and a single "." otherwise.

    Args:
        count: number of blocks transferred so far.
        blockSize: size of one block, in bytes.
        totalSize: total size of the download, in bytes. Zero or negative means the
            size is unknown; nothing is reported in that case.
    """
    global last_percent_reported

    if totalSize <= 0:
        # Without a usable total there is no percentage to compute (and 0 would divide by zero).
        return

    # count * blockSize can exceed totalSize on the final block, so cap the value at 100.
    percent = min(100, count * blockSize * 100 // totalSize)

    if last_percent_reported != percent:
        if percent % 10 == 0:
            sys.stdout.write("%s%%" % percent)
        else:
            sys.stdout.write(".")
        sys.stdout.flush()

        last_percent_reported = percent
        
def maybe_download(filename, force=False):
    """
    Download `filename` from the base `url` unless it is already present locally.

    The data is written to `filename + '.part'` first and renamed to `filename` only after
    the transfer has finished, so an interrupted download is not mistaken for a complete
    file on the next run. No size or checksum verification is performed.

    Args:
        filename: name of the remote file; also used as the local destination path.
        force: when True, download even if `filename` already exists.

    Returns:
        The local path of the file (equal to `filename`).
    """
    if force or not os.path.exists(filename):
        print('Attempting to download: ', filename)
        partial_path = filename + '.part'
        try:
            urlretrieve(url + filename, partial_path, reporthook=download_progress_hook)
            if os.path.exists(filename):
                # os.rename does not overwrite an existing file on Windows (force=True case).
                os.remove(filename)
            os.rename(partial_path, filename)
        finally:
            # Still present only when the download or the rename failed: drop the fragment.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print('\nDownload Complete!')
    else:
        print(filename, 'is already downloaded. Skipped.')
    return filename

In [4]:
# Fetch train.tar.gz from the base URL (skipped when the file already exists locally);
# maybe_download returns the local filename.
train_file_gz = maybe_download('train.tar.gz')

Attempting to download:  train.tar.gz
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........100%
Download Complete!


In [5]:
# Fetch test.tar.gz from the base URL (skipped when the file already exists locally);
# maybe_download returns the local filename.
test_file_gz = maybe_download('test.tar.gz')

Attempting to download:  test.tar.gz
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........100%
Download Complete!


In [6]:
# Fetch extra.tar.gz from the base URL (skipped when the file already exists locally);
# maybe_download returns the local filename.
extra_file_gz = maybe_download('extra.tar.gz')

Attempting to download:  extra.tar.gz
0%.........10%.........20%.........30%.........40%.........50%.........60%.........70%.........80%.........90%.........100%
Download Complete!


In [7]:
# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): no visible cell draws random numbers; presumably later cells rely on this
# for reproducibility - confirm.
np.random.seed(789)

def _safe_members(tar, base_dir):
    """Yield members of `tar` lazily, refusing any whose path resolves outside `base_dir`."""
    prefix = os.path.join(base_dir, '')  # base_dir with exactly one trailing separator
    for member in tar:
        target = os.path.realpath(os.path.join(base_dir, member.name))
        if target != base_dir and not target.startswith(prefix):
            raise tarfile.TarError(
                'Refusing to extract %s: it would be written outside of %s' % (member.name, base_dir))
        yield member


def maybe_extract(in_file, force=False):
    """
    Extract the archive `in_file` into the current working directory, unless the target
    folder already exists.

    The target folder name is `in_file` with its last two extensions removed
    (e.g. 'train.tar.gz' -> 'train'); the archive is assumed to unpack into that folder.

    Args:
        in_file: path of the tar archive to extract.
        force: when True, extract even if the folder is already present.

    Returns:
        The folder name derived from `in_file`.

    Raises:
        tarfile.TarError: if a member of the archive would be written outside the
            current working directory.
    """
    folder_name = os.path.splitext(os.path.splitext(in_file)[0])[0]  # except .tar.gz

    if os.path.isdir(folder_name) and not force:
        # Extraction may be forced by setting force = True (even though the folder is already present).
        print('%s folder is already present - skipping extraction of %s.' % (folder_name, in_file))
    else:
        print('Extracting data from file %s .. please wait...' % in_file)
        # Flush first so the message is visible while the (potentially long) extraction runs.
        sys.stdout.flush()
        destination = os.path.realpath(os.getcwd())
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(in_file) as tar:
            # The archive comes from the network, so member paths are checked before extraction.
            tar.extractall(members=_safe_members(tar, destination))
        print('File %s has been successfully extracted to %s folder.' % (in_file, folder_name))

    return folder_name

In [9]:
# Unpack the downloaded training archive; maybe_extract returns the folder name obtained
# by stripping the two extensions from the archive filename.
train_folder = maybe_extract(train_file_gz)

Extracting data from file train.tar.gz .. please wait...
File train.tar.gz has been successfully extracted to train folder.


In [10]:
# Unpack the downloaded test archive; maybe_extract returns the folder name obtained
# by stripping the two extensions from the archive filename.
test_folder = maybe_extract(test_file_gz)

Extracting data from file test.tar.gz .. please wait...
File test.tar.gz has been successfully extracted to test folder.


In [11]:
# Unpack the downloaded extra archive; maybe_extract returns the folder name obtained
# by stripping the two extensions from the archive filename.
extra_folder = maybe_extract(extra_file_gz)

Extracting data from file extra.tar.gz .. please wait...
File extra.tar.gz has been successfully extracted to extra folder.
