# Agtuary ML Challenge - 1. Data Extraction

This notebook downloads the raw data for the [Agtuary ML Challenge](https://github.com/agtuary/machine-learning-challenge). The downloaded TAR archive is extracted into a subfolder named 'data_raw', where it can be accessed for analysis, model training and inference in later notebooks.

In [1]:
import os
import glob
import tarfile
import urllib.request

In [2]:
# Configuration: public URL of the challenge dataset archive, and the local
# filename the downloaded archive is saved under.
AGTUARY_URL = "https://agtuary-data-public.s3.ap-southeast-2.amazonaws.com/machine-learning-challenge/agtuary-ml.tar.gz"
TAR_FILE = "agtuary-ml.tar.gz"

In [3]:
# Capture the current working directory; later cells build file and folder
# paths from `path`.
# NOTE(review): the printed value is an absolute, machine-specific path --
# consider clearing this output before sharing the notebook.
path = os.getcwd()
print(path)

/home/lachy/dml/agtuary-machine-learning-challenge-master


In [4]:
# download data from link
def download_data(url, filename):
    """Download ``url`` to ``filename`` unless that file already exists.

    Args:
        url: Address of the file to fetch.
        filename: Destination path. A relative path is resolved against the
            current working directory, which is where the file is written.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the transfer
        fails (e.g. ``urllib.error.URLError``). No file is left at
        ``filename`` in that case.
    """
    print(f"Checking if '{filename}' already exists in current directory...")
    # Check the destination that was actually requested, not notebook-level
    # globals, so the test always matches where the file will be written.
    if os.path.exists(filename):
        print(f"Cannot download: '{filename}' already exists.")
        return

    print(f"Downloading '{filename}' from {url}")
    # Download under a temporary name first: an interrupted transfer must not
    # leave a truncated file at `filename`, because the existence check above
    # would then skip the download on every later run.
    partial_path = f"{filename}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, filename)
    except BaseException:
        # BaseException so a kernel interrupt (KeyboardInterrupt) also cleans
        # up the partial file; the original error is always re-raised.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    print("Download complete.")

In [5]:
# make subfolder to store data
def make_subfolder(subfolder_path):
    """Create the directory ``subfolder_path`` if nothing exists at that path.

    Args:
        subfolder_path: Path of the directory to create. Only the final path
            component is created (``os.mkdir``), so the parent must exist.

    Returns:
        None. The outcome is reported with ``print``.
    """
    # Guard clause: leave anything already present at the path untouched.
    if os.path.exists(subfolder_path):
        print(f"Cannot create directory: '{subfolder_path}' already exists.")
        return

    os.mkdir(subfolder_path)
    print(f"Created subfolder '{subfolder_path}'")

In [6]:
# extract tar archive to empty directory
def extract_tar_to_dir(tar_file, extract_dir):
    """Extract ``tar_file`` into ``extract_dir``, but only if it is empty.

    Args:
        tar_file: Path of the tar archive. A relative path is resolved
            against the current working directory.
        extract_dir: Existing directory to extract into. Nothing is extracted
            if it already contains any entries.

    Returns:
        None. The outcome is reported with ``print``.

    Raises:
        FileNotFoundError: If ``extract_dir`` or the archive does not exist.
        tarfile.TarError: If the archive is unreadable or contains a member
            that would be written outside ``extract_dir``.
    """
    tar_path = os.path.join(os.getcwd(), tar_file)
    if os.listdir(extract_dir):
        print(f"Cannot extract: '{extract_dir}' is not empty.")
        return

    with tarfile.open(tar_path) as tar:
        # The archive comes from the network, so member paths must not be
        # trusted: a name such as '../x' or '/etc/x' could otherwise write
        # outside extract_dir.
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: let tarfile enforce safety.
            tar.extractall(extract_dir, filter="data")
        else:
            # Older Python without extraction filters: do a basic containment
            # check on every member path before extracting anything.
            root = os.path.realpath(extract_dir)
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(root, member.name))
                if os.path.commonpath([root, target]) != root:
                    raise tarfile.TarError(
                        f"Unsafe path in archive: '{member.name}'"
                    )
            tar.extractall(extract_dir)
    print(f"'{tar_file}' extracted to '{extract_dir}'")

In [7]:
%%time
# Fetch the dataset archive (AGTUARY_URL) and save it as TAR_FILE; see
# download_data for the "already exists" check. The cell is timed by %%time.
download_data(url=AGTUARY_URL, filename=TAR_FILE)

Checking if 'agtuary-ml.tar.gz' already exists in current directory...
Downloading 'agtuary-ml.tar.gz' from https://agtuary-data-public.s3.ap-southeast-2.amazonaws.com/machine-learning-challenge/agtuary-ml.tar.gz
Download complete.
CPU times: user 1.96 s, sys: 1.7 s, total: 3.66 s
Wall time: 52 s


In [10]:
# Create the 'data_raw' subfolder, located under the working directory
# captured in `path`; `data_dir` is the extraction target used below.
data_dir = os.path.join(path, "data_raw")
make_subfolder(data_dir)

Created subfolder '/home/lachy/dml/agtuary-machine-learning-challenge-master/data_raw'


In [11]:
%%time
# Unpack the downloaded archive into the data subfolder. extract_tar_to_dir
# only extracts when the target directory is empty.
extract_tar_to_dir(TAR_FILE, data_dir)

'agtuary-ml.tar.gz' extracted to '/home/lachy/dml/agtuary-machine-learning-challenge-master/data_raw'
CPU times: user 1.65 s, sys: 310 ms, total: 1.96 s
Wall time: 1.96 s
