## Download and extract zip from web

- Specifies the source URL, destination path and file name used to download and extract the data files
- Currently reading the data from an external folder, as GitHub does not support large files
    - To rerun function for testing before submission
    - To add checks and conditions for the function
- Link to zip download here: "https://s3-ap-southeast-1.amazonaws.com/grab-aiforsea-dataset/safety.zip" 

In [1]:
import zipfile
import urllib.request
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

In [2]:
# Download URL of the dataset archive for the Grab AI challenge.
SOURCE = "https://s3-ap-southeast-1.amazonaws.com/grab-aiforsea-dataset/safety.zip"
# Local directory that receives the archive and its extracted contents.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
OUTPUT_PATH = "/Users/jiayihuang/Desktop/Jtsw/grab-ai-safety/data"
# Archive file name, taken from the last URL segment ("safety.zip"). An empty
# name would make the download target resolve to the OUTPUT_PATH directory
# itself instead of a file inside it.
FILE_NAME = SOURCE.rsplit("/", 1)[-1]

In [3]:
class DownloadProgressBar(tqdm):
    '''tqdm progress bar that can be driven by a download report hook.'''

    def update_to(self, b=1, bsize=1, tsize=None):
        '''Advance the bar to the absolute position ``b * bsize``.

        Args:
        b (int): Number of blocks transferred so far.
        bsize (int): Size of each block.
        tsize (int): Total size; when given it replaces the bar's total.

        Returns:
        None.
        '''
        if tsize is not None:
            self.total = tsize
        # update() expects an increment, so subtract what is already shown.
        transferred = b * bsize
        self.update(transferred - self.n)


def maybe_download(url, output_path, dest_file_name):
    '''Download a ZIP archive unless it is already present, then extract it.

    The download is skipped when ``output_path/dest_file_name`` already
    exists and is a readable ZIP archive; otherwise the file is fetched
    from ``url`` with a progress bar. In both cases the archive is then
    extracted into ``output_path``.

    Args:
        url (str): Download URL of the dataset archive.
        output_path (str): Directory that receives the archive and its
            extracted contents. Created if it does not exist yet.
        dest_file_name (str): File name for the downloaded archive,
            including the .zip extension.

    Returns:
        None. All archive members are extracted into ``output_path``.

    Raises:
        ValueError: If ``dest_file_name`` is empty.
        zipfile.BadZipFile: If the downloaded file is not a valid ZIP.
    '''
    import os

    # An empty name would make the target path point at the directory itself.
    if not dest_file_name:
        raise ValueError(
            "dest_file_name must be a non-empty file name, e.g. 'safety.zip'"
        )

    os.makedirs(output_path, exist_ok=True)
    full_path = os.path.join(output_path, dest_file_name)

    # Only hit the network when no usable archive is on disk; a truncated
    # or corrupt leftover fails is_zipfile() and is downloaded again.
    already_downloaded = (
        os.path.isfile(full_path) and zipfile.is_zipfile(full_path)
    )
    if not already_downloaded:
        with DownloadProgressBar(
            unit='B',
            unit_scale=True,
            miniters=1,
            desc=url.split("/")[-1]
        ) as t:
            urllib.request.urlretrieve(
                url,
                filename=full_path,
                reporthook=t.update_to
            )

    with zipfile.ZipFile(full_path, "r") as zip_ref:
        zip_ref.extractall(output_path)

In [4]:
# maybe_download(SOURCE, OUTPUT_PATH, FILE_NAME)

In [5]:
# Load the ten feature shards (part-00000 ... part-00009) and the label file.
# The shard paths differ only in their five-digit part number.
_FEATURE_PART = (
    "../../grab-ai-safety-data/features/"
    "part-{:05d}-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv"
)
df0, df1, df2, df3, df4, df5, df6, df7, df8, df9 = (
    pd.read_csv(_FEATURE_PART.format(part)) for part in range(10)
)
response = pd.read_csv(
    "../../grab-ai-safety-data/labels/"
    "part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv"
)

## Merge and drop duplicates
- Join the features together with the labels
- Get rid of any obvious duplicates in the features and response
- No data cleaning or formatting to minimize data leakage

In [6]:
# Stack every feature shard (df0 through df9) row-wise; df0 was previously
# loaded but left out of this list. keep=False discards all copies of any
# fully identical row, not only the repeats.
df_features = pd.concat(
    [df0, df1, df2, df3, df4, df5, df6, df7, df8, df9],
    axis=0
).drop_duplicates(
    keep=False
)

# keep=False removes every bookingID that occurs more than once in the
# labels, so each remaining booking has exactly one label row.
response = response.drop_duplicates(
    subset="bookingID",
    keep=False
)

In [7]:
# Attach the label columns to each feature row; the inner join keeps only
# bookingIDs present in both frames.
df = df_features.merge(response, how="inner", on="bookingID")
# Order rows by booking, then by the "second" column within each booking.
df = df.sort_values(by=["bookingID", "second"], ascending=True)

In [8]:
# Persist the merged frame next to the raw data so it can be reloaded
# without repeating the CSV parsing and merge.
with open('../../grab-ai-safety-data/df_full.pickle', 'wb') as pickle_file:
    pickle.dump(df, pickle_file)