<a href="https://colab.research.google.com/github/liuy01510/portfolio/blob/master/Python/Machine-Learning/Datasets/Dataset_Loader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Importing modules
import numpy as np
import scipy as sp
import pandas as pd
import collections as coll
from zipfile import ZipFile
import requests
import os



# Zip file extractor function
def Zip_Extractor(file_loc,all_files=False,extract_loc=None,return_dataframe=True,**kwargs):
    """
    Extract all files, or one user-selected file, from a zip archive.

    Args:

    - file_loc:str = location path of the zip file.

    - all_files:bool = True to extract every file in the archive. False to print the
      archive contents and prompt (via input()) for the index of the one file to extract.

    - extract_loc:str = Directory to place the extracted file(s) in. It is created
      (including missing parent folders) if it does not exist yet. If not passed,
      the folder containing the zip file is used.

    - return_dataframe:bool = Only used when a single file is extracted. If True, the
      extracted file is read with pandas.read_csv and returned, so the selected file
      must be a .csv file or pandas will raise an error.

    - **kwargs = accepted for call compatibility; not used by this function.

    Returns:

    - None when all_files is True or return_dataframe is False, otherwise the
      DataFrame read from the extracted file.

    Raises:

    - ValueError if the archive contains no entries to choose from.
    """

    # Resolving the extraction directory
    if extract_loc is None:
        # dirname() is '' for a bare filename such as 'data.zip'; fall back to the
        # current directory so the extracted path never collapses to '/<name>'.
        extLoc=os.path.dirname(file_loc) or "."
    else:
        # exist_ok avoids the need to swallow errors when the folder already exists,
        # while genuine failures (e.g. permissions) are still reported.
        os.makedirs(extract_loc,exist_ok=True)
        extLoc=extract_loc # saving to the user specified path.


    # If all files must be extracted
    if all_files:
        with ZipFile(file_loc) as zf:
            zf.extractall(path=extLoc)
        return None

    # Extract only 1 specific file
    with ZipFile(file_loc) as zf:
        zfNameList=zf.namelist() # names of every entry stored in the archive.
        if not zfNameList:
            raise ValueError(f"Zip file '{file_loc}' contains no files to extract.")

        for i,n in enumerate(zfNameList):
            print(f"{i} --> {n}") # print out the list of index and corresponding file names

        # Keep asking until a valid index is supplied; non-numeric answers are
        # treated like out-of-range ones instead of aborting the extraction.
        userSelection=None
        while userSelection is None:
            rawSelection=input('Please select a file to be extracted based on its index.')
            try:
                selectedIdx=int(rawSelection)
            except ValueError:
                continue
            if 0<=selectedIdx<len(zfNameList):
                userSelection=zfNameList[selectedIdx]

        # extract() returns the path it actually wrote to, which is more reliable
        # than rebuilding the path by string concatenation.
        extractedPath=zf.extract(userSelection,extLoc)


    # Check if need to return as dataframe
    if not return_dataframe:
        return None

    return pd.read_csv(extractedPath)

# Wrapper for the different dataset-loading functions
def Load_Dataset(url,source='sg_data',save_location='/content',**kwargs):
    """
    Download a dataset from a url and optionally extract a file from it.

    Args:

    - url:str = url to load the dataset from.

    - source:('sg_data',) = Selects the loader used to fetch and parse the data. Only 'sg_data' is supported at the moment.

    - save_location:str = Directory to save the downloaded file in. Default is '/content'. Created if it does not exist.

    - **kwargs

        - filename:str = Name of saved file. Default is 'dataset'

    Returns:

    - The result of the selected loader. For 'sg_data' this is None if the user
      declines extraction, otherwise whatever Zip_Extractor returns for the saved file.

    Raises:

    - KeyError if source is not one of the supported loader names.

    - requests.HTTPError if the download responds with an error status code.
    """

    def Load_SG_Dataset(url,save_location=None,**kwargs):
        """
        Loader for the 'sg_data' source: downloads the url content, saves it as a
        file and optionally hands it to Zip_Extractor.
        """

        # Requesting the zip file
        req=requests.get(url)
        # Fail early on 4xx/5xx instead of saving an error page as the "zip" file.
        req.raise_for_status()
        res=req.content

        # Writing the response byte file as a zip file into the local filesystem.
        fileName=kwargs.get('filename','dataset')
        os.makedirs(save_location,exist_ok=True) # make sure the target folder exists
        save_loc=os.path.join(save_location,fileName)
        with open(save_loc,mode='wb') as zf:
            zf.write(res)

        # Extracting the CSV File; the answer is normalised so 'y'/'n' are accepted too.
        userRes=input("Extract the CSV file (Y/N)?").strip().upper()
        while userRes not in ('Y','N'):
            userRes=input("Extract the CSV file (Y/N)?").strip().upper()

        if userRes=='N':
            return None
        return Zip_Extractor(save_loc)

    # Types of loading functions
    loaderTypes={'sg_data':Load_SG_Dataset}

    # Running the specific loading function
    try:
        loader=loaderTypes[source]
    except KeyError:
        raise KeyError(
            f"Unsupported source '{source}'. Supported sources: {sorted(loaderTypes)}"
        ) from None

    return loader(url,save_location=save_location,**kwargs)