In [21]:
from pathlib import Path
import tarfile
import urllib.request
import pandas as pd


def load_dataset(data_file: str, extension: str = ".tar.xz") -> pd.DataFrame:
    """Download and extract a dataset archive if needed, then load its CSV into a DataFrame.

    The archive ``<data_file><extension>`` is fetched from the ``data_ipp``
    GitHub repository into the local ``datasets/`` directory and unpacked
    there. The CSV is then read from ``datasets/<data_file>/<data_file>.csv``.

    Args:
        data_file (str): base name of the dataset; used for the archive name
            and for the CSV file (and its folder) inside the archive.
        extension (str, optional): extension of the compressed file.
            Defaults to ".tar.xz".

    Returns:
        pandas.DataFrame: contents of the extracted CSV file.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / (data_file + extension)
    csv_path = datasets_dir / data_file / f"{data_file}.csv"
    data_url = "https://github.com/joaoalexarruda/data_ipp/raw/main/" + data_file + extension

    # Key the cache on the CSV, not the tarball: an archive that was downloaded
    # but never unpacked (interrupted run, deleted folder) must still be
    # extracted, otherwise read_csv below fails on every subsequent run.
    if not csv_path.is_file():
        datasets_dir.mkdir(exist_ok=True, parents=True)
        if not tarball_path.is_file():
            urllib.request.urlretrieve(data_url, tarball_path)
        with tarfile.open(tarball_path) as tar:
            # The archive comes from the network, so refuse members that would
            # escape the target directory. Extraction filters only exist on
            # newer (or security-patched) Python versions, hence the feature test.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=datasets_dir, filter="data")
            else:
                tar.extractall(path=datasets_dir)
    return pd.read_csv(csv_path)

In [22]:
# Fetch the "fishcatch" archive (cached under datasets/) and preview the first two rows.
df = load_dataset("fishcatch", ".tar.xz")
df.head(2)

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056


In [23]:
# Print a summary of the frame: index range, column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB
