In [55]:
import requests
import urllib

import os
import shutil
import glob

from datetime import datetime
from distutils.util import strtobool

import datatable as dt 
import pandas as pd 

from bs4 import BeautifulSoup

from tqdm import tqdm

import logging
logging.basicConfig(filename='ingest.log', encoding='utf-8', level=logging.INFO)


df = dt.fread("monash-repository.csv")
df.head()

Unnamed: 0_level_0,Dataset,Domain,No: of Series,Min. Length,Max. Length,Competition,Multivariate,Download,Source,URL,Frequency
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,M1,Multiple,1001,15,150,Yes,No,Yearly,"Makridakis et al., 1982",https://zenodo.org/record/4656193,Yearly
1,M1,Multiple,1001,15,150,Yes,No,Quarterly,"Makridakis et al., 1982",https://zenodo.org/record/4656154,Quarterly
2,M1,Multiple,1001,15,150,Yes,No,Monthly,"Makridakis et al., 1982",https://zenodo.org/record/4656159,Monthly
3,M3,Multiple,3003,20,144,Yes,No,Yearly,"Makridakis and Hibon, 2000",https://zenodo.org/record/4656222,Yearly
4,M3,Multiple,3003,20,144,Yes,No,Quarterly,"Makridakis and Hibon, 2000",https://zenodo.org/record/4656262,Quarterly
5,M3,Multiple,3003,20,144,Yes,No,Monthly,"Makridakis and Hibon, 2000",https://zenodo.org/record/4656298,Monthly
6,M3,Multiple,3003,20,144,Yes,No,Other,"Makridakis and Hibon, 2000",https://zenodo.org/record/4656335,Other
7,M4,Multiple,100000,19,9933,Yes,No,Yearly,"Makridakis et al., 2020",https://zenodo.org/record/4656379,Yearly
8,M4,Multiple,100000,19,9933,Yes,No,Quarterly,"Makridakis et al., 2020",https://zenodo.org/record/4656410,Quarterly
9,M4,Multiple,100000,19,9933,Yes,No,Monthly,"Makridakis et al., 2020",https://zenodo.org/record/4656480,Monthly


In [56]:
df = df.to_pandas()

In [57]:
df.shape

(58, 11)

In [58]:
x = df['URL'][0]
x

'https://zenodo.org/record/4656193'

# Utility Functions

In [59]:

def convert_tsf_to_dataframe(full_file_path_and_name, replace_missing_vals_with="NaN", value_column_name="series_value"):
    '''
    Converts the contents in a .tsf file into a dataframe and returns it along with other meta-data of the dataset.
    
        Parameters:
            full_file_path_and_name (str): complete .tsf file path
            replace_missing_vals_with (str): a term to indicate the missing values in series in the returning dataframe
            value_column_name (str): Any name that is preferred to have as the name of the column containing series values in the returning dataframe
    
        Returns:
            data (pd.DataFrame): load data frame
            frequency (str): time series frequency
            horizon (int): time series forecasting horizon
            missing (bool): whether the dataset contains missing values
            equal (bool): whether the series have equal lengths
    
    '''
    
    col_names = []
    col_types = []
    all_data = {}
    line_count = 0
    frequency = None
    forecast_horizon = None
    contain_missing_values = None
    contain_equal_length = None
    found_data_tag = False
    found_data_section = False
    started_reading_data_section = False

    with open(full_file_path_and_name, "r", encoding="cp1252") as file:
        for line in file:
            # Strip white space from start/end of line
            line = line.strip()

            if line:
                if line.startswith("@"):  # Read meta-data
                    if not line.startswith("@data"):
                        line_content = line.split(" ")
                        if line.startswith("@attribute"):
                            if (
                                len(line_content) != 3
                            ):  # Attributes have both name and type
                                raise Exception("Invalid meta-data specification.")

                            col_names.append(line_content[1])
                            col_types.append(line_content[2])
                        else:
                            if (
                                len(line_content) != 2
                            ):  # Other meta-data have only values
                                raise Exception("Invalid meta-data specification.")

                            if line.startswith("@frequency"):
                                frequency = line_content[1]
                            elif line.startswith("@horizon"):
                                forecast_horizon = int(line_content[1])
                            elif line.startswith("@missing"):
                                contain_missing_values = bool(
                                    strtobool(line_content[1])
                                )
                            elif line.startswith("@equallength"):
                                contain_equal_length = bool(strtobool(line_content[1]))

                    else:
                        if len(col_names) == 0:
                            raise Exception(
                                "Missing attribute section. Attribute section must come before data."
                            )

                        found_data_tag = True
                elif not line.startswith("#"):
                    if len(col_names) == 0:
                        raise Exception(
                            "Missing attribute section. Attribute section must come before data."
                        )
                    elif not found_data_tag:
                        raise Exception("Missing @data tag.")
                    else:
                        if not started_reading_data_section:
                            started_reading_data_section = True
                            found_data_section = True
                            all_series = []

                            for col in col_names:
                                all_data[col] = []

                        full_info = line.split(":")

                        if len(full_info) != (len(col_names) + 1):
                            raise Exception("Missing attributes/values in series.")

                        series = full_info[len(full_info) - 1]
                        series = series.split(",")

                        if len(series) == 0:
                            raise Exception(
                                "A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series. Missing values should be indicated with ? symbol"
                            )

                        numeric_series = []

                        for val in series:
                            if val == "?":
                                numeric_series.append(replace_missing_vals_with)
                            else:
                                numeric_series.append(float(val))

                        if numeric_series.count(replace_missing_vals_with) == len(
                            numeric_series
                        ):
                            raise Exception(
                                "All series values are missing. A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series."
                            )

                        all_series.append(pd.Series(numeric_series).array)

                        for i in range(len(col_names)):
                            att_val = None
                            if col_types[i] == "numeric":
                                att_val = int(full_info[i])
                            elif col_types[i] == "string":
                                att_val = str(full_info[i])
                            elif col_types[i] == "date":
                                att_val = datetime.strptime(
                                    full_info[i], "%Y-%m-%d %H-%M-%S"
                                )
                            else:
                                raise Exception(
                                    "Invalid attribute type."
                                )  # Currently, the code supports only numeric, string and date types. Extend this as required.

                            if att_val is None:
                                raise Exception("Invalid attribute value.")
                            else:
                                all_data[col_names[i]].append(att_val)

                line_count = line_count + 1

        if line_count == 0:
            raise Exception("Empty file.")
        if len(col_names) == 0:
            raise Exception("Missing attribute section.")
        if not found_data_section:
            raise Exception("Missing series information under data section.")

        all_data[value_column_name] = all_series
        loaded_data = pd.DataFrame(all_data)

        return (
            loaded_data,
            frequency,
            forecast_horizon,
            contain_missing_values,
            contain_equal_length,
        )

def parse_monash_df(file):
    ''' Function to Parse a Locally Extracted and Downloaded File'''
    
    loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe(file)

    logging.info(f'PARSING FILE: {file}...')
    logging.info(f"IDENTIFIED FREQUENCY: {frequency}...")
    logging.info(f"IDENTIFIED FORECAST HORIZON: {forecast_horizon}")

    parsed_df = pd.DataFrame()

    #freq = frequency
    if frequency == 'yearly':
        freq = 'YS' #year start
    elif frequency == 'quarterly':
        freq = 'QS' #quarter start
    elif frequency == 'monthly':
        freq = 'MS'
    elif frequency == 'daily':
        freq = 'D'


    for index,row in tqdm(loaded_data.iterrows()):
        
        name = row.series_name
        #print(name)
        values = row.series_value.tolist()
        length = len(values)
        start = row.start_timestamp
        
        #print(f'STARTING TIMESTAMP: {start}')
        #$print(f'TIMESERIES LENGTH: {length}')
        
        #print(length)
        try:
            ds = pd.date_range(start, periods=length, freq=freq)
            series_df = pd.DataFrame({'unique_id':name,'ds':ds, 'values':values})
        except:
            logging.warning(f'FAILED PARSING TIMESERIES: {name}')
            series_df = pd.DataFrame()
            
        #convert date range to datetime and automatically coerce errors
        #ds = pd.to_datetime(ds, errors = 'coerce')

        
        #series_df = pd.DataFrame({'unique_id':name,'ds':ds, 'values':values})
        parsed_df = pd.concat([parsed_df, series_df], axis=0)
        
    return parsed_df

# Example Usage
#parse_monash_df("m1_yearly_dataset.tsf").head()

def retrieve_monash_df(url):
    
    # Create Soup
    page = requests.get(url)
    
    # Create Soup
    soup = BeautifulSoup(page.text, 'html.parser')
    
    # Find Download URL from Filename Class
    info = soup.select("[class~=filename]")
    
    # Parse Download Url
    download_url = info[0].get('href')
    
    # Download File
    urllib.request.urlretrieve("https://zenodo.org" + download_url, "tmp.zip")
    
# Example Usage
#retrieve_monash_df(df['URL'][1])


In [17]:
# Focus on just a few datasets
df = df[df['Frequency'].isin(['Daily'])]
df.head()

Unnamed: 0,Dataset,Domain,No: of Series,Min. Length,Max. Length,Competition,Multivariate,Download,Source,URL,Frequency
11,M4,Multiple,100000,19,9933,Yes,No,Daily,"Makridakis et al., 2020",https://zenodo.org/record/4656548,Daily
30,Weather,Nature,3010,1332,65981,No,No,Daily,"Sparks et al., 2020",https://zenodo.org/record/4654822,Daily
49,COVID Deaths,Nature,266,212,212,No,Yes,Daily,"Johns Hopkins University, 2020",https://zenodo.org/record/4656009,Daily
54,Saugeen River Flow,Nature,1,23741,23741,No,No,Daily,"McLeod and Gweon, 2013",https://zenodo.org/record/4656058,Daily
55,US Births,Nature,1,7305,7305,No,No,Daily,"Pruim et al., 2020",https://zenodo.org/record/4656049,Daily


In [33]:
project_path = '/Users/jfarland/Documents/research/timeseries-benchmarks'
data_path = project_path + '/data'
data_path

'/Users/jfarland/Documents/research/timeseries-benchmarks/data'

In [60]:
# limit to 1 data set
sample_df = df[df['Frequency'].isin(['Daily', 'Monthly', 'Yearly'])]

#sample_df = sample_df[sample_df['Multivariate']=='Yes']
#sample_df = sample_df[(sample_df['Download']=='Yearly') & (sample_df['Dataset'] == 'M4')]

print(sample_df)

for index, row in tqdm(sample_df.iterrows()):
    
    # Make sure current directory is data directory
    os.chdir(data_path)
    
    # Remove Archive File if it already exists in 
    if os.path.exists(data_path + '/tmp.zip'):
        logging.info(f'DELETING EXISTING ARCHIVE FILE...')
        os.remove(data_path + '/tmp.zip')
        
    # Identify url from mapping file
    url = row.URL
    logging.info(f'RETRIEVING URL: {url}')
    
    # Retrieve the file from the url
    retrieve_monash_df(url)
    
    if not os.path.exists('staging'):
        logging.info("STAGING DIRECTORY NOT FOUND, CREATING NEW ONE...")
        os.makedirs('staging')
    else:
        try:
            logging.info("STAGING DIRECTORY FOUND, CLEANING AND RECREATING...")
            shutil.rmtree('staging')
            os.makedirs('staging')
        except OSError as e:
            logging.warning("Error: %s : %s" % (dest, e.strerror))
        
    # Copy Archive File into Staging Directory
    shutil.copy('tmp.zip', 'staging')
    
    # Go into the directory and unpack
    os.chdir('staging')
        
    shutil.unpack_archive('tmp.zip')
    
    # Find any TSF Files
    result = glob.glob('*.{}'.format('tsf'))
    
    # Find the name of the file 
    local_name = result[0].split(".")[0]
    logging.info(f'DATASET NAME: {local_name}...')
    
    # Parse the DataFrame
    local_df = parse_monash_df(result[0])
    
    # Convert to DataTable
    local_df = dt.Frame(local_df)
    
    os.chdir(data_path)
    
    if not os.path.exists(local_name):
        logging.info("DATASET DIRECTORY NOT FOUND, CREATING NEW ONE...")
        os.makedirs(local_name)
    else:
        try:
            logging.info("DATASET DIRECTORY FOUND, CLEANING AND RECREATING...")
            shutil.rmtree(local_name)
            os.makedirs(local_name)
        except OSError as e:
            logging.warning("Error: %s : %s" % (dest, e.strerror))
        
    local_df.to_csv(local_name + '/' + local_name + '.csv')
    
    
 




  Dataset    Domain  No: of Series  Min. Length  Max. Length Competition  \
7      M4  Multiple         100000           19         9933         Yes   

  Multivariate Download                   Source  \
7           No   Yearly  Makridakis et al., 2020   

                                 URL Frequency  
7  https://zenodo.org/record/4656379    Yearly  


23000it [01:20, 284.50it/s]
1it [01:33, 93.63s/it]


In [24]:
from datetime import datetime
from distutils.util import strtobool

import pandas as pd


# Converts the contents in a .tsf file into a dataframe and returns it along with other meta-data of the dataset: frequency, horizon, whether the dataset contains missing values and whether the series have equal lengths
#
# Parameters
# full_file_path_and_name - complete .tsf file path
# replace_missing_vals_with - a term to indicate the missing values in series in the returning dataframe
# value_column_name - Any name that is preferred to have as the name of the column containing series values in the returning dataframe
def convert_tsf_to_dataframe(
    full_file_path_and_name,
    replace_missing_vals_with="NaN",
    value_column_name="series_value",
):
    col_names = []
    col_types = []
    all_data = {}
    line_count = 0
    frequency = None
    forecast_horizon = None
    contain_missing_values = None
    contain_equal_length = None
    found_data_tag = False
    found_data_section = False
    started_reading_data_section = False

    with open(full_file_path_and_name, "r", encoding="cp1252") as file:
        for line in file:
            # Strip white space from start/end of line
            line = line.strip()

            if line:
                if line.startswith("@"):  # Read meta-data
                    if not line.startswith("@data"):
                        line_content = line.split(" ")
                        if line.startswith("@attribute"):
                            if (
                                len(line_content) != 3
                            ):  # Attributes have both name and type
                                raise Exception("Invalid meta-data specification.")

                            col_names.append(line_content[1])
                            col_types.append(line_content[2])
                        else:
                            if (
                                len(line_content) != 2
                            ):  # Other meta-data have only values
                                raise Exception("Invalid meta-data specification.")

                            if line.startswith("@frequency"):
                                frequency = line_content[1]
                            elif line.startswith("@horizon"):
                                forecast_horizon = int(line_content[1])
                            elif line.startswith("@missing"):
                                contain_missing_values = bool(
                                    strtobool(line_content[1])
                                )
                            elif line.startswith("@equallength"):
                                contain_equal_length = bool(strtobool(line_content[1]))

                    else:
                        if len(col_names) == 0:
                            raise Exception(
                                "Missing attribute section. Attribute section must come before data."
                            )

                        found_data_tag = True
                elif not line.startswith("#"):
                    if len(col_names) == 0:
                        raise Exception(
                            "Missing attribute section. Attribute section must come before data."
                        )
                    elif not found_data_tag:
                        raise Exception("Missing @data tag.")
                    else:
                        if not started_reading_data_section:
                            started_reading_data_section = True
                            found_data_section = True
                            all_series = []

                            for col in col_names:
                                all_data[col] = []

                        full_info = line.split(":")

                        if len(full_info) != (len(col_names) + 1):
                            raise Exception("Missing attributes/values in series.")

                        series = full_info[len(full_info) - 1]
                        series = series.split(",")

                        if len(series) == 0:
                            raise Exception(
                                "A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series. Missing values should be indicated with ? symbol"
                            )

                        numeric_series = []

                        for val in series:
                            if val == "?":
                                numeric_series.append(replace_missing_vals_with)
                            else:
                                numeric_series.append(float(val))

                        if numeric_series.count(replace_missing_vals_with) == len(
                            numeric_series
                        ):
                            raise Exception(
                                "All series values are missing. A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series."
                            )

                        all_series.append(pd.Series(numeric_series).array)

                        for i in range(len(col_names)):
                            att_val = None
                            if col_types[i] == "numeric":
                                att_val = int(full_info[i])
                            elif col_types[i] == "string":
                                att_val = str(full_info[i])
                            elif col_types[i] == "date":
                                att_val = datetime.strptime(
                                    full_info[i], "%Y-%m-%d %H-%M-%S"
                                )
                            else:
                                raise Exception(
                                    "Invalid attribute type."
                                )  # Currently, the code supports only numeric, string and date types. Extend this as required.

                            if att_val is None:
                                raise Exception("Invalid attribute value.")
                            else:
                                all_data[col_names[i]].append(att_val)

                line_count = line_count + 1

        if line_count == 0:
            raise Exception("Empty file.")
        if len(col_names) == 0:
            raise Exception("Missing attribute section.")
        if not found_data_section:
            raise Exception("Missing series information under data section.")

        all_data[value_column_name] = all_series
        loaded_data = pd.DataFrame(all_data)

        return (
            loaded_data,
            frequency,
            forecast_horizon,
            contain_missing_values,
            contain_equal_length,
        )


In [25]:
# Example of usage
loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("m1_yearly_dataset.tsf")

print(loaded_data)
print(frequency)
print(forecast_horizon)
print(contain_missing_values)
print(contain_equal_length)

    series_name start_timestamp  \
0            T1      1972-01-01   
1            T2      1974-01-01   
2            T3      1974-01-01   
3            T4      1974-01-01   
4            T5      1976-01-01   
..          ...             ...   
176        T177      1974-01-01   
177        T178      1973-01-01   
178        T179      1973-01-01   
179        T180      1975-01-01   
180        T181      1975-01-01   

                                          series_value  
0    [3600.0, 7700.0, 12300.0, 30500.0, 47390.0, 57...  
1    [12654.0, 22879.0, 34164.0, 49524.0, 64761.0, ...  
2    [2142.0, 12935.0, 19130.0, 30500.0, 48177.0, 5...  
3    [5774.0, 7650.0, 9271.0, 21447.0, 28998.0, 409...  
4    [432312.0, 569011.0, 862673.0, 1155640.0, 1439...  
..                                                 ...  
176  [290783.0, 285242.0, 293718.0, 295804.0, 29458...  
177  [11693.0, 11702.0, 11703.0, 11557.0, 11951.0, ...  
178  [8438.0, 8689.0, 8590.0, 8763.0, 8710.0, 8837....  
179  [55.

In [26]:
type(loaded_data)

pandas.core.frame.DataFrame

In [27]:
loaded_data.head()

Unnamed: 0,series_name,start_timestamp,series_value
0,T1,1972-01-01,"[3600.0, 7700.0, 12300.0, 30500.0, 47390.0, 57..."
1,T2,1974-01-01,"[12654.0, 22879.0, 34164.0, 49524.0, 64761.0, ..."
2,T3,1974-01-01,"[2142.0, 12935.0, 19130.0, 30500.0, 48177.0, 5..."
3,T4,1974-01-01,"[5774.0, 7650.0, 9271.0, 21447.0, 28998.0, 409..."
4,T5,1976-01-01,"[432312.0, 569011.0, 862673.0, 1155640.0, 1439..."


In [32]:
import os

#import sktime
#from sktime.utils.data_io import load_from_tsfile_to_dataframe
from sktime.datatypes import check_is_scitype

check_is_scitype(loaded_data)



#DATA_PATH = os.path.join(os.path.dirname(sktime.__file__), "datasets/data")

# train_x, train_y = load_from_tsfile_to_dataframe(
#     os.path.join(DATA_PATH, "ArrowHead/ArrowHead_TRAIN.ts")
# )
# test_x, test_y = load_from_tsfile_to_dataframe(
#     os.path.join(DATA_PATH, "ArrowHead/ArrowHead_TEST.ts")

TypeError: check_is_scitype() missing 1 required positional argument: 'scitype'

In [33]:
loaded_data


Unnamed: 0,series_name,start_timestamp,series_value
0,T1,1972-01-01,"[3600.0, 7700.0, 12300.0, 30500.0, 47390.0, 57..."
1,T2,1974-01-01,"[12654.0, 22879.0, 34164.0, 49524.0, 64761.0, ..."
2,T3,1974-01-01,"[2142.0, 12935.0, 19130.0, 30500.0, 48177.0, 5..."
3,T4,1974-01-01,"[5774.0, 7650.0, 9271.0, 21447.0, 28998.0, 409..."
4,T5,1976-01-01,"[432312.0, 569011.0, 862673.0, 1155640.0, 1439..."
...,...,...,...
176,T177,1974-01-01,"[290783.0, 285242.0, 293718.0, 295804.0, 29458..."
177,T178,1973-01-01,"[11693.0, 11702.0, 11703.0, 11557.0, 11951.0, ..."
178,T179,1973-01-01,"[8438.0, 8689.0, 8590.0, 8763.0, 8710.0, 8837...."
179,T180,1975-01-01,"[55.91, 54.7, 55.3, 55.75, 55.46, 55.37, 53.82..."


In [34]:
frequency

'yearly'

In [35]:
row = loaded_data.head(1)
row

Unnamed: 0,series_name,start_timestamp,series_value
0,T1,1972-01-01,"[3600.0, 7700.0, 12300.0, 30500.0, 47390.0, 57..."


In [51]:
from tqdm import tqdm

parsed_df = pd.DataFrame()

#freq = frequency
if frequency == 'yearly':
    freq = 'YS' #year start
elif frequency == 'quarterly':
    freq = 'QS' #quarter start
elif frequency == 'monthly':
    freq = 'MS'


for index,row in tqdm(loaded_data.iterrows()):
    
    name = row.series_name
    #print(name)
    values = row.series_value.tolist()
    length = len(values)
    start = row.start_timestamp
    #print(length)
    ds = pd.date_range(start, periods=length, freq=freq)
    
    series_df = pd.DataFrame({'unique_id':name,'ds':ds, 'values':values})
    parsed_df = pd.concat([parsed_df, series_df], axis=0)
    
parsed_df.head()
    
    
    
    
    




181it [00:00, 1246.58it/s]


Unnamed: 0,unique_id,ds,values
0,T1,1972-01-01,3600.0
1,T1,1973-01-01,7700.0
2,T1,1974-01-01,12300.0
3,T1,1975-01-01,30500.0
4,T1,1976-01-01,47390.0


In [84]:
def parse_monash_df(file):
    
    loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe(file)

    parsed_df = pd.DataFrame()

    #freq = frequency
    if frequency == 'yearly':
        freq = 'YS' #year start
    elif frequency == 'quarterly':
        freq = 'QS' #quarter start
    elif frequency == 'monthly':
        freq = 'MS'
    elif frequency == 'daily':
        freq = 'D'


    for index,row in tqdm(loaded_data.iterrows()):
        
        name = row.series_name
        #print(name)
        values = row.series_value.tolist()
        length = len(values)
        start = row.start_timestamp
        #print(length)
        ds = pd.date_range(start, periods=length, freq=freq)
        
        series_df = pd.DataFrame({'unique_id':name,'ds':ds, 'values':values})
        parsed_df = pd.concat([parsed_df, series_df], axis=0)
        
    return parsed_df

#parse_monash_df("m1_yearly_dataset.tsf").head()