## Import a tsf file

In [1]:
# Import all the libraries
import os
import aeon
from aeon.datasets import load_from_tsf_file, write_to_tsfile
import pandas as pd

In [2]:
# Locate the example datasets shipped inside the installed aeon package.
# The path is assembled from components (instead of hard-coded "\\" separators)
# so the cell also runs on non-Windows systems; the trailing "" keeps a
# trailing separator on DATA_PATH for any code that concatenates onto it.
DATA_PATH = os.path.join(os.path.dirname(aeon.__file__), "datasets", "data", "")
data, metadata = load_from_tsf_file(
    os.path.join(DATA_PATH, "m1_yearly_dataset", "m1_yearly_dataset.tsf")
)

In [3]:
# Keep only the first five rows (one series per row) and display them.
# Note: this rebinds `data`, so the full dataset is no longer available below.
data = data.head(5)
data

Unnamed: 0,series_name,start_timestamp,series_value
0,T1,1972-01-01,"[3600.0, 7700.0, 12300.0, 30500.0, 47390.0, 57..."
1,T2,1974-01-01,"[12654.0, 22879.0, 34164.0, 49524.0, 64761.0, ..."
2,T3,1974-01-01,"[2142.0, 12935.0, 19130.0, 30500.0, 48177.0, 5..."
3,T4,1974-01-01,"[5774.0, 7650.0, 9271.0, 21447.0, 28998.0, 409..."
4,T5,1976-01-01,"[432312.0, 569011.0, 862673.0, 1155640.0, 1439..."


In [4]:
# Inspect the type of the first row's value in column position 1 (start_timestamp).
type(data.iloc[0,1])

pandas._libs.tslibs.timestamps.Timestamp

In [5]:
# For each of the first four series, build a year-start date index beginning at
# its start_timestamp, then wrap its values in a Series carrying that index.
indices = [
    pd.date_range(
        start=data['start_timestamp'][row],
        periods=len(data['series_value'][row]),
        freq='YS',
    ).tolist()
    for row in range(0, 4)
]
datas = [
    pd.Series(data['series_value'][row], index=indices[row])
    for row in range(0, 4)
]

In [6]:
# Align the four series on their shared date index (one column per series),
# then move the dates out of the index into a regular 'index' column.
series_by_name = {
    'T1': datas[0],
    'T2': datas[1],
    'T3': datas[2],
    'T4': datas[3],
}
df = pd.DataFrame(series_by_name).reset_index(drop=False)
df.head()

Unnamed: 0,index,T1,T2,T3,T4
0,1972-01-01,3600.0,,,
1,1973-01-01,7700.0,,,
2,1974-01-01,12300.0,12654.0,2142.0,5774.0
3,1975-01-01,30500.0,22879.0,12935.0,7650.0
4,1976-01-01,47390.0,34164.0,19130.0,9271.0


In [7]:
# Render the date column as 'YYYY-MM-DD' strings, then show the first cell.
df['index'] = pd.to_datetime(df['index']).dt.strftime('%Y-%m-%d')
df.iloc[0, 0]

'1972-01-01'

In [8]:
# (rows, columns) of the combined frame: the 'index' date column plus T1..T4.
df.shape

(31, 5)

In [9]:
# Show the metadata dictionary returned by load_from_tsf_file for the loaded dataset.
metadata

{'frequency': 'yearly',
 'forecast_horizon': 6,
 'contain_missing_values': False,
 'contain_equal_length': False}

## The purpose is to convert the dataframe df into a tsf file.

write_to_tsf_file():

    _write_dataframe_to_tsf_file():
    
        _write_header_tsf()

In [10]:
import os
import textwrap
import numpy as np
import pandas as pd

In [11]:
def write_to_tsf_file(
    X, path, y=None, problem_name="sample_data.tsf", header=None, horizon=0
):
    """Write a pandas DataFrame to a text file in .tsf format.

    Validates the input type, normalises the file name and delegates the
    actual writing to ``_write_dataframe_to_tsf_file``.

    Parameters
    ----------
    X : pd.DataFrame
        Data to write.
    path : str
        Directory to write the file to.
    y : None or indexable, default=None
        Optional response values; forwarded unchanged to
        ``_write_dataframe_to_tsf_file``.
    problem_name : str, default="sample_data.tsf"
        Name of the output file. ``".tsf"`` is appended when the name does not
        already end with it; the file is written to ``<path>/<problem_name>``.
    header : str, default=None
        Optional comment text; forwarded as the ``comment`` argument.
    horizon : int, default=0
        Forecast horizon; forwarded unchanged.

    Raises
    ------
    TypeError
        If ``X`` is not a ``pd.DataFrame``.
    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError(
            f" Wrong input data type {type(X)} convert to pd.DataFrame"
        )

    # Test for the full ".tsf" suffix rather than splitting on ".": a bare
    # name such as "tsf" would otherwise be treated as already having the
    # extension.
    if not problem_name.endswith(".tsf"):
        problem_name = problem_name + ".tsf"

    _write_dataframe_to_tsf_file(
        X,
        path,
        problem_name,
        y=y,
        horizon=horizon,
        comment=header,
    )

In [12]:
def _write_dataframe_to_tsf_file(
    X, path, problem_name="sample_data", y=None, horizon=0, comment=None
):
    """Write the columns of a DataFrame as the data lines of a .tsf file.

    The header is produced by ``_write_header_tsf``. Afterwards one line is
    written per column of ``X`` in the form ``<column name>:<v1>,<v2>,...``,
    with missing cells written as ``?``.

    Parameters
    ----------
    X : pd.DataFrame
        Data to write; each column becomes one data line.
    path : str
        Directory to write the file to.
    problem_name : str, default="sample_data"
        Output file name; ``".tsf"`` is appended when missing.
    y : None or indexable, default=None
        Optional response values. When given, the value at the column's
        position is appended as a final comma-separated field of that
        column's line.
    horizon : int, default=0
        Value written to the ``@horizon`` header line.
    comment : str, default=None
        Optional text written as ``#`` comment lines at the top of the file.

    Raises
    ------
    ValueError
        If ``X`` is not a ``pd.DataFrame``.
    """
    # ensure data provided is a dataframe
    if not isinstance(X, pd.DataFrame):
        raise ValueError(f"Data provided must be a DataFrame, passed a {type(X)}")
    if not problem_name.endswith(".tsf"):
        problem_name = problem_name + ".tsf"

    # A single null scan feeds both header flags. "No missing cell" doubles as
    # the equal-length indicator, presumably because shorter series appear as
    # NaN padding once aligned in one frame.
    has_missing = bool(X.isnull().values.any())

    # One attribute type per column; the branches are mutually exclusive so a
    # numeric column is no longer overwritten with 'string'.
    attribute_types = {
        column: _tsf_attribute_type(dtype) for column, dtype in X.dtypes.items()
    }

    # NOTE(review): the header declares one @attribute per column while each
    # data line below carries a single "<name>:<values>" pair. The
    # load_from_tsf_file call at the end of this notebook rejects the result
    # with "Missing attributes/values in series." - presumably because of
    # this mismatch; confirm against the loader's expected line layout.
    file = _write_header_tsf(
        path,
        problem_name,
        attribute=attribute_types,
        equal_length=not has_missing,
        frequency=calculate_frequency(X),
        horizon=horizon,
        missing=has_missing,
        comment=comment,
        suffix=None,
    )
    try:
        for position, column_name in enumerate(X.columns):
            fields = [
                "?" if pd.isna(value) else str(value)
                for value in X.iloc[:, position]
            ]
            if y is not None:
                # Index y by the column (series) position; the row index of
                # the last cell is unrelated to which line is being written.
                label = y.iloc[position] if hasattr(y, "iloc") else y[position]
                fields.append(str(label))
            # Joining avoids the trailing comma, which would otherwise leave
            # an empty final value on every line.
            file.write(f"{column_name}:{','.join(fields)}\n")
    finally:
        # Always release the handle, even if a write fails part-way.
        file.close()


def _tsf_attribute_type(dtype):
    """Map a column dtype to the attribute type name written to the header."""
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return "date"
    if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
        return "numeric"
    return "string"


In [13]:
def calculate_frequency(df):
    """Infer a coarse frequency label from the spacing of ``df``'s index.

    The index is parsed with ``pd.to_datetime`` and the median gap between
    consecutive entries is bucketed into a label. ``df`` itself is not
    modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose index holds the timestamps. The index should be
        datetime-like: ``pd.to_datetime`` reads a plain integer index as
        nanoseconds since the epoch, which would always yield "daily".

    Returns
    -------
    str
        One of "daily", "weekly", "monthly", "yearly" or "other". "other" is
        returned when the median gap exceeds 366 days or when fewer than two
        timestamps are available.
    """
    # Work on a detached Series so no helper column is added to (or dropped
    # from) the caller's frame.
    timestamps = pd.Series(pd.to_datetime(df.index))
    median_diff = timestamps.diff().dropna().median()

    # Fewer than two timestamps: there is no gap to classify.
    if pd.isna(median_diff):
        return "other"

    # Upper bounds use the longest calendar unit (31-day month, 366-day leap
    # year) so genuine monthly/yearly data is not pushed into the next bucket.
    if median_diff <= pd.Timedelta(days=1):
        return "daily"
    if median_diff <= pd.Timedelta(weeks=1):
        return "weekly"
    if median_diff <= pd.Timedelta(days=31):
        return "monthly"
    if median_diff <= pd.Timedelta(days=366):
        return "yearly"
    return "other"  # More granular frequencies can be added as needed

In [14]:
def _write_header_tsf(
    path,
    problem_name,
    attribute={'col':'data_type'},
    equal_length=True,
    frequency="weekly",
    horizon=0,
    missing=False,
    comment=None,
    suffix=None,
):
    if not os.path.exists(path):
        os.makedirs(path)
    if suffix is not None:
        load_path = load_path + suffix   
    # See if passed file name contains .tsf extension or not
    split = problem_name.split(".")
    if split[-1] != "tsf":
        problem_name = problem_name + ".tsf"       
    load_path = f"{path}/{problem_name}"
    
    file = open(load_path, "w")

    if comment is not None:
        file.write("\n# ".join(textwrap.wrap("# " + comment)))
        file.write("\n")
        
    file.write(f"@relation {str(split[0]).lower()}\n")
    # Write attribute metadata for each column
    if attribute is not None:
        for attr in attribute:
            file.write(f"@attribute {str(attr)} {str(attribute[attr])}\n")
    file.write(f"@frequency {str(frequency).lower()}\n")
    file.write(f"@horizon {str(horizon).lower()}\n")
    file.write(f"@missing {str(missing).lower()}\n")
    file.write(f"@equallength {str(equal_length).lower()}\n")
    file.write("@data\n")

    return file


In [15]:
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences such as "\P" and "\A".
# NOTE(review): machine-specific absolute path; consider a configurable output directory.
write_to_tsf_file(df, r"D:\Python\Project\Aeon", y=None, problem_name="sample_data", header=None, horizon=0)

In [16]:
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences such as "\P", "\A" and "\s".
# Read the file written by the previous cell back in to check the round trip.
data, metadata = load_from_tsf_file(r"D:\Python\Project\Aeon\sample_data.tsf")

Exception: Missing attributes/values in series.