In [454]:
import numpy as np

import pandas as pd

from gtda.time_series import SingleTakensEmbedding     # Implement the sliding window and the time delay embedding method.

from ripser import ripser                              # The Ripser package is used to compute Persistent Homology.

import matplotlib.pyplot as plt

import itertools                                       # This package is primarily used to handle iterable objects.
                                                                                 # (Used here mainly to flatten a 2-D nested list into a 1-D list.)

## （1）Sliding Window Method

In [456]:
# sliding shift = 1 

# signal is a 1-dimensional array of time series data.

def sliding_windows(signal, window_size):
    """Cut a time series into sliding windows of ``window_size`` samples.

    The windows are produced by ``SingleTakensEmbedding`` configured with
    ``parameters_type="fixed"`` and ``dimension=window_size``; every other
    option (including the shift between windows, noted as 1 in this notebook)
    is left at the library default.

    Parameters
    ----------
    signal : 1-dimensional array of time series data.
    window_size : int
        Number of samples in each window.

    Returns
    -------
    The result of ``fit_transform`` on ``signal`` (presumably one window per
    row -- confirm against the giotto-tda documentation).
    """
    embedder = SingleTakensEmbedding(parameters_type="fixed", dimension=window_size)
    return embedder.fit_transform(signal)

## （2）TDA and Feature Extraction

### <font color = SteelBlue >Takens Embedding and Persistent Homology（時間延遲嵌入與持久同調）</font>

In [459]:
#                                                                                                                                   #
# The takens_embedding method transforms a time series signal into a point cloud.                                                   #
#                                                                                                                                   #
# The persistent_homology method is used to compute Persistent Homology and transforms a point cloud into a persistence diagram.    #
#                                                                                                                                   #
#####################################################################################################################################

def takens_embedding(signal, embedding_dimension):
    """Turn a time series signal into a point cloud by time-delay embedding.

    Uses ``SingleTakensEmbedding`` with ``parameters_type="fixed"`` and
    ``dimension=embedding_dimension``; the remaining options keep their
    library defaults.

    Parameters
    ----------
    signal : 1-dimensional array of time series data.
    embedding_dimension : int
        Dimension of the embedded points.

    Returns
    -------
    The point cloud returned by ``fit_transform``.
    """
    takens = SingleTakensEmbedding(parameters_type="fixed", dimension=embedding_dimension)
    return takens.fit_transform(signal)

def persistent_homology(point_cloud):
    """Compute the persistence diagrams of a point cloud with Ripser.

    Parameters
    ----------
    point_cloud : point cloud passed unchanged to ``ripser`` (default options).

    Returns
    -------
    The ``'dgms'`` entry of the Ripser result. Callers in this notebook index
    it with ``[0]`` for H0 and ``[1]`` for H1.
    """
    ripser_result = ripser(point_cloud)
    return ripser_result['dgms']

### <font color = SteelBlue >Feature Extraction 1：get_diagram_lives（計算持續圖所有點的壽命）</font>

In [461]:
# Compute the life(death - birth) of all points in the persistence diagram. 

def get_diagram_lives(diagram):
    """Return the lifetime (death - birth) of every point in a persistence diagram.

    Parameters
    ----------
    diagram : 2-D array whose column 0 holds births and column 1 holds deaths.

    Returns
    -------
    numpy.ndarray
        One lifetime per diagram point, or ``np.array([0])`` when the diagram
        has no points, so downstream ``max``/``sum`` calls still work.
    """
    if len(diagram) > 0:
        births = diagram[:, 0]
        deaths = diagram[:, 1]
        return deaths - births

    return np.array([0])

### <font color = SteelBlue >Feature Extraction 2：get_diagram_entropy（計算持續熵）</font>

In [463]:
# Compute the persistence entropy.

def get_diagram_entropy(lives):
    """Compute the persistence entropy of a set of lifetimes.

    The lifetimes are normalised into a probability distribution ``p`` and the
    Shannon entropy ``-sum(p * log(p))`` (natural logarithm) is returned.

    Lifetimes that are not strictly positive are ignored: a zero lifetime
    contributes nothing to the entropy (``p * log(p) -> 0`` as ``p -> 0``),
    whereas evaluating ``0 * log(0)`` directly would yield ``nan`` and poison
    the whole feature vector.

    Parameters
    ----------
    lives : array-like of lifetimes (death - birth).

    Returns
    -------
    float
        The persistence entropy, or ``0.`` when there is no positive lifetime
        (including an empty input).
    """
    lives = np.asarray(lives, dtype = float)

    positive_lives = lives[lives > 0]

    if positive_lives.size == 0:
        return 0.

    probabilities = positive_lives / positive_lives.sum()

    entropy = -np.sum(probabilities * np.log(probabilities))

    # Adding 0.0 turns a possible -0.0 (single-point diagram) into +0.0.
    return float(entropy) + 0.0

### <font color = SteelBlue >Feature Extraction 3：get_diagram_features（ 將全部三個特徵整理在一組陣列）</font>

In [465]:
# The get_diagram_features method organizes all three features into a single array.

def get_diagram_features(diagram):
    """Summarise one persistence diagram as a list of three features.

    Points with an infinite coordinate (present in H0) are discarded first.
    The features, in order, are:

    1. ``max(lives) / sqrt(2)`` -- labelled "bottleneck distance" in this notebook.
    2. ``sum(lives) / sqrt(2)`` -- labelled "wasserstein distance" in this notebook.
    3. the persistence entropy from ``get_diagram_entropy``.

    Parameters
    ----------
    diagram : 2-D array of (birth, death) rows.

    Returns
    -------
    list
        ``[bottleneck_distance, wasserstein_distance, persistence_entropy]``.
    """
    # Keep only the rows in which neither birth nor death is infinite.
    is_finite_row = ~np.isinf(diagram).any(axis = 1)
    lives = get_diagram_lives(diagram[is_finite_row])

    root_two = np.sqrt(2)

    return [
        max(lives) / root_two,         # feature 1 (bottleneck distance)
        sum(lives) / root_two,         # feature 2 (wasserstein distance)
        get_diagram_entropy(lives),    # feature 3 (persistence entropy)
    ]

## （3）UCR Time Series Dataset

In [467]:
# This function loads the pre-split training and testing sets and converts them into DataFrames.

def beef_data():
    """Load the pre-split Beef dataset as ``(df_train, df_test)``.

    Reads ``UCR_datasets/Beef_TRAIN.txt`` and ``UCR_datasets/Beef_TEST.txt``
    (relative to the current working directory). Each file is parsed as a
    header-less, whitespace-separated table whose first column is the class
    label and whose remaining columns are the time series values.

    Returns
    -------
    df_train, df_test : pandas.DataFrame
        One row per time series with two columns: ``'signal'`` (the series as
        a list of floats) and ``'target'`` (the label cast to int).
    """
    def _load_split(path):
        # Parse one split file and pack each series into a single list-valued cell.
        # The raw string keeps '\s' from being read as an invalid string escape.
        table = pd.read_csv(path, sep = r'\s+', header = None)
        targets = table.iloc[:, 0].astype(int)
        signals = table.iloc[:, 1:].astype(float)
        return pd.DataFrame({'signal': signals.values.tolist(), 'target': targets})

    df_train = _load_split('UCR_datasets/Beef_TRAIN.txt')
    df_test = _load_split('UCR_datasets/Beef_TEST.txt')

    return df_train, df_test

In [468]:
# This function loads the pre-split training and testing sets and converts them into DataFrames.

def coffee_data():
    """Load the pre-split Coffee dataset as ``(df_train, df_test)``.

    Reads ``UCR_datasets/Coffee_TRAIN.txt`` and ``UCR_datasets/Coffee_TEST.txt``
    (relative to the current working directory). Each file is parsed as a
    header-less, whitespace-separated table whose first column is the class
    label and whose remaining columns are the time series values.

    Returns
    -------
    df_train, df_test : pandas.DataFrame
        One row per time series with two columns: ``'signal'`` (the series as
        a list of floats) and ``'target'`` (the label cast to int).
    """
    def _load_split(path):
        # Parse one split file and pack each series into a single list-valued cell.
        # The raw string keeps '\s' from being read as an invalid string escape.
        table = pd.read_csv(path, sep = r'\s+', header = None)
        targets = table.iloc[:, 0].astype(int)
        signals = table.iloc[:, 1:].astype(float)
        return pd.DataFrame({'signal': signals.values.tolist(), 'target': targets})

    df_train = _load_split('UCR_datasets/Coffee_TRAIN.txt')
    df_test = _load_split('UCR_datasets/Coffee_TEST.txt')

    return df_train, df_test

In [469]:
# This function loads the pre-split training and testing sets and converts them into DataFrames.

def ham_data():
    """Load the pre-split Ham dataset as ``(df_train, df_test)``.

    Reads ``UCR_datasets/Ham_TRAIN.txt`` and ``UCR_datasets/Ham_TEST.txt``
    (relative to the current working directory). Each file is parsed as a
    header-less, whitespace-separated table whose first column is the class
    label and whose remaining columns are the time series values.

    Returns
    -------
    df_train, df_test : pandas.DataFrame
        One row per time series with two columns: ``'signal'`` (the series as
        a list of floats) and ``'target'`` (the label cast to int).
    """
    def _load_split(path):
        # Parse one split file and pack each series into a single list-valued cell.
        # The raw string keeps '\s' from being read as an invalid string escape.
        table = pd.read_csv(path, sep = r'\s+', header = None)
        targets = table.iloc[:, 0].astype(int)
        signals = table.iloc[:, 1:].astype(float)
        return pd.DataFrame({'signal': signals.values.tolist(), 'target': targets})

    df_train = _load_split('UCR_datasets/Ham_TRAIN.txt')
    df_test = _load_split('UCR_datasets/Ham_TEST.txt')

    return df_train, df_test

In [470]:
# This function loads the pre-split training and testing sets and converts them into DataFrames.

def meat_data():
    """Load the pre-split Meat dataset as ``(df_train, df_test)``.

    Reads ``UCR_datasets/Meat_TRAIN.txt`` and ``UCR_datasets/Meat_TEST.txt``
    (relative to the current working directory). Each file is parsed as a
    header-less, whitespace-separated table whose first column is the class
    label and whose remaining columns are the time series values.

    Returns
    -------
    df_train, df_test : pandas.DataFrame
        One row per time series with two columns: ``'signal'`` (the series as
        a list of floats) and ``'target'`` (the label cast to int).
    """
    def _load_split(path):
        # Parse one split file and pack each series into a single list-valued cell.
        # The raw string keeps '\s' from being read as an invalid string escape.
        table = pd.read_csv(path, sep = r'\s+', header = None)
        targets = table.iloc[:, 0].astype(int)
        signals = table.iloc[:, 1:].astype(float)
        return pd.DataFrame({'signal': signals.values.tolist(), 'target': targets})

    df_train = _load_split('UCR_datasets/Meat_TRAIN.txt')
    df_test = _load_split('UCR_datasets/Meat_TEST.txt')

    return df_train, df_test

In [471]:
# This function loads the pre-split training and testing sets and converts them into DataFrames.

def oliveoil_data():
    """Load the pre-split OliveOil dataset as ``(df_train, df_test)``.

    Reads ``UCR_datasets/OliveOil_TRAIN.txt`` and
    ``UCR_datasets/OliveOil_TEST.txt`` (relative to the current working
    directory). Each file is parsed as a header-less, whitespace-separated
    table whose first column is the class label and whose remaining columns
    are the time series values.

    Returns
    -------
    df_train, df_test : pandas.DataFrame
        One row per time series with two columns: ``'signal'`` (the series as
        a list of floats) and ``'target'`` (the label cast to int).
    """
    def _load_split(path):
        # Parse one split file and pack each series into a single list-valued cell.
        # The raw string keeps '\s' from being read as an invalid string escape.
        table = pd.read_csv(path, sep = r'\s+', header = None)
        targets = table.iloc[:, 0].astype(int)
        signals = table.iloc[:, 1:].astype(float)
        return pd.DataFrame({'signal': signals.values.tolist(), 'target': targets})

    df_train = _load_split('UCR_datasets/OliveOil_TRAIN.txt')
    df_test = _load_split('UCR_datasets/OliveOil_TEST.txt')

    return df_train, df_test

In [472]:
# This function loads the pre-split training and testing sets and converts them into DataFrames.

def strawberry_data():
    """Load the pre-split Strawberry dataset as ``(df_train, df_test)``.

    Reads ``UCR_datasets/Strawberry_TRAIN.txt`` and
    ``UCR_datasets/Strawberry_TEST.txt`` (relative to the current working
    directory). Each file is parsed as a header-less, whitespace-separated
    table whose first column is the class label and whose remaining columns
    are the time series values.

    Returns
    -------
    df_train, df_test : pandas.DataFrame
        One row per time series with two columns: ``'signal'`` (the series as
        a list of floats) and ``'target'`` (the label cast to int).
    """
    def _load_split(path):
        # Parse one split file and pack each series into a single list-valued cell.
        # The raw string keeps '\s' from being read as an invalid string escape.
        table = pd.read_csv(path, sep = r'\s+', header = None)
        targets = table.iloc[:, 0].astype(int)
        signals = table.iloc[:, 1:].astype(float)
        return pd.DataFrame({'signal': signals.values.tolist(), 'target': targets})

    df_train = _load_split('UCR_datasets/Strawberry_TRAIN.txt')
    df_test = _load_split('UCR_datasets/Strawberry_TEST.txt')

    return df_train, df_test

In [473]:
# This function loads the pre-split training and testing sets and converts them into DataFrames.

def wine_data():
    """Load the pre-split Wine dataset as ``(df_train, df_test)``.

    Reads ``UCR_datasets/Wine_TRAIN.txt`` and ``UCR_datasets/Wine_TEST.txt``
    (relative to the current working directory). Each file is parsed as a
    header-less, whitespace-separated table whose first column is the class
    label and whose remaining columns are the time series values.

    Returns
    -------
    df_train, df_test : pandas.DataFrame
        One row per time series with two columns: ``'signal'`` (the series as
        a list of floats) and ``'target'`` (the label cast to int).
    """
    def _load_split(path):
        # Parse one split file and pack each series into a single list-valued cell.
        # The raw string keeps '\s' from being read as an invalid string escape.
        table = pd.read_csv(path, sep = r'\s+', header = None)
        targets = table.iloc[:, 0].astype(int)
        signals = table.iloc[:, 1:].astype(float)
        return pd.DataFrame({'signal': signals.values.tolist(), 'target': targets})

    df_train = _load_split('UCR_datasets/Wine_TRAIN.txt')
    df_test = _load_split('UCR_datasets/Wine_TEST.txt')

    return df_train, df_test

## （4）Machine Learning 

In [475]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [476]:
# For Beef

def beef_classification_model(train_features, train_target, test_features, test_target):
    """Train and evaluate the Beef classifier; return the test accuracy in percent.

    The model is a pipeline of ``StandardScaler`` followed by a
    ``BaggingClassifier`` of ``LogisticRegression(max_iter=3000, C=0.01)``.
    ``GridSearchCV`` (5-fold, accuracy scoring, all cores) is run over a
    single-point grid, then the refit model predicts the test set.

    Parameters
    ----------
    train_features, test_features : feature matrices (the experiment
        functions in this notebook pass 2-D NumPy arrays).
    train_target, test_target : class labels for the corresponding rows.

    Returns
    -------
    Test accuracy multiplied by 100 and rounded to 3 decimals. The best
    parameters, the best cross-validation accuracy and the raw test accuracy
    are printed as a side effect.
    """
    bagged_model = BaggingClassifier(estimator = LogisticRegression(max_iter = 3000, C = 0.01))
    model = Pipeline([('scaler', StandardScaler()), ('bc', bagged_model)])

    search_space = {
        'bc__n_estimators': [50],
        'bc__random_state': [14],
    }

    search = GridSearchCV(model, search_space, cv = 5, scoring = 'accuracy', verbose = 1, n_jobs = -1)
    search.fit(train_features, train_target)

    # Report the selected parameters and the best cross-validation accuracy.
    print(f"最佳參數: {search.best_params_}")
    print(f"最佳交叉驗證準確率: {search.best_score_}")

    test_accuracy = accuracy_score(test_target, search.predict(test_features))
    print('bc Test accuracy:', test_accuracy)

    return round(100 * test_accuracy, 3)

# For Coffee

def coffee_classification_model(train_features, train_target, test_features, test_target):
    """Train and evaluate the Coffee classifier; return the test accuracy in percent.

    The model is a pipeline of ``StandardScaler`` followed by a
    ``BaggingClassifier`` of ``LogisticRegression(max_iter=1800)``.
    ``GridSearchCV`` (3-fold, accuracy scoring, all cores) tunes the number of
    estimators and the sample fraction, then the refit model predicts the
    test set.

    Parameters
    ----------
    train_features, test_features : feature matrices (the experiment
        functions in this notebook pass 2-D NumPy arrays).
    train_target, test_target : class labels for the corresponding rows.

    Returns
    -------
    Test accuracy multiplied by 100 and rounded to 3 decimals. The best
    parameters, the best cross-validation accuracy and the raw test accuracy
    are printed as a side effect.
    """
    bagged_model = BaggingClassifier(estimator = LogisticRegression(max_iter = 1800))
    model = Pipeline([('scaler', StandardScaler()), ('bc', bagged_model)])

    search_space = {
        'bc__n_estimators': [100, 200],
        'bc__max_samples': [1.0, 0.8, 0.6],
        'bc__random_state': [14],
    }

    search = GridSearchCV(model, search_space, cv = 3, scoring = 'accuracy', verbose = 1, n_jobs = -1)
    search.fit(train_features, train_target)

    # Report the selected parameters and the best cross-validation accuracy.
    print(f"最佳參數: {search.best_params_}")
    print(f"最佳交叉驗證準確率: {search.best_score_}")

    test_accuracy = accuracy_score(test_target, search.predict(test_features))
    print('bc Test accuracy:', test_accuracy)

    return round(100 * test_accuracy, 3)

# For Ham

def ham_classification_model(train_features, train_target, test_features, test_target):
    """Train and evaluate the Ham classifier; return the test accuracy in percent.

    The model is a pipeline of ``StandardScaler`` followed by a
    ``BaggingClassifier`` of ``LogisticRegression(max_iter=2000, solver='lbfgs')``.
    ``GridSearchCV`` (3-fold, accuracy scoring, all cores) tunes the number of
    estimators and the sample/feature fractions, then the refit model predicts
    the test set.

    Parameters
    ----------
    train_features, test_features : feature matrices accepted by scikit-learn.
    train_target, test_target : class labels for the corresponding rows.

    Returns
    -------
    Test accuracy multiplied by 100 and rounded to 3 decimals. The best
    parameters, the best cross-validation accuracy and the raw test accuracy
    are printed as a side effect.
    """
    bagged_model = BaggingClassifier(estimator = LogisticRegression(max_iter = 2000, solver = 'lbfgs'))
    model = Pipeline([('scaler', StandardScaler()), ('bc', bagged_model)])

    search_space = {
        'bc__n_estimators': [10, 20, 30, 50, 100],
        'bc__max_samples': [1.0, 0.8, 0.6],
        'bc__max_features': [1.0, 0.8],
        'bc__random_state': [45],
    }

    search = GridSearchCV(model, search_space, cv = 3, scoring = 'accuracy', verbose = 1, n_jobs = -1)
    search.fit(train_features, train_target)

    # Report the selected parameters and the best cross-validation accuracy.
    print(f"最佳參數: {search.best_params_}")
    print(f"最佳交叉驗證準確率: {search.best_score_}")

    test_accuracy = accuracy_score(test_target, search.predict(test_features))
    print('bc Test accuracy:', test_accuracy)

    return round(100 * test_accuracy, 3)

# For Meat

def meat_classification_model(train_features, train_target, test_features, test_target):
    """Train and evaluate the Meat classifier; return the test accuracy in percent.

    The model is a pipeline of ``StandardScaler`` followed by a
    ``BaggingClassifier`` of
    ``LogisticRegression(max_iter=2000, C=0.01, solver='sag')``.
    ``GridSearchCV`` (3-fold, accuracy scoring, all cores) tunes the number of
    estimators and the sample/feature fractions, then the refit model predicts
    the test set.

    Parameters
    ----------
    train_features, test_features : feature matrices accepted by scikit-learn.
    train_target, test_target : class labels for the corresponding rows.

    Returns
    -------
    Test accuracy multiplied by 100 and rounded to 3 decimals. The best
    parameters, the best cross-validation accuracy and the raw test accuracy
    are printed as a side effect.
    """
    bagged_model = BaggingClassifier(estimator = LogisticRegression(max_iter = 2000, C = 0.01, solver = 'sag'))
    model = Pipeline([('scaler', StandardScaler()), ('bc', bagged_model)])

    search_space = {
        'bc__n_estimators': [10, 20, 30, 50, 100],
        'bc__max_samples': [1.0, 0.8, 0.6],
        'bc__max_features': [1.0, 0.8],
        'bc__random_state': [42],
    }

    search = GridSearchCV(model, search_space, cv = 3, scoring = 'accuracy', verbose = 1, n_jobs = -1)
    search.fit(train_features, train_target)

    # Report the selected parameters and the best cross-validation accuracy.
    print(f"最佳參數: {search.best_params_}")
    print(f"最佳交叉驗證準確率: {search.best_score_}")

    test_accuracy = accuracy_score(test_target, search.predict(test_features))
    print('bc Test accuracy:', test_accuracy)

    return round(100 * test_accuracy, 3)

# For Olive oil

def oliveoil_classification_model(train_features, train_target, test_features, test_target):
    """Train and evaluate the OliveOil classifier; return the test accuracy in percent.

    The model is a pipeline of ``StandardScaler`` followed by a
    ``BaggingClassifier`` of
    ``LogisticRegression(max_iter=3500, C=0.01, solver='sag')``.
    ``GridSearchCV`` (3-fold, accuracy scoring, all cores) tunes the number of
    estimators and the sample/feature fractions, then the refit model predicts
    the test set.

    Parameters
    ----------
    train_features, test_features : feature matrices accepted by scikit-learn.
    train_target, test_target : class labels for the corresponding rows.

    Returns
    -------
    Test accuracy multiplied by 100 and rounded to 3 decimals. The best
    parameters, the best cross-validation accuracy and the raw test accuracy
    are printed as a side effect.
    """
    bagged_model = BaggingClassifier(estimator = LogisticRegression(max_iter = 3500, C = 0.01, solver = 'sag'))
    model = Pipeline([('scaler', StandardScaler()), ('bc', bagged_model)])

    search_space = {
        'bc__n_estimators': [10, 20, 30, 50, 100],
        'bc__max_samples': [1.0, 0.8, 0.6],
        'bc__max_features': [1.0, 0.8],
        'bc__random_state': [14],
    }

    search = GridSearchCV(model, search_space, cv = 3, scoring = 'accuracy', verbose = 1, n_jobs = -1)
    search.fit(train_features, train_target)

    # Report the selected parameters and the best cross-validation accuracy.
    print(f"最佳參數: {search.best_params_}")
    print(f"最佳交叉驗證準確率: {search.best_score_}")

    test_accuracy = accuracy_score(test_target, search.predict(test_features))
    print('bc Test accuracy:', test_accuracy)

    return round(100 * test_accuracy, 3)
    
# For Strawberry 

def strawberry_classification_model(train_features, train_target, test_features, test_target):
    """Train and evaluate the Strawberry classifier; return the test accuracy in percent.

    The model is a pipeline of ``StandardScaler`` followed by a
    ``BaggingClassifier`` of 1-nearest-neighbour classifiers
    (``algorithm='ball_tree'``, ``leaf_size=20``, ``p=1``).
    ``GridSearchCV`` (3-fold, accuracy scoring, all cores) tunes the number of
    estimators and the sample/feature fractions, then the refit model predicts
    the test set.

    Parameters
    ----------
    train_features, test_features : feature matrices accepted by scikit-learn.
    train_target, test_target : class labels for the corresponding rows.

    Returns
    -------
    Test accuracy multiplied by 100 and rounded to 3 decimals. The best
    parameters, the best cross-validation accuracy and the raw test accuracy
    are printed as a side effect.
    """
    nearest_neighbour = KNeighborsClassifier(n_neighbors = 1, algorithm = 'ball_tree', leaf_size = 20, p = 1)
    model = Pipeline([('scaler', StandardScaler()), ('bc', BaggingClassifier(estimator = nearest_neighbour))])

    search_space = {
        'bc__n_estimators': [10, 20, 30, 50, 100],
        'bc__max_samples': [1.0, 0.8, 0.6],
        'bc__max_features': [1.0, 0.8],
        'bc__random_state': [14],
    }

    search = GridSearchCV(model, search_space, cv = 3, scoring = 'accuracy', verbose = 1, n_jobs = -1)
    search.fit(train_features, train_target)

    # Report the selected parameters and the best cross-validation accuracy.
    print(f"最佳參數: {search.best_params_}")
    print(f"最佳交叉驗證準確率: {search.best_score_}")

    test_accuracy = accuracy_score(test_target, search.predict(test_features))
    print('bc Test accuracy:', test_accuracy)

    return round(100 * test_accuracy, 3)

# For Wine

def wine_classification_model(train_features, train_target, test_features, test_target):
    """Train and evaluate the Wine classifier; return the test accuracy in percent.

    The model is a pipeline of ``StandardScaler`` followed by a
    ``BaggingClassifier`` of ``LogisticRegression(max_iter=1800)``.
    ``GridSearchCV`` (3-fold, accuracy scoring, all cores) tunes the number of
    estimators and the sample fraction, then the refit model predicts the
    test set.

    Parameters
    ----------
    train_features, test_features : feature matrices accepted by scikit-learn.
    train_target, test_target : class labels for the corresponding rows.

    Returns
    -------
    Test accuracy multiplied by 100 and rounded to 3 decimals. The best
    parameters, the best cross-validation accuracy and the raw test accuracy
    are printed as a side effect.
    """
    bagged_model = BaggingClassifier(estimator = LogisticRegression(max_iter = 1800))
    model = Pipeline([('scaler', StandardScaler()), ('bc', bagged_model)])

    search_space = {
        'bc__n_estimators': [10, 30, 50, 100],
        'bc__max_samples': [1.0, 0.8, 0.6],
        'bc__random_state': [3],
    }

    search = GridSearchCV(model, search_space, cv = 3, scoring = 'accuracy', verbose = 1, n_jobs = -1)
    search.fit(train_features, train_target)

    # Report the selected parameters and the best cross-validation accuracy.
    print(f"最佳參數: {search.best_params_}")
    print(f"最佳交叉驗證準確率: {search.best_score_}")

    test_accuracy = accuracy_score(test_target, search.predict(test_features))
    print('bc Test accuracy:', test_accuracy)

    return round(100 * test_accuracy, 3)


## （5）Convert Series to DataFrame：

In [478]:
# convert pd.Series([[1,2,3],[4,5,6]]) to pd.DataFrame([[1,2,3],[4,5,6]])

def pd_series_to_dataframe(series):
    """Expand a Series of equal-length sequences into a DataFrame.

    Example: ``pd.Series([[1, 2, 3], [4, 5, 6]])`` becomes
    ``pd.DataFrame([[1, 2, 3], [4, 5, 6]])``.

    Parameters
    ----------
    series : pandas.Series whose values are sequences.

    Returns
    -------
    pandas.DataFrame
        One row per Series entry (labelled with the Series index) and one
        column per position inside the sequences.
    """
    # Key every sequence by its index label: the frame built from this mapping
    # has one column per label, so it is transposed to get one row per label.
    sequences_by_label = {label: values for label, values in zip(series.index, series.values)}

    return pd.DataFrame.from_dict(sequences_by_label).transpose()

## （6）Main Experiment（研究主體）

In [480]:
# beef main experiment

def beef_experiment(embedding_dimension = 2, window_sizes = None):
    """Run the Beef experiment for several sliding-window sizes.

    For every window size, each time series is cut into sliding windows, each
    window is time-delay embedded into a point cloud, the point cloud is
    turned into persistence diagrams, and three features are extracted from
    both the H0 and the H1 diagram. The features of all windows of a series
    are concatenated (all H0 features first, then all H1 features) into one
    row, and ``beef_classification_model`` is trained and evaluated on the
    resulting matrices.

    Parameters
    ----------
    embedding_dimension : int, default 2
        Dimension of the time-delay embedding applied to each window.
    window_sizes : iterable of int, optional
        Window sizes to evaluate. Defaults to ``[5, 10, 45, 50, 95, 100]``
        ((5, 10) = small, (45, 50) = medium, (95, 100) = large).

    Returns
    -------
    pandas.DataFrame
        Indexed by window size, with a single column ``'BaggingClassifier'``
        holding the test accuracy (in percent) for that size.
    """
    if window_sizes is None:
        window_sizes = [5, 10, 45, 50, 95, 100]
    else:
        window_sizes = list(window_sizes)

    # Step 1. Load the Beef dataset.
    train_data, test_data = beef_data()

    train_target = train_data['target'].values
    test_target = test_data['target'].values

    def _feature_matrix(signals, size):
        # Build one feature row per signal: all H0 window features, then all H1 window features.
        rows = []
        for signal in signals:
            h0_features = []
            h1_features = []
            # Steps 2-6: sliding windows -> point cloud -> persistence diagrams -> features.
            for window in sliding_windows(signal, size):
                diagrams = persistent_homology(takens_embedding(window, embedding_dimension))
                h0_features.extend(get_diagram_features(diagrams[0]))
                h1_features.extend(get_diagram_features(diagrams[1]))
            rows.append(h0_features + h1_features)
        return np.vstack(rows)

    accuracies = []   # One test accuracy per window size.

    for size in window_sizes:

        train_features = _feature_matrix(train_data['signal'], size)
        test_features = _feature_matrix(test_data['signal'], size)

        # Machine learning on the topological features.
        bc_accuracy = beef_classification_model(train_features, train_target, test_features, test_target)

        accuracies.append(bc_accuracy)

        # Progress message: this window size is finished.
        print('視窗大小 ', size,' 跑完嚕!')

        print()

    # Organize the classification results into a DataFrame indexed by window size.
    return pd.DataFrame({'BaggingClassifier': accuracies}, index = window_sizes)


In [481]:
# coffee main experiment

def coffee_experiment(embedding_dimension = 2, window_sizes = None):
    """Run the Coffee experiment for several sliding-window sizes.

    For every window size, each time series is cut into sliding windows, each
    window is time-delay embedded into a point cloud, the point cloud is
    turned into persistence diagrams, and three features are extracted from
    both the H0 and the H1 diagram. The features of all windows of a series
    are concatenated (all H0 features first, then all H1 features) into one
    row, and ``coffee_classification_model`` is trained and evaluated on the
    resulting matrices.

    Parameters
    ----------
    embedding_dimension : int, default 2
        Dimension of the time-delay embedding applied to each window.
    window_sizes : iterable of int, optional
        Window sizes to evaluate. Defaults to ``[5, 10, 45, 50, 95, 100]``
        ((5, 10) = small, (45, 50) = medium, (95, 100) = large).

    Returns
    -------
    pandas.DataFrame
        Indexed by window size, with a single column ``'BaggingClassifier'``
        holding the test accuracy (in percent) for that size.
    """
    if window_sizes is None:
        window_sizes = [5, 10, 45, 50, 95, 100]
    else:
        window_sizes = list(window_sizes)

    # Step 1. Load the Coffee dataset.
    train_data, test_data = coffee_data()

    train_target = train_data['target'].values
    test_target = test_data['target'].values

    def _feature_matrix(signals, size):
        # Build one feature row per signal: all H0 window features, then all H1 window features.
        rows = []
        for signal in signals:
            h0_features = []
            h1_features = []
            # Steps 2-6: sliding windows -> point cloud -> persistence diagrams -> features.
            for window in sliding_windows(signal, size):
                diagrams = persistent_homology(takens_embedding(window, embedding_dimension))
                h0_features.extend(get_diagram_features(diagrams[0]))
                h1_features.extend(get_diagram_features(diagrams[1]))
            rows.append(h0_features + h1_features)
        return np.vstack(rows)

    accuracies = []   # One test accuracy per window size.

    for size in window_sizes:

        train_features = _feature_matrix(train_data['signal'], size)
        test_features = _feature_matrix(test_data['signal'], size)

        # Machine learning on the topological features.
        bc_accuracy = coffee_classification_model(train_features, train_target, test_features, test_target)

        accuracies.append(bc_accuracy)

        # Progress message: this window size is finished.
        print('視窗大小 ', size,' 跑完嚕!')

        print()

    # Organize the classification results into a DataFrame indexed by window size.
    return pd.DataFrame({'BaggingClassifier': accuracies}, index = window_sizes)


In [482]:
# ham main experiment

def ham_experiment(embedding_dimension = 2):
    """Run the ham classification experiment once per sliding-window size.

    For every window size, each signal of the ham train/test sets goes through
    sliding_windows() -> takens_embedding() -> persistent_homology() ->
    get_diagram_features(). The H0 and H1 features of all windows are flattened
    into one row per signal and passed to ham_classification_model().

    Parameters
    ----------
    embedding_dimension : int, default 2
        Dimension forwarded to takens_embedding() for every window.

    Returns
    -------
    pandas.DataFrame
        Indexed by window size, with a single 'BaggingClassifier' column that
        holds the value returned by ham_classification_model().
    """

    def extract_features(signal_series, size):
        """Turn every signal in `signal_series` into one flat feature list."""

        # Step 2. Apply the sliding window method to every time series.
        windows = signal_series.map(lambda x: sliding_windows(x, size))

        # Step 3. Apply the time-delay embedding to each window to obtain a point cloud.
        point_clouds = windows.map(lambda x: [takens_embedding(row, embedding_dimension) for row in x])

        # Step 4. Convert each point cloud into a persistence diagram.
        diagrams = point_clouds.map(lambda x: [persistent_homology(row) for row in x])

        # Steps 5-8. For H0 (index 0) and H1 (index 1) separately: extract the features of
        # every window, flatten them into one list per signal and convert to a DataFrame.
        frames = []

        for homology_dim in (0, 1):

            features = diagrams.map(lambda x: [get_diagram_features(row[homology_dim]) for row in x])

            features = features.map(lambda x: list(itertools.chain.from_iterable(x)))

            frames.append(pd_series_to_dataframe(features))

        merged = pd.concat(frames, axis=1)

        # Renumber the columns so that the H0 and H1 labels cannot collide after the concat.
        merged.columns = np.arange(merged.shape[1])

        # Step 9. One list of features per row (i.e. per signal).
        return merged.values.tolist()

    all_classification_result = []   # Store all classification results.

    # Step 1. Call the ham_data() method to obtain the ham dataset.

    train_data, test_data = ham_data()

    train_data_series = train_data['signal']

    test_data_series = test_data['signal']

    window_size = [5, 10, 45, 50, 95, 100]        # (5, 10) = small size, (45, 50) = medium size, (95, 100) = large size

    for size in window_size:

        # Step 10. Store the features of this window size in the original DataFrames.

        train_data['features'] = extract_features(train_data_series, size)

        test_data['features'] = extract_features(test_data_series, size)

        # Step 11. Machine Learning

        train_features = np.vstack(train_data['features'].values)

        train_target = train_data['target'].values

        test_features = np.vstack(test_data['features'].values)

        test_target = test_data['target'].values

        bc_accuracy = ham_classification_model(train_features, train_target, test_features, test_target)

        # Step 12. Collect the classification result of this window size.

        all_classification_result.append([bc_accuracy])

        print('視窗大小 ', size,' 跑完嚕!')

        print()

    # Convert to a DataFrame with one row per window size.
    all_classification_result = pd.DataFrame(np.array(all_classification_result),
                                             columns = ['BaggingClassifier'],
                                             index = window_size)

    return all_classification_result


In [483]:
# meat main experiment

def meat_experiment(embedding_dimension = 2):
    """Run the meat classification experiment once per sliding-window size.

    For every window size, each signal of the meat train/test sets goes through
    sliding_windows() -> takens_embedding() -> persistent_homology() ->
    get_diagram_features(). The H0 and H1 features of all windows are flattened
    into one row per signal and passed to meat_classification_model().

    Parameters
    ----------
    embedding_dimension : int, default 2
        Dimension forwarded to takens_embedding() for every window.

    Returns
    -------
    pandas.DataFrame
        Indexed by window size, with a single 'BaggingClassifier' column that
        holds the value returned by meat_classification_model().
    """

    def extract_features(signal_series, size):
        """Turn every signal in `signal_series` into one flat feature list."""

        # Step 2. Apply the sliding window method to every time series.
        windows = signal_series.map(lambda x: sliding_windows(x, size))

        # Step 3. Apply the time-delay embedding to each window to obtain a point cloud.
        point_clouds = windows.map(lambda x: [takens_embedding(row, embedding_dimension) for row in x])

        # Step 4. Convert each point cloud into a persistence diagram.
        diagrams = point_clouds.map(lambda x: [persistent_homology(row) for row in x])

        # Steps 5-8. For H0 (index 0) and H1 (index 1) separately: extract the features of
        # every window, flatten them into one list per signal and convert to a DataFrame.
        frames = []

        for homology_dim in (0, 1):

            features = diagrams.map(lambda x: [get_diagram_features(row[homology_dim]) for row in x])

            features = features.map(lambda x: list(itertools.chain.from_iterable(x)))

            frames.append(pd_series_to_dataframe(features))

        merged = pd.concat(frames, axis=1)

        # Renumber the columns so that the H0 and H1 labels cannot collide after the concat.
        merged.columns = np.arange(merged.shape[1])

        # Step 9. One list of features per row (i.e. per signal).
        return merged.values.tolist()

    all_classification_result = []   # Store all classification results.

    # Step 1. Call the meat_data() method to obtain the meat dataset.

    train_data, test_data = meat_data()

    train_data_series = train_data['signal']

    test_data_series = test_data['signal']

    window_size = [5, 10, 45, 50, 95, 100]        # (5, 10) = small size, (45, 50) = medium size, (95, 100) = large size

    for size in window_size:

        # Step 10. Store the features of this window size in the original DataFrames.

        train_data['features'] = extract_features(train_data_series, size)

        test_data['features'] = extract_features(test_data_series, size)

        # Step 11. Machine Learning

        train_features = np.vstack(train_data['features'].values)

        train_target = train_data['target'].values

        test_features = np.vstack(test_data['features'].values)

        test_target = test_data['target'].values

        bc_accuracy = meat_classification_model(train_features, train_target, test_features, test_target)

        # Step 12. Collect the classification result of this window size.

        all_classification_result.append([bc_accuracy])

        print('視窗大小 ', size,' 跑完嚕!')

        print()

    # Convert to a DataFrame with one row per window size.
    all_classification_result = pd.DataFrame(np.array(all_classification_result),
                                             columns = ['BaggingClassifier'],
                                             index = window_size)

    return all_classification_result


In [484]:
# olive oil main experiment

def oliveoil_experiment(embedding_dimension = 2):
    """Run the olive oil classification experiment once per sliding-window size.

    For every window size, each signal of the olive oil train/test sets goes
    through sliding_windows() -> takens_embedding() -> persistent_homology() ->
    get_diagram_features(). The H0 and H1 features of all windows are flattened
    into one row per signal and passed to oliveoil_classification_model().

    Parameters
    ----------
    embedding_dimension : int, default 2
        Dimension forwarded to takens_embedding() for every window.

    Returns
    -------
    pandas.DataFrame
        Indexed by window size, with a single 'BaggingClassifier' column that
        holds the value returned by oliveoil_classification_model().
    """

    def extract_features(signal_series, size):
        """Turn every signal in `signal_series` into one flat feature list."""

        # Step 2. Apply the sliding window method to every time series.
        windows = signal_series.map(lambda x: sliding_windows(x, size))

        # Step 3. Apply the time-delay embedding to each window to obtain a point cloud.
        point_clouds = windows.map(lambda x: [takens_embedding(row, embedding_dimension) for row in x])

        # Step 4. Convert each point cloud into a persistence diagram.
        diagrams = point_clouds.map(lambda x: [persistent_homology(row) for row in x])

        # Steps 5-8. For H0 (index 0) and H1 (index 1) separately: extract the features of
        # every window, flatten them into one list per signal and convert to a DataFrame.
        frames = []

        for homology_dim in (0, 1):

            features = diagrams.map(lambda x: [get_diagram_features(row[homology_dim]) for row in x])

            features = features.map(lambda x: list(itertools.chain.from_iterable(x)))

            frames.append(pd_series_to_dataframe(features))

        merged = pd.concat(frames, axis=1)

        # Renumber the columns so that the H0 and H1 labels cannot collide after the concat.
        merged.columns = np.arange(merged.shape[1])

        # Step 9. One list of features per row (i.e. per signal).
        return merged.values.tolist()

    all_classification_result = []   # Store all classification results.

    # Step 1. Call the oliveoil_data() method to obtain the olive oil dataset.

    train_data, test_data = oliveoil_data()

    train_data_series = train_data['signal']

    test_data_series = test_data['signal']

    window_size = [5, 10, 45, 50, 95, 100]        # (5, 10) = small size, (45, 50) = medium size, (95, 100) = large size

    for size in window_size:

        # Step 10. Store the features of this window size in the original DataFrames.

        train_data['features'] = extract_features(train_data_series, size)

        test_data['features'] = extract_features(test_data_series, size)

        # Step 11. Machine Learning

        train_features = np.vstack(train_data['features'].values)

        train_target = train_data['target'].values

        test_features = np.vstack(test_data['features'].values)

        test_target = test_data['target'].values

        bc_accuracy = oliveoil_classification_model(train_features, train_target, test_features, test_target)

        # Step 12. Collect the classification result of this window size.

        all_classification_result.append([bc_accuracy])

        print('視窗大小 ', size,' 跑完嚕!')

        print()

    # Convert to a DataFrame with one row per window size.
    all_classification_result = pd.DataFrame(np.array(all_classification_result),
                                             columns = ['BaggingClassifier'],
                                             index = window_size)

    return all_classification_result


In [485]:
# strawberry main experiment

def strawberry_experiment(embedding_dimension = 2):
    """Run the strawberry classification experiment once per sliding-window size.

    For every window size, each signal of the strawberry train/test sets goes
    through sliding_windows() -> takens_embedding() -> persistent_homology() ->
    get_diagram_features(). The H0 and H1 features of all windows are flattened
    into one row per signal and passed to strawberry_classification_model().

    Parameters
    ----------
    embedding_dimension : int, default 2
        Dimension forwarded to takens_embedding() for every window.

    Returns
    -------
    pandas.DataFrame
        Indexed by window size, with a single 'BaggingClassifier' column that
        holds the value returned by strawberry_classification_model().
    """

    def extract_features(signal_series, size):
        """Turn every signal in `signal_series` into one flat feature list."""

        # Step 2. Apply the sliding window method to every time series.
        windows = signal_series.map(lambda x: sliding_windows(x, size))

        # Step 3. Apply the time-delay embedding to each window to obtain a point cloud.
        point_clouds = windows.map(lambda x: [takens_embedding(row, embedding_dimension) for row in x])

        # Step 4. Convert each point cloud into a persistence diagram.
        diagrams = point_clouds.map(lambda x: [persistent_homology(row) for row in x])

        # Steps 5-8. For H0 (index 0) and H1 (index 1) separately: extract the features of
        # every window, flatten them into one list per signal and convert to a DataFrame.
        frames = []

        for homology_dim in (0, 1):

            features = diagrams.map(lambda x: [get_diagram_features(row[homology_dim]) for row in x])

            features = features.map(lambda x: list(itertools.chain.from_iterable(x)))

            frames.append(pd_series_to_dataframe(features))

        merged = pd.concat(frames, axis=1)

        # Renumber the columns so that the H0 and H1 labels cannot collide after the concat.
        merged.columns = np.arange(merged.shape[1])

        # Step 9. One list of features per row (i.e. per signal).
        return merged.values.tolist()

    all_classification_result = []   # Store all classification results.

    # Step 1. Call the strawberry_data() method to obtain the strawberry dataset.

    train_data, test_data = strawberry_data()

    train_data_series = train_data['signal']

    test_data_series = test_data['signal']

    window_size = [5, 10, 45, 50, 95, 100]        # (5, 10) = small size, (45, 50) = medium size, (95, 100) = large size

    for size in window_size:

        # Step 10. Store the features of this window size in the original DataFrames.

        train_data['features'] = extract_features(train_data_series, size)

        test_data['features'] = extract_features(test_data_series, size)

        # Step 11. Machine Learning

        train_features = np.vstack(train_data['features'].values)

        train_target = train_data['target'].values

        test_features = np.vstack(test_data['features'].values)

        test_target = test_data['target'].values

        bc_accuracy = strawberry_classification_model(train_features, train_target, test_features, test_target)

        # Step 12. Collect the classification result of this window size.

        all_classification_result.append([bc_accuracy])

        print('視窗大小 ', size,' 跑完嚕!')

        print()

    # Convert to a DataFrame with one row per window size.
    all_classification_result = pd.DataFrame(np.array(all_classification_result),
                                             columns = ['BaggingClassifier'],
                                             index = window_size)

    return all_classification_result


In [486]:
# wine main experiment

def wine_experiment(embedding_dimension = 2):
    """Run the wine classification experiment once per sliding-window size.

    For every window size, each signal of the wine train/test sets goes through
    sliding_windows() -> takens_embedding() -> persistent_homology() ->
    get_diagram_features(). The H0 and H1 features of all windows are flattened
    into one row per signal and passed to wine_classification_model().

    Parameters
    ----------
    embedding_dimension : int, default 2
        Dimension forwarded to takens_embedding() for every window.

    Returns
    -------
    pandas.DataFrame
        Indexed by window size, with a single 'BaggingClassifier' column that
        holds the value returned by wine_classification_model().
    """

    def extract_features(signal_series, size):
        """Turn every signal in `signal_series` into one flat feature list."""

        # Step 2. Apply the sliding window method to every time series.
        windows = signal_series.map(lambda x: sliding_windows(x, size))

        # Step 3. Apply the time-delay embedding to each window to obtain a point cloud.
        point_clouds = windows.map(lambda x: [takens_embedding(row, embedding_dimension) for row in x])

        # Step 4. Convert each point cloud into a persistence diagram.
        diagrams = point_clouds.map(lambda x: [persistent_homology(row) for row in x])

        # Steps 5-8. For H0 (index 0) and H1 (index 1) separately: extract the features of
        # every window, flatten them into one list per signal and convert to a DataFrame.
        frames = []

        for homology_dim in (0, 1):

            features = diagrams.map(lambda x: [get_diagram_features(row[homology_dim]) for row in x])

            features = features.map(lambda x: list(itertools.chain.from_iterable(x)))

            frames.append(pd_series_to_dataframe(features))

        merged = pd.concat(frames, axis=1)

        # Renumber the columns so that the H0 and H1 labels cannot collide after the concat.
        merged.columns = np.arange(merged.shape[1])

        # Step 9. One list of features per row (i.e. per signal).
        return merged.values.tolist()

    all_classification_result = []   # Store all classification results.

    # Step 1. Call the wine_data() method to obtain the wine dataset.

    train_data, test_data = wine_data()

    train_data_series = train_data['signal']

    test_data_series = test_data['signal']

    window_size = [5, 10, 45, 50, 95, 100]        # (5, 10) = small size, (45, 50) = medium size, (95, 100) = large size

    for size in window_size:

        # Step 10. Store the features of this window size in the original DataFrames.

        train_data['features'] = extract_features(train_data_series, size)

        test_data['features'] = extract_features(test_data_series, size)

        # Step 11. Machine Learning

        train_features = np.vstack(train_data['features'].values)

        train_target = train_data['target'].values

        test_features = np.vstack(test_data['features'].values)

        test_target = test_data['target'].values

        bc_accuracy = wine_classification_model(train_features, train_target, test_features, test_target)

        # Step 12. Collect the classification result of this window size.

        all_classification_result.append([bc_accuracy])

        print('視窗大小 ', size,' 跑完嚕!')

        print()

    # Convert to a DataFrame with one row per window size.
    all_classification_result = pd.DataFrame(np.array(all_classification_result),
                                             columns = ['BaggingClassifier'],
                                             index = window_size)

    return all_classification_result


## （7）Experimental results（研究結果）

In [488]:
# Run the beef experiment with a 2-dimensional embedding and display the returned value.
# NOTE(review): the assignment rebinds `beef_experiment` to the call's return value, so
# the original callable is no longer reachable under this name; re-running this cell on
# its own would then try to call that return value. A separate result name would avoid
# this, but later cells read `beef_experiment`.
beef_experiment = beef_experiment(embedding_dimension = 2)

beef_experiment

Fitting 5 folds for each of 1 candidates, totalling 5 fits
最佳參數: {'bc__n_estimators': 50, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.5999999999999999
bc Test accuracy: 0.9333333333333333
視窗大小  5  跑完嚕!

Fitting 5 folds for each of 1 candidates, totalling 5 fits
最佳參數: {'bc__n_estimators': 50, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.6666666666666667
bc Test accuracy: 0.8666666666666667
視窗大小  10  跑完嚕!

Fitting 5 folds for each of 1 candidates, totalling 5 fits
最佳參數: {'bc__n_estimators': 50, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.5333333333333333
bc Test accuracy: 0.7333333333333333
視窗大小  45  跑完嚕!

Fitting 5 folds for each of 1 candidates, totalling 5 fits
最佳參數: {'bc__n_estimators': 50, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.5666666666666667
bc Test accuracy: 0.7
視窗大小  50  跑完嚕!

Fitting 5 folds for each of 1 candidates, totalling 5 fits
最佳參數: {'bc__n_estimators': 50, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.3666666666666666
bc Test accuracy: 0.5333333333333333
視窗大小  95  跑完嚕!

Fitting 5 folds for each of 1 c

Unnamed: 0,BaggingClassifier
5,93.333
10,86.667
45,73.333
50,70.0
95,53.333
100,50.0


In [489]:
# Save the beef result table as CSV. The path is handed to pandas directly so that
# pandas opens the file itself with the newline handling it expects (a text handle opened
# without newline='' yields doubled line endings on Windows). The codec is spelled
# with its canonical name 'utf-8' instead of 'utf = 8'.
beef_experiment.to_csv('Beef_Research_result.csv', encoding = 'utf-8')

In [490]:
# Run the coffee experiment with a 2-dimensional embedding and display the returned value.
# NOTE(review): the assignment rebinds `coffee_experiment` to the call's return value, so
# the original callable is no longer reachable under this name; re-running this cell on
# its own would then try to call that return value. A separate result name would avoid
# this, but later cells read `coffee_experiment`.
coffee_experiment = coffee_experiment(embedding_dimension = 2)

coffee_experiment

Fitting 3 folds for each of 6 candidates, totalling 18 fits
最佳參數: {'bc__max_samples': 1.0, 'bc__n_estimators': 100, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.9629629629629629
bc Test accuracy: 0.8928571428571429
視窗大小  5  跑完嚕!

Fitting 3 folds for each of 6 candidates, totalling 18 fits
最佳參數: {'bc__max_samples': 1.0, 'bc__n_estimators': 100, 'bc__random_state': 14}
最佳交叉驗證準確率: 1.0
bc Test accuracy: 1.0
視窗大小  10  跑完嚕!

Fitting 3 folds for each of 6 candidates, totalling 18 fits
最佳參數: {'bc__max_samples': 1.0, 'bc__n_estimators': 100, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.9629629629629629
bc Test accuracy: 0.9285714285714286
視窗大小  45  跑完嚕!

Fitting 3 folds for each of 6 candidates, totalling 18 fits
最佳參數: {'bc__max_samples': 1.0, 'bc__n_estimators': 200, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.9666666666666667
bc Test accuracy: 0.9285714285714286
視窗大小  50  跑完嚕!

Fitting 3 folds for each of 6 candidates, totalling 18 fits
最佳參數: {'bc__max_samples': 0.8, 'bc__n_estimators': 200, 'bc__random_state': 14}

Unnamed: 0,BaggingClassifier
5,89.286
10,100.0
45,92.857
50,92.857
95,100.0
100,100.0


In [491]:
# Save the coffee result table as CSV. The path is handed to pandas directly so that
# pandas opens the file itself with the newline handling it expects (a text handle opened
# without newline='' yields doubled line endings on Windows). The codec is spelled
# with its canonical name 'utf-8' instead of 'utf = 8'.
coffee_experiment.to_csv('Coffee_Research_result.csv', encoding = 'utf-8')

In [492]:
# Run the ham experiment with a 2-dimensional embedding and display the result table.
# NOTE(review): the assignment rebinds `ham_experiment` from the function to the
# DataFrame it returns, so re-running this cell without re-running the defining cell
# would try to call the DataFrame. A separate result name would avoid this, but later
# cells read `ham_experiment`.
ham_experiment = ham_experiment(embedding_dimension = 2)

ham_experiment

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 0.8, 'bc__max_samples': 0.8, 'bc__n_estimators': 10, 'bc__random_state': 45}
最佳交叉驗證準確率: 0.5878378378378378
bc Test accuracy: 0.6952380952380952
視窗大小  5  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 0.8, 'bc__max_samples': 1.0, 'bc__n_estimators': 50, 'bc__random_state': 45}
最佳交叉驗證準確率: 0.6331331331331331
bc Test accuracy: 0.7333333333333333
視窗大小  10  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 0.8, 'bc__max_samples': 1.0, 'bc__n_estimators': 20, 'bc__random_state': 45}
最佳交叉驗證準確率: 0.6053553553553553
bc Test accuracy: 0.8
視窗大小  45  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 0.8, 'bc__max_samples': 0.8, 'bc__n_estimators': 10, 'bc__random_state': 45}
最佳交叉驗證準確率: 0.6418918918918919
bc Test accuracy: 0.7428571428571429
視窗大小  50  跑完嚕!

Fitting 3 folds for each

Unnamed: 0,BaggingClassifier
5,69.524
10,73.333
45,80.0
50,74.286
95,64.762
100,66.667


In [493]:
# Save the ham result table as CSV. The path is handed to pandas directly so that
# pandas opens the file itself with the newline handling it expects (a text handle opened
# without newline='' yields doubled line endings on Windows). The codec is spelled
# with its canonical name 'utf-8' instead of 'utf = 8'.
ham_experiment.to_csv('Ham_Research_result.csv', encoding = 'utf-8')

In [494]:
# Run the meat experiment with a 2-dimensional embedding and display the result table.
# NOTE(review): the assignment rebinds `meat_experiment` from the function to the
# DataFrame it returns, so re-running this cell without re-running the defining cell
# would try to call the DataFrame. A separate result name would avoid this, but later
# cells read `meat_experiment`.
meat_experiment = meat_experiment(embedding_dimension = 2)

meat_experiment

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 1.0, 'bc__n_estimators': 10, 'bc__random_state': 42}
最佳交叉驗證準確率: 0.9833333333333334
bc Test accuracy: 0.95
視窗大小  5  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 1.0, 'bc__n_estimators': 10, 'bc__random_state': 42}
最佳交叉驗證準確率: 1.0
bc Test accuracy: 0.95
視窗大小  10  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 0.8, 'bc__n_estimators': 10, 'bc__random_state': 42}
最佳交叉驗證準確率: 1.0
bc Test accuracy: 0.95
視窗大小  45  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 0.8, 'bc__n_estimators': 20, 'bc__random_state': 42}
最佳交叉驗證準確率: 1.0
bc Test accuracy: 0.95
視窗大小  50  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 1.0, 'bc__max_samples'

Unnamed: 0,BaggingClassifier
5,95.0
10,95.0
45,95.0
50,95.0
95,98.333
100,100.0


In [495]:
# Save the meat result table as CSV. The path is handed to pandas directly so that
# pandas opens the file itself with the newline handling it expects (a text handle opened
# without newline='' yields doubled line endings on Windows). The codec is spelled
# with its canonical name 'utf-8' instead of 'utf = 8'.
meat_experiment.to_csv('Meat_Research_result.csv', encoding = 'utf-8')

In [496]:
# Run the olive oil experiment with a 2-dimensional embedding and display the result table.
# NOTE(review): the assignment rebinds `oliveoil_experiment` from the function to the
# DataFrame it returns, so re-running this cell without re-running the defining cell
# would try to call the DataFrame. A separate result name would avoid this, but later
# cells read `oliveoil_experiment`.
oliveoil_experiment = oliveoil_experiment(embedding_dimension = 2)

oliveoil_experiment

Fitting 3 folds for each of 30 candidates, totalling 90 fits




最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 1.0, 'bc__n_estimators': 50, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.7666666666666666
bc Test accuracy: 0.9
視窗大小  5  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits




最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 0.6, 'bc__n_estimators': 10, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.8333333333333334
bc Test accuracy: 0.9666666666666667
視窗大小  10  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 1.0, 'bc__n_estimators': 100, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.9
bc Test accuracy: 0.9333333333333333
視窗大小  45  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits




最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 0.6, 'bc__n_estimators': 30, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.9
bc Test accuracy: 0.9666666666666667
視窗大小  50  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 0.8, 'bc__max_samples': 1.0, 'bc__n_estimators': 10, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.9
bc Test accuracy: 0.9666666666666667
視窗大小  95  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 1.0, 'bc__n_estimators': 10, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.8666666666666667
bc Test accuracy: 0.9666666666666667
視窗大小  100  跑完嚕!



Unnamed: 0,BaggingClassifier
5,90.0
10,96.667
45,93.333
50,96.667
95,96.667
100,96.667


In [497]:
# Save the Olive Oil results table to CSV.
# The path is given straight to ``to_csv`` so pandas opens and closes the file
# itself: pandas expects a text-mode handle to be opened with newline='',
# otherwise rows can end up with doubled line endings on Windows.
# The encoding uses its canonical name 'utf-8'.
oliveoil_experiment.to_csv('Oliveoil_Research_result.csv', encoding = 'utf-8')

In [498]:
# Run the Strawberry experiment with a 2-dimensional embedding and show the result.
# NOTE(review): the assignment rebinds the name ``strawberry_experiment`` from
# the callable to the value it returns (the next cell saves that value with
# ``.to_csv``). Re-running this cell therefore fails until the cell that
# defines the callable is executed again.
strawberry_experiment = strawberry_experiment(
    embedding_dimension = 2,
)
strawberry_experiment

Fitting 3 folds for each of 30 candidates, totalling 90 fits




最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 1.0, 'bc__n_estimators': 20, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.9509963334927467
bc Test accuracy: 0.9783783783783784
視窗大小  5  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits




最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 1.0, 'bc__n_estimators': 20, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.9412083532600032
bc Test accuracy: 0.9567567567567568
視窗大小  10  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits




最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 0.8, 'bc__n_estimators': 30, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.9248923959827833
bc Test accuracy: 0.9432432432432433
視窗大小  45  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 0.6, 'bc__n_estimators': 20, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.9200063765343535
bc Test accuracy: 0.927027027027027
視窗大小  50  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 1.0, 'bc__n_estimators': 50, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.8955364259524948
bc Test accuracy: 0.9108108108108108
視窗大小  95  跑完嚕!

Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳參數: {'bc__max_features': 1.0, 'bc__max_samples': 0.8, 'bc__n_estimators': 50, 'bc__random_state': 14}
最佳交叉驗證準確率: 0.8971544715447154
bc Test accuracy: 0.9081081081081082
視窗大小  100  跑完嚕!



Unnamed: 0,BaggingClassifier
5,97.838
10,95.676
45,94.324
50,92.703
95,91.081
100,90.811


In [499]:
# Save the Strawberry results table to CSV.
# The path is given straight to ``to_csv`` so pandas opens and closes the file
# itself: pandas expects a text-mode handle to be opened with newline='',
# otherwise rows can end up with doubled line endings on Windows.
# The encoding uses its canonical name 'utf-8'.
strawberry_experiment.to_csv('Strawberry_Research_result.csv', encoding = 'utf-8')

In [500]:
# Run the Wine experiment with a 2-dimensional embedding and show the result.
# NOTE(review): the assignment rebinds the name ``wine_experiment`` from the
# callable to the value it returns (the next cell saves that value with
# ``.to_csv``). Re-running this cell therefore fails until the cell that
# defines the callable is executed again.
wine_experiment = wine_experiment(
    embedding_dimension = 2,
)
wine_experiment

Fitting 3 folds for each of 12 candidates, totalling 36 fits
最佳參數: {'bc__max_samples': 0.6, 'bc__n_estimators': 30, 'bc__random_state': 3}
最佳交叉驗證準確率: 0.7017543859649122
bc Test accuracy: 0.8333333333333334
視窗大小  5  跑完嚕!

Fitting 3 folds for each of 12 candidates, totalling 36 fits
最佳參數: {'bc__max_samples': 1.0, 'bc__n_estimators': 30, 'bc__random_state': 3}
最佳交叉驗證準確率: 0.7894736842105264
bc Test accuracy: 0.9259259259259259
視窗大小  10  跑完嚕!

Fitting 3 folds for each of 12 candidates, totalling 36 fits
最佳參數: {'bc__max_samples': 0.8, 'bc__n_estimators': 10, 'bc__random_state': 3}
最佳交叉驗證準確率: 0.6140350877192983
bc Test accuracy: 0.6481481481481481
視窗大小  45  跑完嚕!

Fitting 3 folds for each of 12 candidates, totalling 36 fits
最佳參數: {'bc__max_samples': 1.0, 'bc__n_estimators': 100, 'bc__random_state': 3}
最佳交叉驗證準確率: 0.6842105263157895
bc Test accuracy: 0.7037037037037037
視窗大小  50  跑完嚕!

Fitting 3 folds for each of 12 candidates, totalling 36 fits
最佳參數: {'bc__max_samples': 1.0, 'bc__n_estimators': 

Unnamed: 0,BaggingClassifier
5,83.333
10,92.593
45,64.815
50,70.37
95,83.333
100,79.63


In [501]:
# Save the Wine results table to CSV.
# The path is given straight to ``to_csv`` so pandas opens and closes the file
# itself: pandas expects a text-mode handle to be opened with newline='',
# otherwise rows can end up with doubled line endings on Windows.
# The encoding uses its canonical name 'utf-8'.
wine_experiment.to_csv('Wine_Research_result.csv', encoding = 'utf-8')