In [1]:
#Importing needed libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
import pandas as pd
import dask.dataframe as dd
from tqdm import tqdm
from scipy.fft import fft,fftfreq,rfft, rfftfreq
from scipy import signal
import itertools
#import ray
#ray.init(ignore_reinit_error=True)

In [2]:
#Folders (one per run-to-failure test) that are scanned for recordings
tests=['1st_test','2nd_test','3rd_test']

#Paths of the recordings found in each folder, keyed by test number ('1', '2', '3')
files={'1':[],'2':[],'3':[]}

for count,test in enumerate(tests):
    for dirname, _, filenames in os.walk('data/'+test):
        for filename in filenames:
            files[str(count+1)].append(os.path.join(dirname, filename))
    # os.walk yields names in an arbitrary, filesystem-dependent order. Sorting makes
    # the row order of everything built from `files` reproducible; because create_df
    # parses the file names as zero-padded timestamps, sorted order within a folder
    # is also chronological order.
    files[str(count+1)].sort()

In [3]:
def create_df(file,channel):
    """
    Builds band-peak spectral features for the requested channels of one recording.

    The recording is read as a tab-separated table without a header. The requested
    columns are mean-removed, tapered with a Kaiser window (beta=3) and transformed
    with a real FFT. The one-sided amplitude spectrum (first N//2 bins, scaled by
    2/N) is cut into consecutive bands of N//40 bins and only the largest amplitude
    of each band is kept.

    Args:
      file: Path to the recording. Its last 19 characters must be a timestamp in
        the form YYYY.MM.DD.HH.MM.SS. Paths containing '1st_test' are read with
        8 columns, every other path with 4 columns.
      channel: A 1-based column number, or a list of column numbers, to process.

    Returns:
      A list with one DataFrame per requested channel, in the order given. Each
      DataFrame has a single row, indexed by the timestamp parsed from the file
      name, and one column per frequency band.

    Raises:
      ValueError: If the recording has fewer than 40 samples or its half-length
        cannot be split into whole bands of N//40 bins.
    """
    path = str(file)
    # Accept a single channel number as well as a list of them.
    channels = [channel] if np.isscalar(channel) else list(channel)
    n_channels = len(channels)
    if n_channels == 0:
        return []

    # Recordings of the first test have 8 columns, the others 4. Looking for the
    # folder name anywhere in the path keeps this working when the data does not
    # live directly under 'data/'.
    n_columns = 8 if '1st_test' in path else 4
    df = pd.read_csv(path, sep='\t', header=None, names=list(range(1, n_columns + 1)))

    N = len(df)
    band_width = N // 40
    if band_width == 0 or (N // 2) % band_width != 0:
        raise ValueError(
            "Cannot split the spectrum of %r (%d samples) into whole bands of N//40 bins."
            % (path, N)
        )

    # Remove the mean of every channel (constant detrend); this returns a new array,
    # so the in-place windowing below never touches `df`.
    y_detrend = signal.detrend(df[channels].to_numpy(dtype=float), type="constant", axis=0)
    # Taper with a Kaiser window to limit spectral leakage.
    y_detrend *= np.kaiser(N, 3)[:, None]

    # One-sided amplitude spectrum: keep the first N//2 bins and scale by 2/N.
    yf = 2.0 / N * np.abs(rfft(y_detrend, axis=0)[: N // 2])
    # Peak amplitude inside each band of `band_width` consecutive bins
    # -> array of shape (number of bands, number of channels).
    band_peaks = yf.reshape(-1, band_width, n_channels).max(axis=1)

    timestamp = pd.to_datetime(path[-19:], format='%Y.%m.%d.%H.%M.%S')
    # One row per channel, every row labelled with the recording's timestamp.
    features = pd.DataFrame(band_peaks.T, index=[timestamp] * n_channels)

    # Positional slicing instead of np.split: np.split on a DataFrame relies on the
    # deprecated DataFrame.swapaxes.
    return [features.iloc[[row]] for row in range(n_channels)]

def load_dataset(channels):
    """Builds feature and label tables for the requested bearing channels.

    Args:
      channels: Channel identifiers such as 't2_ch1'. The second character selects
        the test ('1', '2' or '3') and the last character is the channel number.

    Returns:
      A flat tuple (x_0, y_0, x_1, y_1, ...) holding one (x, y) pair per requested
      channel. Pairs are ordered by test ('1', '2', '3') and, within a test, by the
      order in which its channels were requested.
        * x: the spectra produced by create_df, one row per file, with the
          timestamp index replaced by a 0-based integer index.
        * y: three columns labelled 0, 1, 2 -- running time in days since the first
          file, running time as a fraction of the total time, and remaining time
          in days until the last file.
    """
    # Channel numbers requested for every test, keyed by test number.
    requested = {"1": [], "2": [], "3": []}
    for name in channels:
        requested[name[1]].append(int(name[-1]))

    # One create_df result (a list with one single-row frame per channel) per file.
    per_file = []
    for test_id, wanted in requested.items():
        # Tests without requested channels are not loaded at all.
        if not wanted:
            continue
        progress = tqdm(files[test_id], desc="Loading test "+test_id, ascii=False, ncols=100)
        for path in progress:
            per_file.append(create_df(path, wanted))

    # Regroup the per-file rows into one frame per (test, channel).
    per_channel = concatenate_by_element(per_file, requested)

    one_day = np.timedelta64(1, 'D')
    output = []
    for spectra in per_channel:
        first_stamp = min(spectra.index)
        # Time between the first and the last file, in days.
        total_time = (max(spectra.index) - first_stamp) / one_day
        # Time elapsed since the first file, in days, for every row.
        run_time = (spectra.index - first_stamp) / one_day
        life_percent = run_time / total_time
        RUL = total_time - run_time
        labels = pd.DataFrame([run_time, life_percent, RUL]).T

        # Features first, then their labels, so the tuple alternates x, y, x, y, ...
        output.append(spectra.reset_index(drop=True))
        output.append(labels)

    return tuple(output)

def concatenate_by_element(dfs, tests, sizes=None):
    """Regroups per-file spectra into one DataFrame per (test, channel).

    `dfs` holds, for every loaded file, a list with one single-row DataFrame per
    requested channel. Stacking all of them gives rows interleaved by channel
    (file 0 / channel 0, file 0 / channel 1, file 1 / channel 0, ...); this function
    picks every n-th row of each test's section to undo the interleaving.

    Args:
      dfs: A list with one entry per file, in the order the files were loaded.
        Each entry is a list of DataFrames, one per channel requested for the
        file's test.
      tests: A dict mapping the test number ('1', '2', '3') to the list of channel
        numbers requested for it. The files in `dfs` must be grouped by test in
        the iteration order of this dict.
      sizes: Optional dict mapping test number to the number of files of that
        test contained in `dfs`. When omitted, the count is taken from `len(dfs)`
        if only one test has channels, otherwise the counts of the complete
        dataset (2156, 984 and 6324 files) are assumed.

    Returns:
      A list of DataFrames, one per requested channel, ordered by test and then by
      channel position. Each keeps the timestamp index of its rows.

    Raises:
      ValueError: If `dfs` contains no DataFrames, or if the number of rows does
        not match the expected number of files and channels.
    """
    # File counts of the complete dataset, used only as a fallback.
    known_sizes = {"1": 2156, "2": 984, "3": 6324}

    # Flatten the per-file lists and stack everything into one frame.
    frames = [frame for per_file in dfs for frame in per_file]
    if not frames:
        raise ValueError("No spectra to concatenate: `dfs` contains no DataFrames.")
    df = pd.concat(frames)

    # Only tests with at least one requested channel contribute rows.
    active = [test for test in tests if len(tests[test]) > 0]

    if sizes is None:
        if len(active) == 1:
            # With a single test every file in `dfs` belongs to it, so the real
            # file count is known without relying on hard-coded numbers.
            sizes = {active[0]: len(dfs)}
        else:
            sizes = known_sizes

    expected_rows = sum(sizes[test] * len(tests[test]) for test in active)
    if expected_rows != len(df):
        raise ValueError(
            "Expected %d rows for the requested tests but got %d; "
            "pass `sizes` with the actual number of files per test."
            % (expected_rows, len(df))
        )

    output = []
    offset = 0  # position of the first row of the current test inside `df`
    for test in active:
        n_channels = len(tests[test])
        block_rows = sizes[test] * n_channels
        for i in range(n_channels):
            # Rows of channel i are every n_channels-th row of this test's block.
            positions = np.arange(offset + i, offset + block_rows, n_channels)
            output.append(df.iloc[positions])
        offset += block_rows

    return output



In [4]:
# Channels 1 and 3 of the second test; load_dataset returns one (x, y) pair per
# requested channel, in that order.
x_train, y_train, x_test, y_test = load_dataset(
    ['t2_ch1', 't2_ch3']
)

Loading test 2: 100%|█████████████████████████████████████████████| 984/984 [00:21<00:00, 45.59it/s]


In [5]:
# Show the band-peak features of the first requested channel ('t2_ch1'):
# one row per loaded file, one column per frequency band.
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.008234,0.028855,0.003254,0.002746,0.001681,0.002006,0.002920,0.002715,0.003190,0.002944,0.001780,0.001467,0.001492,0.000991,0.000916,0.001075,0.000857,0.000637,0.000733,0.000560
1,0.008635,0.025182,0.002786,0.002110,0.001650,0.002023,0.002429,0.002358,0.004077,0.003203,0.001863,0.001464,0.001448,0.001094,0.000788,0.001098,0.000859,0.000742,0.000995,0.000579
2,0.008146,0.025087,0.002574,0.002258,0.001866,0.001720,0.002471,0.002593,0.004254,0.003022,0.001991,0.001716,0.001382,0.001116,0.000824,0.001014,0.000903,0.000767,0.001540,0.000491
3,0.008223,0.025257,0.003100,0.002689,0.001828,0.002035,0.003450,0.002768,0.003874,0.004212,0.001880,0.001471,0.001273,0.000961,0.001002,0.000976,0.000784,0.000870,0.001456,0.000569
4,0.007614,0.024457,0.002524,0.003066,0.001798,0.001771,0.003560,0.002322,0.003863,0.003059,0.001933,0.001512,0.001503,0.000851,0.000911,0.001300,0.000843,0.000844,0.001416,0.000690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,0.021233,0.047120,0.049485,0.105804,0.019210,0.027885,0.037673,0.034314,0.118329,0.051702,0.021699,0.026780,0.017412,0.012672,0.009192,0.008450,0.007004,0.007173,0.014474,0.003909
980,0.018261,0.059374,0.033499,0.050604,0.020528,0.032882,0.049221,0.038431,0.092709,0.040297,0.029143,0.019297,0.009911,0.012335,0.005227,0.005951,0.003494,0.005027,0.014361,0.003731
981,0.012352,0.061241,0.019024,0.060038,0.017227,0.038593,0.064996,0.053471,0.039690,0.066722,0.016833,0.013954,0.015879,0.010074,0.004737,0.009575,0.004867,0.005502,0.014164,0.003106
982,0.000393,0.000063,0.000036,0.000083,0.000031,0.000022,0.000057,0.000025,0.000027,0.000082,0.000114,0.000092,0.000018,0.000024,0.000036,0.000024,0.000019,0.000028,0.000074,0.000123


In [6]:
# Show the labels paired with x_train. Columns 0, 1 and 2 hold the running time in
# days, the running time as a fraction of the total time, and the remaining time in days.
y_train

Unnamed: 0,0,1,2
0,0.000000,0.000000,6.826389
1,0.006944,0.001017,6.819444
2,0.013889,0.002035,6.812500
3,0.020833,0.003052,6.805556
4,0.027778,0.004069,6.798611
...,...,...,...
979,6.798611,0.995931,0.027778
980,6.805556,0.996948,0.020833
981,6.812500,0.997965,0.013889
982,6.819444,0.998983,0.006944
