In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob

# Add library's path to notebook
import os
import sys
sys.path.append('../../time-series-featurizer/')
from ts_featurizer import TimeSeriesFeaturizer
%matplotlib inline
# glob() returns paths in arbitrary, filesystem-dependent order. The cells below address
# files by position (raw_files[start:end]), so sort to make every batch reproducible.
# NOTE(review): artefacts produced before this change were built from the unsorted order.
raw_files = sorted(glob('/media/joanes/0BB3-1FA1/CSV_DATA/*.csv'))

In [2]:

def get_df_list_and_target(files):
    """Read each CSV in ``files`` and split it into input columns and a target value.

    Per the original author's note, only values that are available before the
    analysis are kept as inputs. Concretely, for every file:

    * the target is the first unique value of the ``TOTAL_QUALITY`` column;
    * the columns listed in ``excluded_cols`` are dropped;
    * every remaining column whose name ends with one of ``excluded_suffixes``
      is dropped as well.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to load.

    Returns
    -------
    tuple
        ``(df_list, target)`` where ``df_list`` holds one filtered DataFrame per
        file (in input order) and ``target`` is a DataFrame with a single
        ``'valid'`` column holding one value per file.
    """
    excluded_cols = ['Total_UnfilledZones', 'Total_FillingQuality', 'TOTAL_QUALITY',
                     'Total_PorosityQuantity', 'Total_PorosityQuality', 'Time']
    excluded_suffixes = ('VoidContent', 'VoidQuality', 'Filling', 'FillingQuality')

    frames = []
    labels = []
    for path in files:
        raw = pd.read_csv(path)
        labels.append(raw.TOTAL_QUALITY.unique()[0])

        remaining = raw.drop(columns=excluded_cols)
        kept = [name for name in remaining.columns if not name.endswith(excluded_suffixes)]
        frames.append(remaining[kept])

    return frames, pd.DataFrame(labels, columns=['valid'])



In [3]:
import peakutils
from scipy.signal import savgol_filter


def get_processed_df_list_and_target(raw_files, start, end):
    """Load ``raw_files[start:end]`` and crop every series between two flow-rate peaks.

    Each DataFrame returned by ``get_df_list_and_target`` is processed as follows:

    1. the ``'Flow rate'`` column is smoothed with a Savitzky-Golay filter
       (window 51, polynomial order 2);
    2. the smoothed signal is differentiated and negated, and negative values
       are clipped to zero inside the two search windows;
    3. the first peak in samples 350-499 gives the first row to keep, and the
       first peak from sample 550 onwards gives the (exclusive) end row.

    Parameters
    ----------
    raw_files : sequence of str
        Paths of all available CSV files.
    start, end : int
        Slice bounds selecting the files to process (``raw_files[start:end]``).

    Returns
    -------
    tuple
        ``(df_ret, targets)``: the list of cropped DataFrames and the target
        DataFrame produced by ``get_df_list_and_target`` for the same files.

    Raises
    ------
    IndexError
        If no peak is detected in either search window for some file. The
        message names the offending file.
    """
    selected_files = raw_files[start:end]
    df_list, targets = get_df_list_and_target(selected_files)

    df_ret = []
    for position, (path, data) in enumerate(zip(selected_files, df_list)):
        flow_rate_filtered = savgol_filter(data['Flow rate'].values, 51, 2)
        flow_rate_deriv = np.diff(flow_rate_filtered) * -1

        # np.clip works on a copy, so the derivative is not modified through slice views.
        start_window = np.clip(flow_rate_deriv[350:500], 0, None)
        # peakutils returns positions relative to the window; shift back to series positions.
        start_peaks = peakutils.indexes(start_window, thres=0.08, min_dist=10) + 350

        end_window = np.clip(flow_rate_deriv[550:], 0, None)
        end_peaks = peakutils.indexes(end_window, thres=0.2, min_dist=10) + 550

        if len(start_peaks) == 0 or len(end_peaks) == 0:
            # Same exception type as the bare `[0]` lookup, but it identifies the file.
            raise IndexError(
                f'No flow-rate peak found for file #{start + position} ({path}): '
                f'{len(start_peaks)} peak(s) in samples 350-499, '
                f'{len(end_peaks)} peak(s) from sample 550 onwards'
            )

        df_ret.append(data[start_peaks[0]:end_peaks[0]])

    return df_ret, targets

In [None]:
# Load and crop files 3000-3999 of raw_files.
# NOTE(review): the following cell reassigns df_list / target_featurized from files 0-99, so
# this result looks unused when cells run top to bottom — confirm this cell is still needed.
df_list, target_featurized = get_processed_df_list_and_target(raw_files, 3000, 4000)


In [None]:

# Load and crop the first 100 files of raw_files; they are the input of the featurize call below.
df_list, target_featurized = get_processed_df_list_and_target(raw_files, 0,100)
print('Tseries loaded')


# NOTE(review): this call omits apply_model=True, so it presumably builds the feature model that
# the later batch cell reuses via apply_model=True — confirm against the ts_featurizer library.
tseries = TimeSeriesFeaturizer(check_na=False)
model = tseries.featurize(df_list, n_jobs=4)


In [None]:

import pickle

# Persist the featurizer so later sessions can reload it instead of refitting.
# The context manager closes the file handle, so the pickle is fully flushed to disk.
with open('tmp/second_fold/tseries.pickle', 'wb') as filehandler:
    pickle.dump(tseries, filehandler)


In [4]:
import pickle
# Restore the featurizer that an earlier cell pickled to this same path.
# NOTE: pickle.load can execute arbitrary code — only load files created by this notebook.
with open('tmp/second_fold/tseries.pickle', 'rb') as filehandler:
	tseries = pickle.load(filehandler)


In [5]:
# Featurize the raw files in batches of 1,000 files: batch `time` covers
# raw_files[(time - 1) * 1_000 : time * 1_000]. range(10, 11) runs batch 10 only (files 9000-9999).
for time in range(10,11):
    df_list, target_featurized = get_processed_df_list_and_target(raw_files, (time - 1) * 1_000, time * 1_000)
    print(f'Loaded DataFrame lists len is {len(df_list)}, from {(time - 1) * 1_000} to {time * 1_000}')
    # NOTE(review): apply_model=True presumably reuses the model already held by `tseries`
    # rather than fitting a new one — confirm against the ts_featurizer library.
    featurized = tseries.featurize(df_list, n_jobs=4, apply_model=True)
    # The index is reset before writing; the features and their targets go to separate
    # files that share the batch number in their names.
    featurized.reset_index(drop=True).to_feather(f'tmp/second_fold/featurized_{time}')
    target_featurized.reset_index(drop=True).to_feather(f'tmp/second_fold/target_featurized_{time}')
    print('Stored the featurized files')




Loaded DataFrame lists len is 1000, from 9000 to 10000


-------------------------------------------------- Applying the model started --------------------------------------------------


100%|██████████| 29/29 [21:44<00:00, 22.44s/it]


Stored the featurized files


In [None]:
# Display the shape of `featurized`, i.e. the result of the last batch processed by the loop cell.
featurized.shape