In [1]:
import os
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
# glob returns matches in arbitrary, filesystem-dependent order, while the rest of the
# notebook slices `raw_files` by position into batches. Sorting makes those batches
# reproducible across kernels and machines.
# NOTE(review): results stored from an earlier, unsorted listing index into a different
# order and should be regenerated after this change.
raw_files = sorted(glob('/media/joanes/0BB3-1FA1/CSV_DATA/*.csv'))

In [2]:

def get_df_list_and_target(files):
    """Read every CSV in ``files`` and separate the inputs from the label.

    For each file, the first unique value of its ``TOTAL_QUALITY`` column is
    recorded as the label. The result columns listed in ``result_columns`` are
    then dropped, together with every remaining column whose name ends in one
    of ``result_suffixes``.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to read.

    Returns
    -------
    tuple
        ``(df_list, target)`` where ``df_list`` holds one filtered DataFrame
        per file (in input order) and ``target`` is a single-column DataFrame
        named ``'valid'`` holding the matching labels.
    """
    # Keep only the values that are available before the analysis.
    result_columns = ['Total_UnfilledZones', 'Total_FillingQuality', 'TOTAL_QUALITY',
                      'Total_PorosityQuantity', 'Total_PorosityQuality', 'Time']
    result_suffixes = ('VoidContent', 'VoidQuality', 'Filling', 'FillingQuality')

    frames = []
    labels = []
    for path in files:
        raw = pd.read_csv(path)
        labels.append(raw.TOTAL_QUALITY.unique()[0])
        trimmed = raw.drop(columns=result_columns)
        kept = [name for name in trimmed if not name.endswith(result_suffixes)]
        frames.append(trimmed[kept])

    return frames, pd.DataFrame(labels, columns=['valid'])



In [3]:
def split_df(df, indexes, starting_index=0):
    """Cut ``df`` row-wise at the positions listed in ``indexes``.

    ``n`` cut positions yield ``n + 1`` consecutive slices: the first starts at
    ``starting_index``, every cut position ends one slice and starts the next,
    and the last slice runs to the end of ``df``. An empty ``indexes`` yields a
    single slice covering ``df[starting_index:]``.

    Each slice has its index reset (the old index is kept as an ``index``
    column) and every column renamed to ``slice_<k>_<original name>``, where
    ``k`` is the slice number, so the slices can be concatenated side by side
    without name clashes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    indexes : sequence of int
        Cut positions, used as ``df[start:stop]`` slice bounds.
    starting_index : int, optional
        Start of the first slice. Defaults to 0.

    Returns
    -------
    list of pandas.DataFrame
        The renamed slices, in order.
    """
    # A trailing None makes the final slice open-ended, so the tail is handled
    # by the same loop as every other slice (and empty ``indexes`` just works).
    boundaries = [*indexes, None]

    pieces = []
    for slice_number, stop in enumerate(boundaries):
        piece = df[starting_index:stop].reset_index()
        pieces.append(piece.rename(columns=lambda name: f'slice_{slice_number}_{name}'))
        starting_index = stop
    return pieces

In [4]:
import peakutils
from scipy.signal import savgol_filter


def _first_peak_index(signal, start, stop, thres, min_dist=10):
    """Return the position in ``signal`` of the first peak inside ``signal[start:stop]``.

    Negative samples of the window are replaced by zero before the peak
    search, so only positive excursions can be reported as peaks.

    Raises
    ------
    IndexError
        If no peak is found in the window.
    """
    window = np.maximum(signal[start:stop], 0)
    peaks = peakutils.indexes(window, thres=thres, min_dist=min_dist)
    if len(peaks) == 0:
        raise IndexError(f'no peak found in samples [{start}:{stop}] (thres={thres})')
    # Peak positions are relative to the window; shift back to ``signal`` coordinates.
    return peaks[0] + start


def get_processed_df_list_and_target(raw_files, start, end):
    """Load ``raw_files[start:end]`` and fold every series into side-by-side slices.

    For each loaded frame the ``'Flow rate'`` column is smoothed with a
    Savitzky-Golay filter (window 51, polynomial order 2) and its negated first
    difference is computed, so positive values mark drops in the smoothed flow
    rate. The first peak of that signal in samples 350-500 and the first peak
    from sample 580 onwards are used as cut positions for ``split_df``; the
    three resulting slices are concatenated column-wise.

    The window bounds and thresholds are hard-coded sample offsets.
    NOTE(review): presumably tuned to this dataset - confirm before reusing
    on other recordings.

    Parameters
    ----------
    raw_files : sequence of str
        All CSV paths.
    start, end : int
        Bounds of the slice of ``raw_files`` to process.

    Returns
    -------
    tuple
        ``(nidea_list, targets)``: the list of folded DataFrames and the
        targets returned by ``get_df_list_and_target``.

    Raises
    ------
    IndexError
        If either peak search finds no peak for a series.
    """
    nidea_list = []
    batch_files = raw_files[start:end]
    frames, targets = get_df_list_and_target(batch_files)
    print(len(batch_files))
    for data in frames:
        flow_rate_filtered = savgol_filter(data['Flow rate'].values, 51, 2)
        flow_rate_deriv = np.diff(flow_rate_filtered) * -1

        first_cut = _first_peak_index(flow_rate_deriv, 350, 500, thres=0.08)
        second_cut = _first_peak_index(flow_rate_deriv, 580, None, thres=0.2)

        # Kept under a separate name so the list being iterated is not rebound mid-loop.
        slices = split_df(data, [first_cut, second_cut])
        nidea_list.append(pd.concat(slices, axis=1))
    return nidea_list, targets

In [6]:

# Add library's path to notebook
import os
import sys

sys.path.append('../../time-series-featurizer/')

from ts_featurizer import TimeSeriesFeaturizer
# Report the load only once the import has actually succeeded.
print('Tseries loaded')

# Number of series the featurization model is fitted on.
N_FIT_SERIES = 200

# `nidea_list` was not defined by any cell left in this notebook, so this cell failed on a
# fresh kernel. Build it here from the first N_FIT_SERIES files.
# NOTE(review): assumes the original list started at raw_files[0] - confirm.
nidea_list, nidea_targets = get_processed_df_list_and_target(raw_files, 0, N_FIT_SERIES)

tseries = TimeSeriesFeaturizer(check_na=False)
model = tseries.featurize(nidea_list[:N_FIT_SERIES], n_jobs=-1)


Tseries loaded


-------------------------------------------------- Modeling started --------------------------------------------------


100%|██████████| 82/82 [1:49:21<00:00, 80.02s/it]   


In [8]:

import pickle

# Make sure the output directory exists so the fitted featurizer is not lost to a
# FileNotFoundError after the long fit above.
os.makedirs('tmp', exist_ok=True)

# The context manager closes (and therefore flushes) the file, so the pickle is complete
# on disk before any later cell reads it back.
with open('tmp/tseries_folded.pickle', 'wb') as filehandler:
    pickle.dump(tseries, filehandler)


In [9]:

# Restore the featurizer from 'tmp/tseries_folded.pickle', the file the previous cell
# writes - presumably so it can be reused without fitting it again.
# pickle.load can execute arbitrary code: only load pickle files this notebook produced.
with open('tmp/tseries_folded.pickle', 'rb') as filehandler:
	tseries = pickle.load(filehandler)


In [25]:
# Number of raw files featurized per batch.
BATCH_SIZE = 1_000

# Apply the fitted featurizer batch by batch and store each result under tmp/.
# NOTE(review): range(4, 10) only covers raw_files[3000:9000]; the earlier batches and any
# files past index 9000 are not handled by this cell - confirm they are processed elsewhere.
for time in range(4, 10):
    print(time)
    batch_start, batch_stop = (time - 1) * BATCH_SIZE, time * BATCH_SIZE
    featurized_path = f'tmp/featurized_{time}'
    target_path = f'tmp/target_featurized_{time}'

    # Each batch takes hours, so resume after an interruption instead of recomputing.
    # Stored batches are only valid while the order of `raw_files` and the fitted
    # featurizer are unchanged; delete the files to force a recomputation.
    if os.path.exists(featurized_path) and os.path.exists(target_path):
        print(f'Batch {time} already stored, skipping')
        continue

    df_list, target_featurized = get_processed_df_list_and_target(raw_files, batch_start, batch_stop)
    print(f'Loaded DataFrame lists len is {len(df_list)}, from {batch_start} to {batch_stop}')
    featurized = tseries.featurize(df_list, n_jobs=-1, apply_model=True)
    # to_feather needs a default index, hence the reset before writing.
    featurized.reset_index(drop=True).to_feather(featurized_path)
    target_featurized.reset_index(drop=True).to_feather(target_path)
    print('Stored the featurized files')


4
1000
Loaded DataFrame lists len is 1000, from 3000 to 4000


-------------------------------------------------- Applying the model started --------------------------------------------------


100%|██████████| 82/82 [8:27:43<00:00, 371.51s/it]    


Stored the featurized files
5
1000




Loaded DataFrame lists len is 1000, from 4000 to 5000


-------------------------------------------------- Applying the model started --------------------------------------------------


100%|██████████| 82/82 [8:36:27<00:00, 377.89s/it]    


Stored the featurized files
6
1000




Loaded DataFrame lists len is 1000, from 5000 to 6000


-------------------------------------------------- Applying the model started --------------------------------------------------


100%|██████████| 82/82 [8:37:12<00:00, 378.45s/it]    


Stored the featurized files
7
1000




Loaded DataFrame lists len is 1000, from 6000 to 7000


-------------------------------------------------- Applying the model started --------------------------------------------------


100%|██████████| 82/82 [8:45:24<00:00, 656.69s/it]    


Stored the featurized files
8
1000




Loaded DataFrame lists len is 1000, from 7000 to 8000


-------------------------------------------------- Applying the model started --------------------------------------------------


100%|██████████| 82/82 [8:40:21<00:00, 380.75s/it]   


Stored the featurized files
9
1000




Loaded DataFrame lists len is 1000, from 8000 to 9000


-------------------------------------------------- Applying the model started --------------------------------------------------


Process ForkPoolWorker-117:
Process ForkPoolWorker-123:
Process ForkPoolWorker-118:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-119:
Process ForkPoolWorker-128:
  File "/home/joanes/miniconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process ForkPoolWorker-115:
  File "/home/joanes/miniconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process ForkPoolWorker-126:
  File "/home/joanes/miniconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-122:
  File "/home/joanes/miniconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-120:
Process ForkPoolWorker-113:
  File "/home/joanes/miniconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
Traceback (most

  File "../../time-series-featurizer/ts_featurizer/tools/base.py", line 59, in featurize
    featurized_list.append(self._apply_featurization(function=key.function_, executions=execs))
  File "../../time-series-featurizer/ts_featurizer/tools/base.py", line 70, in _featurize_all_columns
    value = func(column, *args, **kwargs)
  File "/home/joanes/miniconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/joanes/miniconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/joanes/miniconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/joanes/miniconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "../../time-series-featurizer/ts_featurizer/tools/base.py", line 120, in _apply_featurization
    trans = self._feat

  File "/home/joanes/miniconda3/lib/python3.6/site-packages/statsmodels/regression/linear_model.py", line 273, in fit
    self.pinv_wexog, singular_values = pinv_extended(self.wexog)
  File "../../time-series-featurizer/ts_featurizer/tools/features.py", line 19, in series_to_ndarray
    return func(np.asarray(x), *args, **kwargs)
  File "../../time-series-featurizer/ts_featurizer/base/base.py", line 360, in featurize_each_class
    transf, prev_transf = featurizer.featurize(data, )
  File "../../time-series-featurizer/ts_featurizer/tools/base.py", line 70, in _featurize_all_columns
    value = func(column, *args, **kwargs)
  File "/home/joanes/miniconda3/lib/python3.6/site-packages/statsmodels/regression/linear_model.py", line 273, in fit
    self.pinv_wexog, singular_values = pinv_extended(self.wexog)
  File "../../time-series-featurizer/ts_featurizer/tools/base.py", line 59, in featurize
    featurized_list.append(self._apply_featurization(function=key.function_, executions=execs))
 

KeyboardInterrupt
  File "../../time-series-featurizer/ts_featurizer/tools/features.py", line 468, in augmented_dickey_fuller
    res = adfuller(x)
  File "../../time-series-featurizer/ts_featurizer/tools/features.py", line 19, in series_to_ndarray
    return func(np.asarray(x), *args, **kwargs)
Process ForkPoolWorker-116:
  File "../../time-series-featurizer/ts_featurizer/tools/base.py", line 120, in _apply_featurization
    trans = self._featurize_all_columns(self._transformed_data, function, args, kwargs)
  File "../../time-series-featurizer/ts_featurizer/tools/features.py", line 19, in series_to_ndarray
    return func(np.asarray(x), *args, **kwargs)
  File "/home/joanes/miniconda3/lib/python3.6/site-packages/statsmodels/regression/linear_model.py", line 273, in fit
    self.pinv_wexog, singular_values = pinv_extended(self.wexog)
  File "/home/joanes/miniconda3/lib/python3.6/site-packages/statsmodels/regression/linear_model.py", line 273, in fit
    self.pinv_wexog, singular_values

  File "/home/joanes/miniconda3/lib/python3.6/site-packages/statsmodels/tsa/stattools.py", line 241, in adfuller
    maxlag, autolag)
  File "../../time-series-featurizer/ts_featurizer/base/base.py", line 360, in featurize_each_class
    transf, prev_transf = featurizer.featurize(data, )
  File "/home/joanes/miniconda3/lib/python3.6/site-packages/statsmodels/tools/tools.py", line 342, in pinv_extended
    u, s, vt = np.linalg.svd(X, 0)
  File "/home/joanes/miniconda3/lib/python3.6/site-packages/statsmodels/tsa/stattools.py", line 87, in _autolag
    results[lag] = mod_instance.fit()
  File "../../time-series-featurizer/ts_featurizer/tools/base.py", line 59, in featurize
    featurized_list.append(self._apply_featurization(function=key.function_, executions=execs))
  File "/home/joanes/miniconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/joanes/miniconda3/lib/python3.6/site-packages/statsmodels/tools/tools.py", lin

  File "/home/joanes/miniconda3/lib/python3.6/site-packages/statsmodels/regression/linear_model.py", line 273, in fit
    self.pinv_wexog, singular_values = pinv_extended(self.wexog)
  File "/home/joanes/miniconda3/lib/python3.6/site-packages/numpy/linalg/linalg.py", line 1562, in svd
    u, s, vh = gufunc(a, signature=signature, extobj=extobj)
  File "/home/joanes/miniconda3/lib/python3.6/site-packages/statsmodels/tools/tools.py", line 342, in pinv_extended
    u, s, vt = np.linalg.svd(X, 0)
  File "/home/joanes/miniconda3/lib/python3.6/site-packages/numpy/linalg/linalg.py", line 1562, in svd
    u, s, vh = gufunc(a, signature=signature, extobj=extobj)
KeyboardInterrupt
KeyboardInterrupt


KeyboardInterrupt: 

In [19]:
# Total number of CSV paths collected in `raw_files`.
len(raw_files)

10014