# Pipeline Step 4: Generate the data sequences used to train the model

In [4]:
import pandas as pd
import numpy as np
import glob
import multiprocessing as mp
import pickle
np.set_printoptions(suppress=True)

In [5]:
# Number of consecutive monthly values in each model input window.
seq_length = 12
# Load the merged production table written to 'merged.p'.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
merged = pd.read_pickle('merged.p')

In [6]:
# Create fixed-length sequences of production values for a given well, identified by PWT ID.
# Given the production values of the past `seq_length` months, the model will predict the next month's value.
def create_sequence(PWT, data=None, length=None, min_history=100):
    """Build sliding-window training samples for one well.

    Parameters
    ----------
    PWT
        Value of the 'PWT__ID' column that selects the well.
    data : pandas.DataFrame, optional
        Table with 'PWT__ID', 'ProductionDate' and 'OilorCondensateProduced'
        columns. Defaults to the module-level ``merged`` frame.
    length : int, optional
        Number of consecutive values per input window. Defaults to the
        module-level ``seq_length``.
    min_history : int, optional
        Wells with fewer rows than this are skipped (default 100).

    Returns
    -------
    tuple of numpy.ndarray or None
        ``(inputs, targets)`` where each row of ``inputs`` holds ``length``
        consecutive values and the matching entry of ``targets`` is the value
        that follows them. Returns ``None`` when the well has fewer than
        ``min_history`` rows. Both arrays are empty when no window passes the
        NaN/zero filter.
    """
    frame = merged if data is None else data
    window = seq_length if length is None else length

    # Select the well's rows with a boolean mask; work on a copy so the
    # date conversion below never writes into the shared frame.
    well = frame[frame['PWT__ID'] == PWT].copy()
    well['ProductionDate'] = pd.to_datetime(well['ProductionDate'])
    well = well.sort_values('ProductionDate')

    values = well['OilorCondensateProduced'].tolist()
    if len(values) < min_history:
        return None

    xx, yy = [], []
    # range(len - window) lets the most recent value serve as a target too.
    for start in range(len(values) - window):
        x_i = values[start:start + window]
        y_i = values[start + window]
        # Drop windows containing a missing or zero value in input or target.
        if np.isnan(x_i).any() or np.isnan(y_i):
            continue
        if np.count_nonzero(x_i) < window or y_i == 0:
            continue
        xx.append(x_i)
        yy.append(y_i)
    return np.array(xx), np.array(yy)

In [None]:
# Fan the per-well sequence generation out over 8 worker processes.
PWTs = merged['PWT__ID'].unique()
pool = mp.Pool(8)
dfs = []
for well_id in PWTs:
    # Each entry is an AsyncResult; its value is fetched later with .get().
    dfs.append(pool.apply_async(create_sequence, args=(well_id, )))
# No more tasks will be submitted; block until every worker has finished.
pool.close()
pool.join()

In [None]:
# Collect the per-well results into lists of arrays ready for stacking.
X = []
y = []
failed = 0
first_error = None
for p in dfs:
    try:
        result = p.get()
    except Exception as exc:
        # A worker raised: keep going (best effort) but do not hide it.
        failed += 1
        if first_error is None:
            first_error = exc
        continue
    if result is None:
        # create_sequence returns None for wells with too little history.
        continue
    x1, y1 = result
    if x1.ndim < 2 or x1.shape[0] == 0:
        # No window survived the NaN/zero filter for this well.
        continue
    # Add a trailing axis of size 1 to both the inputs and the targets.
    X.append(x1.reshape(x1.shape[0], x1.shape[1], 1))
    y.append(y1.reshape(y1.shape[0], 1))
if failed:
    print('%d wells failed in create_sequence and were skipped; first error: %r'
          % (failed, first_error))

In [None]:
# Stack the per-well arrays along the first axis into single arrays,
# then report the resulting shapes.
X_all, y_all = np.vstack(X), np.vstack(y)
for stacked in (X_all, y_all):
    print(stacked.shape)

In [None]:
# Persist the stacked inputs and targets (numpy appends the '.npy' suffix).
# The inputs live in `X_all` (capital X); `x_all` was never defined.
np.save('x_complete', X_all)
np.save('y_complete', y_all)