In [1]:
%sh 

# Make the project repository available next to the notebook's working
# directory: clone it on the first run, otherwise refresh master.
cwd=$(pwd)
does_my_git_repo_exist="$cwd/acse-9-independent-research-project-kkf18"

# The path is quoted so a working directory containing spaces cannot split it.
if [ -d "$does_my_git_repo_exist" ]
then
  echo "Git repo exists!"
  echo "Pulling master branch..."
  # Stop if the directory cannot be entered, instead of pulling into whatever
  # repository the current directory happens to belong to.
  cd "$does_my_git_repo_exist" || exit 1
  git pull origin master
else
  # Announce the clone before it starts so the log reads in the order things happen.
  echo "Git repo does not exist!"
  echo "Cloning repo..."
  git clone https://github.com/msc-acse/acse-9-independent-research-project-kkf18.git
fi

In [2]:
%sh 

# Switch the cloned repository to the Exploratory_Data_Analysis branch and update it.
# Each step that the next one relies on is checked: without the guards a failed
# cd/checkout would let `git pull` merge the feature branch into the wrong checkout.
cd acse-9-independent-research-project-kkf18 || exit 1
git branch -a
git checkout Exploratory_Data_Analysis || exit 1
git pull origin Exploratory_Data_Analysis

In [3]:
import os
import sys
import importlib.util
from pathlib import Path


# Load a module from the cloned repo by file path and register it in sys.modules.
def import_mod(module_name):
  """Import ``<module_name>.py`` from the cloned repository directory.

  The repository is expected at ``<cwd>/acse-9-independent-research-project-kkf18``.
  After a successful call, ``import <module_name>`` resolves from ``sys.modules``.

  Args:
    module_name: Name of the module (file name without the ``.py`` suffix).

  Raises:
    FileNotFoundError: If the module file does not exist in the repository directory.
    ImportError: If no import spec/loader can be built for the file.
  """
  repo_dir = Path(os.getcwd()) / 'acse-9-independent-research-project-kkf18'
  module_path = repo_dir / '{}.py'.format(module_name)

  if not module_path.is_file():
    raise FileNotFoundError(
        "Cannot import '{}': {} does not exist".format(module_name, module_path))

  # The spec name must be the bare module name (not "<name>.py"), otherwise the
  # loaded module reports a wrong __name__.
  spec = importlib.util.spec_from_file_location(module_name, str(module_path))
  if spec is None or spec.loader is None:
    raise ImportError("Cannot build an import spec for {}".format(module_path))
  module = importlib.util.module_from_spec(spec)

  # Register before executing (as the importlib recipe recommends) so code in the
  # module that looks itself up in sys.modules works; roll back if execution fails.
  previous = sys.modules.get(module_name)
  sys.modules[module_name] = module
  try:
    spec.loader.exec_module(module)
  except BaseException:
    if previous is None:
      sys.modules.pop(module_name, None)
    else:
      sys.modules[module_name] = previous
    raise

  print("Import successful")


In [4]:
# Spark packages
from pyspark.sql import functions as F

# Standard packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

# Homemade Modules
# Data_Engineering.py is loaded from the cloned repo by file path: import_mod()
# executes it and stores it in sys.modules, so the plain import below resolves.
import_mod("Data_Engineering")
import Data_Engineering as DET

In [5]:
# Load the raw CSV data into the notebook.
# Two sensor dumps, the OW1 liquid-rate table and the pipe-delimited, latin-1
# encoded interference records are read with header row and schema inference.
df_01 = spark.read.format('csv').options(header='true', inferSchema='true').load('/FileStore/tables/newdump_01.csv')
df_02 = spark.read.format('csv').options(header='true', inferSchema='true').load('/FileStore/tables/newdump_02.csv')
df_OW1ql = spark.read.format('csv').options(header='true', inferSchema='true').load('/FileStore/tables/Qliq.csv')
df_records = spark.read.format('csv').options(header='true', inferSchema='true', delimiter='|', encoding='iso-8859-1').load('/FileStore/tables/interences_filtered.csv')

# Dumped Data:
# Rename and cast types for each column:
#   "Unnamed: 0" -> index, "ts" -> datetime (parsed as dd-MMM-yy HH:mm:ss),
#   "name" -> tag, "value" kept as is.
df_01 = df_01.select(
      df_01["Unnamed: 0"].alias("index"),
      F.to_timestamp(F.col("ts").cast("string"), "dd-MMM-yy HH:mm:ss").alias("datetime"),
      df_01["name"].alias("tag"),
      df_01["value"]
)

# Same projection for the second dump so both frames share one schema.
df_02 = df_02.select(
      df_02["Unnamed: 0"].alias("index"),
      F.to_timestamp(F.col("ts").cast("string"), "dd-MMM-yy HH:mm:ss").alias("datetime"),
      df_02["name"].alias("tag"),
      df_02["value"]
)

# Clean up data using Data Engineering Tools
# NOTE(review): the helper semantics below are inferred from their names only
# (append second dump, report nulls, replace nulls in "value" with zero) — confirm
# against Data_Engineering.py.
DataEng = DET.GroupDataTools(df_01)
DataEng.append_data(df_02)
DataEng.is_null(DataEng.df)
DataEng.df = DataEng.null2zero("value", DataEng.df)
DataEng.is_null(DataEng.df)

# Tag Simplification
DataEng.groupdata("tag", "datetime", "value")

# Each oil well is selected by tag patterns; "." in a pattern matches any single
# character, so e.g. 'BRA-....-..-07.' covers every tag whose number starts with 07.
r1 = re.compile('BRA-....-..-07.')
r2 = re.compile('BRA-QT  -15-0077-RAW')
OW1 = DataEng.splitdata_dict([r1, r2])

r1 = re.compile('BRA-....-..-01.')
r2 = re.compile('BRA-QT  -15-0017-RAW')
OW3 = DataEng.splitdata_dict([r1, r2])

r1 = re.compile('BRA-....-..-04.')
OW2 = DataEng.splitdata_dict([r1])

# Make a tag dictionary: Decode the tags!
# Maps a tag pattern to the short feature name used as dictionary key afterwards
# (presumably matched as regular expressions by decode_keys — TODO confirm).
tag_names = {
              "BRA-PZT........" : "WHP",
              "BRA-TT..-15...." : "WHT",
              "BRA-FI........." : "GLR",
              "BRA-PT..-16...." : "GLP",
              "BRA-PT..-13...." : "DHP",
              "BRA-TT..-13...." : "DHT",
              "BRA-HV........." : "Choke",
              "BRA-QT........." : "ASD"
}

OW1 = DataEng.decode_keys(OW1, tag_names)
OW2 = DataEng.decode_keys(OW2, tag_names)
OW3 = DataEng.decode_keys(OW3, tag_names)


# Liquid Rate Data:
# Add the additional columns to the oilwell dictionary.
# Both entries use the same (datetime, value) layout as the sensor series;
# DATE is parsed as MM/dd/yyyy, i.e. without a time-of-day component.
OW1["LR"] = df_OW1ql.select(
                                   F.to_timestamp(F.col("DATE").cast("string"), "MM/dd/yyyy").alias("datetime"),
                                    df_OW1ql["Daily liquid rate [Sm3/d]"].alias("value")
)

OW1["OR"] = df_OW1ql.select(
                                   F.to_timestamp(F.col("DATE").cast("string"), "MM/dd/yyyy").alias("datetime"),
                                    df_OW1ql["Daily oil [Sm3/d]"].alias("value")
)


# Interference Records Data:
# Translate the "Description" column to English; trans_dict is passed on to
# ts_overlay_records further down.
df_records, trans_dict = DataEng.translate_col("Description", df_records)

# Pick out only date and description columns
OW1["Records"] = df_records.select(
                 F.to_timestamp(F.col("Date").cast("string"), "MM/dd/yyyy").alias("datetime"),
                 df_records["Description"]
)

print("\nRaw data organised and imported as dictionaries: OW1, OW2, OW3")


# The records start on 2017-01-20, so restrict WHP to that period before overlaying.
OW1["WHP_2017_2019"] = OW1["WHP"].where(OW1["WHP"].datetime >= '2017-01-20')

# Record-description fragments of interest and the label to show for each
# (passed as filt_dict below).
abrv_dict = {
            "due to sand" : "Choking due to sand",
            "Valve test" : "Valve test"
}

# Overlay time series graph with the records
OW1["WHP_2017_2019"] = DataEng.ts_overlay_records(OW1["WHP_2017_2019"], OW1["Records"], "Description", filt_dict=abrv_dict, translate_dict=trans_dict)

# Remove any duplicates, and merge any related comments
OW1["WHP_2017_2019"] = DataEng.merge_duplicate(OW1["WHP_2017_2019"], sqlContext)

# Produce a discretisation of the "Grouped" column (used as the overlay column
# in the plotting cells).
OW1["WHP_2017_2019"] = DataEng.discretise_col(OW1["WHP_2017_2019"], "Grouped")



In [6]:
# Example of how to plot OW dictionaries: pick series by key, restrict them to
# 2016 onwards and hand the list of frames plus matching labels to plot_ts.
ts_labels = ["DHP", "WHP", "DHT", "WHT", "GLP"]
ts_dfs = [OW1[label] for label in ts_labels]

ge2016 = list(map(lambda series: series.where(series.datetime >= '2016-01-01'), ts_dfs))

fig = DataEng.plot_ts(
    "WHP, WHT, DHP, DHT, GLP over time",
    "datetime",
    "value",
    ge2016,
    ts_labels,
)
display(fig)

In [7]:
# Try to find the interventions that have occurred over the years by putting
# DHP, WHP and LR on a common, standardised scale.
ts_dfs = [OW1["DHP"], OW1["WHP"], OW1["LR"]]
ts_labels = ["DHP", "WHP", "LR"]

ge2016norm = [df.where(df.datetime >= '2016-01-11') for df in ts_dfs]

for i, df in enumerate(ge2016norm):
  mean, std = df.select(F.mean("value"), F.stddev("value")).first()
  # Standardise as (x - mean) / std. The mean was computed but never subtracted
  # before, so the series were only rescaled although the plot is titled
  # "Standardised"; this now matches the standardisation used later in the notebook.
  ge2016norm[i] = df.select(df["datetime"], ((F.col("value") - mean) / std).alias("value"))

fig = DataEng.plot_ts("Standardised WHP, DHP, LR over time", "datetime", "value", ge2016norm, ts_labels)
display(fig)

In [8]:
# NOTE(review): ts_dfs, ts_labels and ge2016norm are rebuilt here but are not
# passed to the plot call in this cell.
ts_dfs = [OW1["DHP"], OW1["WHP"], OW1["LR"]]
ts_labels = ["DHP", "WHP", "LR"]

ge2016norm = [df.where(df.datetime >= '2016-01-11') for df in ts_dfs]

# WHP from the first recorded interference onwards, overlaid with the grouped
# record labels and plotted per year.
ts_df = OW1["WHP"].where(OW1["WHP"].datetime >= '2017-01-20')
overlay_df = OW1["WHP_2017_2019"]
years = ['2017', '2018', '2019']

fig = DataEng.plot_ts("Overlaid records for one parameter WHP", "datetime", "value", [ts_df], ["WHP"], overlay="Grouped", overlay_dfs=overlay_df, plot_yearly=years)

display(fig)

In [9]:
# Threshold the WHP series (0.8) and average the result per week, then plot the
# original, thresholded and averaged series together with the record overlay.
test_thresh = DataEng.threshold(OW1["WHP_2017_2019"], 0.8)
test_avg = DataEng.avg_over_period(test_thresh, "week")

# The plotted data is WHP (OW1["WHP_2017_2019"]); the labels and title previously
# said WHT, which mislabelled the figure.
ts_labels = ["Orig", "WHP_thresholded", "WHP_thresholded_avg"]
ts_dfs = [OW1["WHP_2017_2019"], test_thresh, test_avg]
fig = DataEng.plot_ts("WHP_thresholded_avg", "datetime", "value", ts_dfs, ts_labels, overlay="Grouped", overlay_dfs=OW1["WHP_2017_2019"])

display(fig)

In [10]:
# Same series as the previous cell (ts_dfs / ts_labels are reused), split per year.
# The title names WHP because the plotted series derive from OW1["WHP_2017_2019"].
years = ['2017', '2018', '2019']
fig = DataEng.plot_ts("WHP_thresholded_avg", "datetime", "value", ts_dfs, ts_labels, overlay="Grouped", overlay_dfs=OW1["WHP_2017_2019"], plot_yearly=years)
display(fig)

In [11]:
# Threshold the WHP series (0.8), average it per week and plot the result per
# quarter for the listed years, with the record overlay.
test_thresh = DataEng.threshold(OW1["WHP_2017_2019"], 0.8)
test_avg = DataEng.avg_over_period(test_thresh, "week")

# The plotted data is WHP (OW1["WHP_2017_2019"]); the labels and title previously
# said WHT, which mislabelled the figure.
ts_labels = ["Orig", "WHP_thresholded", "WHP_thresholded_avg"]
ts_dfs = [OW1["WHP_2017_2019"], test_thresh, test_avg]

years = ['2017', '2018', '2019']
fig = DataEng.plot_ts("WHP_thresholded_avg", "datetime", "value", ts_dfs, ts_labels, overlay="Grouped", overlay_dfs=OW1["WHP_2017_2019"], plot_quarterly=years)
display(fig)

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit

from livelossplot import PlotLosses
from pycm import *

import torch
import torch.nn as nn
import torch.nn.functional as nnF
import csv
from torch.utils.data import TensorDataset, DataLoader
import torchvision.transforms as transforms
from torch.utils.data import Dataset 
import random
import itertools
from sklearn.model_selection import StratifiedKFold

import seaborn as sns


def set_seed(seed):
    """
    Seed Python's `random`, NumPy and torch (CPU and all CUDA devices) with the
    same value, and switch off the cuDNN auto-tuner and cuDNN itself.

    Always returns True.
    """
    # Same seed for every generator, applied in a fixed order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # benchmark=False turns the cuDNN auto-tuner off; enabled=False disables cuDNN.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False

    return True

# Select the torch device: 'cuda' only when CUDA is available and reports at
# least one device, otherwise 'cpu'.
device = 'cuda' if (torch.cuda.device_count() > 0 and torch.cuda.is_available()) else 'cpu'
if device == 'cuda':
    print("Cuda installed! Running on GPU!")
else:
    print("No GPU available!")

def correlate(data):
  """Display a heatmap of the pairwise Pearson correlations of `data`.

  Args:
    data: pandas DataFrame. Only numeric (and boolean) columns are correlated;
      string and datetime columns are ignored.

  The figure is shown with `display`; nothing is returned.
  """
  # Select the numeric columns explicitly: since pandas 2.0 `DataFrame.corr()`
  # no longer drops non-numeric columns silently and raises on strings instead.
  numeric = data.select_dtypes(include=["number", "bool"])

  fig, ax = plt.subplots(figsize=(14,12))
  colormap = plt.cm.RdBu
  ax.set_title('Pearson Correlation of Features', y=1.05, size=15)
  sns.heatmap(numeric.corr(), linewidths=0.1,vmax=1.0, 
              square=True, cmap=colormap, linecolor='white', annot=True, ax=ax)

  display(fig)

In [13]:
# Read the two test CSV files (header row, inferred schema).
test_separator_data = (
    spark.read.format('csv')
    .options(header='true', inferSchema='true')
    .load('/FileStore/tables/testseparator_1.csv')
)
test_OW1 = (
    spark.read.format('csv')
    .options(header='true', inferSchema='true')
    .load('/FileStore/tables/OW1test_1.csv')
)

# Replace DATE by a parsed "datetime" column placed first, drop rows whose WHT is
# the "-" placeholder, then re-create WHT as a double column (it ends up last).
test_OW1 = (
    test_OW1
    .select(
        F.to_timestamp(F.col("DATE").cast("string"), "MM/dd/yyyy HH:mm").alias("datetime"),
        *[name for name in test_OW1.schema.names if name != "DATE"]
    )
    .where(F.col("WHT") != "-")
    .withColumn("WHT_", F.col("WHT").cast("double"))
    .drop("WHT")
    .withColumnRenamed("WHT_", "WHT")
)

print(test_OW1.count())

In [14]:
# Collect test_OW1 to pandas and show the correlation heatmap of its columns.
correlate(test_OW1.toPandas())

In [15]:
# OW1 = { n : test_OW1.select(F.to_timestamp(F.col("DATE").cast("string"), "MM/dd/yyyy HH:mm").alias("datetime"), test_OW1[n].alias("value")) for n in test_OW1.schema.names}
# OW1 = { head : df.na.drop() for head, df in OW1.items() }

In [16]:
# Preview test_OW1 with "Q liq" copied into a "value" column (test_OW1 itself is not modified here).
display(test_OW1.withColumn("value", test_OW1["Q liq"]))

In [17]:
def dataframe2dictionary(df, date_head="datetime"):
  """
  Split a wide dataframe of time series features into a dictionary with one
  two-column dataframe per feature.

  Every column other than `date_head` becomes an entry: key = column name,
  value = `df` reduced to the date column (aliased "datetime") and that column
  (aliased "value"). Fails with an assertion error if `date_head` is not a column.
  """
  column_names = df.schema.names
  assert date_head in column_names, "no date column in given dataframe!"

  return {
      name: df.select(df[date_head].alias("datetime"), df[name].alias("value"))
      for name in column_names
      if name != date_head
  }


In [18]:
def dictionary2dataframe(OW_dict):
  """Merge a dictionary of (datetime, value) dataframes into one wide dataframe.

  The first entry provides the "datetime" column; each entry's "value" column is
  renamed to its key. Every further entry is left-joined on equal datetime, so
  only timestamps present in the first entry are kept. An empty dictionary
  raises IndexError.
  """
  heads = list(OW_dict.keys())
  first_head = heads[0]

  base = OW_dict[first_head]
  combined = base.select(base["datetime"], base["value"].alias(first_head))

  for head in heads[1:]:
    feature = OW_dict[head]
    feature = feature.select(feature["datetime"], feature["value"].alias(head))
    # Join on the timestamp, then drop the right-hand copy of the datetime column.
    combined = combined.join(feature, feature.datetime == combined.datetime, how="left").drop(feature.datetime)

  return combined



In [19]:
# Split test_OW1 into one (datetime, value) frame per column and print the mapping.
OW_t = dataframe2dictionary(test_OW1)
print(OW_t)

In [20]:
# First key of the dictionary — the entry dictionary2dataframe uses as the base of its joins.
print(list(OW_t.keys())[0])

In [21]:
# Round trip: merge the per-feature frames back into one wide dataframe and show it.
display(dictionary2dataframe(OW_t))

In [22]:
# Show the original wide frame for comparison with the round-trip result.
display(test_OW1)

In [23]:
# Need a function to convert collected dataframe to dictionaries
# Need a function to convert dictionaries into collected dataframe

In [24]:
# Copy "Q liq" into a "value" column (the column name the plotting cells use)
# and apply DataEng.threshold with 0.8.
# NOTE(review): what the 0.8 threshold means is defined in Data_Engineering.py — confirm there.
test_OW1 = test_OW1.withColumn("value", test_OW1["Q liq"])
test_thresh = DataEng.threshold(test_OW1, 0.8)

In [25]:
# Standardise every feature column of test_thresh to (x - mean) / std.
# The column list is captured once up front because test_thresh is rebuilt
# inside the loop.
orig_heads = test_thresh.schema.names

for head in orig_heads:
  # The timestamp, the well name and the target column are left untouched.
  if head in ("datetime", "WELLNAME", "value"):
    continue
  mean, std = test_thresh.select(F.mean(head), F.stddev(head)).first()
  scaled_name = "{}_".format(head)
  # Build the scaled copy under a temporary name, then swap it in for the original.
  test_thresh = (
      test_thresh
      .withColumn(scaled_name, (F.col(head) - mean) / std)
      .drop(head)
      .withColumnRenamed(scaled_name, head)
  )

display(test_thresh)

In [26]:
# Plot the "value" column of the unthresholded frame against the thresholded one.
ts_labels = ["Orig", "Thresholded"]
ts_dfs = [test_OW1, test_thresh]
fig = DataEng.plot_ts("Thresholded", "datetime", "value", ts_dfs, ts_labels)
display(fig)

In [27]:
# Inspect the thresholded frame after its feature columns were standardised.
display(test_thresh)

In [28]:
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler

# Collect the modelling columns to pandas (rows with nulls removed) and split
# them into feature matrix, target vector and timestamps.
pred_OW1 = (
    test_thresh
    .select("datetime", "value", "WHP", "WHT", "Choke", "pbh", "Qgl")
    .na.drop()
    .toPandas()
)
X = pred_OW1[["WHP", "WHT", "Choke", "pbh", "Qgl"]].values
y = np.squeeze(pred_OW1[["value"]].values)
date = np.squeeze(pred_OW1[["datetime"]].values)

# Five random 70/30 splits with a fixed seed; indices[split] is (train rows, validation rows).
shuffler = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42).split(X, y)
indices = list(shuffler)

# Only the first split is used below.
train_rows, val_rows = indices[0]
X_train, X_val = X[train_rows], X[val_rows]
y_train, y_val = y[train_rows], y[val_rows]
date_train, date_val = date[train_rows], date[val_rows]

# print(X_train)
# pred_OW1 = StandardScaler().fit_transform(pred_OW1)
# print(pred_OW1)
# print("Accuracy: %2.1f percent" % (accuracy_score(y, y_pred)*100))

In [29]:
def kfold_datasets(n_splits, X_train_orig, y_train_orig, trans=True, verbatim=False):
  '''Build one train/validation DataLoader pair per stratified K-fold split.

  Args:
    n_splits: number of folds passed to StratifiedKFold (shuffled, random_state=42).
    X_train_orig: numpy array of samples, indexable by the fold index arrays.
    y_train_orig: numpy array of labels used for stratification.
    trans: forwarded as `transform` to the training dataset only; the
      validation dataset always gets transform=False.
    verbatim: when True, print the fold indices and split sizes.

  Returns:
    (fold_train_loader, fold_validation_loader): two lists with one DataLoader
    per fold, in fold order.

  Relies on `CustomImageTensorDataset`, `batch_size` and `test_batch_size`
  being defined at notebook level.
  '''
  
  kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

  fold_train_loader = [] # list of shuffled training dataloaders
  fold_validation_loader = [] # list of validation dataloaders (not shuffled)

  for train_index, test_index in kf.split(X_train_orig, y_train_orig):
    if verbatim: print("TRAIN:", train_index, "Validation:", test_index)
    X_train, X_val = X_train_orig[train_index], X_train_orig[test_index]
    y_train, y_val = y_train_orig[train_index], y_train_orig[test_index]

    if verbatim: print("train size:", X_train.shape, "test size:", X_val.shape)

    # Convert to tensor
    X_train, y_train = torch.from_numpy(X_train).float(), torch.from_numpy(y_train)
    X_val, y_val = torch.from_numpy(X_val).float(), torch.from_numpy(y_val)
    
    # Mean/std come from the training part only and are reused for validation.
    mean1, std1 = torch.mean(X_train), torch.std(X_train)

    # make Custom set
    train_dataset = CustomImageTensorDataset(X_train, y_train.long(), transform=trans, mean=mean1, std=std1)
    validation_dataset = CustomImageTensorDataset(X_val, y_val.long(), transform=False, mean=mean1, std=std1)

    # initialize the data-loaders
    fold_train_loader.append(DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4))
    fold_validation_loader.append(DataLoader(validation_dataset, batch_size=test_batch_size, shuffle=False, num_workers=0))

  # The loaders were built but never handed back, so callers unpacking the
  # result (as the train_model_kfold usage example does) received None.
  return fold_train_loader, fold_validation_loader
    
def train_model_kfold(wd, lrt, fold_train_loader, fold_validation_loader):
  """ Train a fresh model on every fold and accumulate the validation results.

      wd and lrt are forwarded unchanged to train_model (presumably weight decay
      and learning rate — confirm against train_model, which is not defined in
      this notebook section).

      Returns (fold_liveloss, fold_loss, fold_acc): the list of per-fold liveloss
      objects and the SUMS of the per-fold validation loss and accuracy. Only the
      printed accuracy is averaged over the folds.

      HOW TO RUN THE FUNCTION:
      fold_train_loader, fold_validation_loader = kfold_datasets(5, X_train_orig, y_train_orig, False)
      lloss, loss, acc = train_model_kfold(weight_decay[3], lrt, fold_train_loader, fold_validation_loader)
  """
  
  fold_liveloss = []
  fold_loss = 0.
  fold_acc = 0.
  for fold in range(len(fold_train_loader)):
    # CHANGE THE MODEL HERE:
    # A new model is created per fold so no weights carry over between folds.
    model = LeNet5()

    liveloss, val_loss, val_acc = train_model(wd, lrt, model, fold_train_loader[fold], fold_validation_loader[fold])
    fold_liveloss.append(liveloss)
    fold_loss += val_loss
    fold_acc += val_acc
    print("fold:", fold)
    
  # NOTE(review): divides by the number of folds, so an empty loader list raises ZeroDivisionError.
  print("Averaged Accuracy: ", (fold_acc/len(fold_train_loader))*100)
  return fold_liveloss, fold_loss, fold_acc

In [30]:
# pred_OW1 = test_OW1.select("Q liq", "WHP", "WHT", "Choke", "pbh", "Qgl").na.drop().toPandas()
# print(pred_OW1)

In [31]:
from sklearn.linear_model import LinearRegression  # vectorised ML implementation

# Fit a linear regression (with intercept) on the training split and predict
# back on the same training rows.
model = LinearRegression(fit_intercept=True)
model.fit(X_train, y_train)
y_ = model.predict(X_train)

In [32]:
# Fitted coefficients, in the feature order used for X: WHP, WHT, Choke, pbh, Qgl.
print(model.coef_)

In [33]:
# Pair every training timestamp with its target / prediction, sort the pairs
# chronologically and transpose into (dates, values) rows for plotting.
train_plot = np.array(sorted(zip(date_train, y_train))).T
train_plot1 = np.array(sorted(zip(date_train, y_))).T

fig, ax = plt.subplots(figsize=(12,8))
for (dates, values), name in ((train_plot, 'truth'), (train_plot1, 'model')):
  ax.plot(dates, values, "*--", label = name)
ax.legend(loc='best')
ax.grid(True)
display(fig)

In [34]:
# Predict on the held-out validation rows of the first split.
y_pred = model.predict(X_val)

In [35]:
# Same plot as for the training rows, but for the validation rows (the
# train_plot names are reused): pairs sorted by timestamp, then transposed.
train_plot = np.array(sorted(zip(date_val, y_val))).T
train_plot1 = np.array(sorted(zip(date_val, y_pred))).T

fig, ax = plt.subplots(figsize=(12,8))
for (dates, values), name in ((train_plot, 'truth'), (train_plot1, 'model')):
  ax.plot(dates, values, "*--", label = name)
ax.legend(loc='best')
ax.grid(True)
display(fig)

In [36]:
# List the series available for OW1.
OW1.keys()

In [37]:
# Inspect the daily liquid-rate series of OW1.
display(OW1["LR"])

In [38]:
# threshold out values
test_thresh = DataEng.threshold(test_OW1, 0.8)

# normalise the data
orig_heads = test_thresh.schema.names # schema names gets updated dynamically! So must be separate from the loop

for head in orig_heads:
  # The timestamp, the well name and the target column are left untouched.
  if head == "datetime" or head == "WELLNAME" or head == "value": continue
  mean, std = test_thresh.select(F.mean(head), F.stddev(head)).first()
  test_thresh = test_thresh.withColumn("{}_".format(head), (F.col(head)-mean)/std)
  test_thresh = test_thresh.drop(head)
  test_thresh = test_thresh.withColumnRenamed("{}_".format(head), head)

pred_OW1 = test_thresh.select("datetime", "value", "WHP", "WHT", "Choke", "pbh", "Qgl").na.drop().toPandas()
X = pred_OW1[["WHP", "WHT", "Choke", "pbh", "Qgl"]].values
y = np.squeeze(pred_OW1[["value"]].values)

date = np.squeeze(pred_OW1[["datetime"]].values)

# Refit on ALL rows (no train/validation split this time).
model = LinearRegression(fit_intercept=True)
model.fit(X, y)
# Predict on the data the model was just fitted to. This used to predict on
# X_train, a stale array left over from the earlier split cell, so y_ did not
# line up with the X / y / date built in this cell.
y_ = model.predict(X)

In [39]:
# Coefficients of the model refitted on all rows (feature order: WHP, WHT, Choke, pbh, Qgl).
print(model.coef_)

In [40]:
# Start from the liquid-rate series since 2018-06-01 (value column renamed to
# "Qliq") and attach one column per sensor feature via a left join on the timestamp.
Prod_Qliq_Scale = OW1["LR"].where((OW1["LR"].datetime >= '2018-06-01')).withColumnRenamed("value", "Qliq")

for feature in ("WHP", "WHT", "Choke", "DHP", "GLR"):
  recent = OW1[feature].where((OW1[feature].datetime >= '2018-06-01'))
  avg_val = DataEng.avg_over_period(recent, "day")
  daily = avg_val.select(avg_val.datetime, avg_val.value.alias(feature))
  # Left join keeps every liquid-rate row; the joined frame's datetime copy is dropped.
  Prod_Qliq_Scale = Prod_Qliq_Scale.join(daily, Prod_Qliq_Scale.datetime == avg_val.datetime, how='left').drop(avg_val.datetime)


In [41]:
# Inspect the joined frame: Qliq plus the averaged WHP, WHT, Choke, DHP and GLR columns.
display(Prod_Qliq_Scale)

In [42]:
# Copy Qliq into the "value" column and threshold the frame (0.8).
Prod_Qliq_Scale = DataEng.threshold(
    Prod_Qliq_Scale.withColumn("value", Prod_Qliq_Scale["Qliq"]),
    0.8,
)

# Standardise every remaining column to (x - mean) / std. The column list is
# captured once up front because the frame is rebuilt inside the loop.
orig_heads = Prod_Qliq_Scale.schema.names

for head in orig_heads:
  # The timestamp, the well name and the target column are left untouched.
  if head in ("datetime", "WELLNAME", "value"):
    continue
  print("Standardising: ", head)
  mean, std = Prod_Qliq_Scale.select(F.mean(head), F.stddev(head)).first()
  scaled_name = "{}_".format(head)
  # Build the scaled copy under a temporary name, then swap it in for the original.
  Prod_Qliq_Scale = (
      Prod_Qliq_Scale
      .withColumn(scaled_name, (F.col(head) - mean) / std)
      .drop(head)
      .withColumnRenamed(scaled_name, head)
  )


In [43]:
# Collect the rows without nulls to pandas, index them by timestamp (the
# datetime column is kept as well) and sort chronologically.
pred_OW1 = Prod_Qliq_Scale.na.drop().toPandas()
pred_OW1 = pred_OW1.set_index(pred_OW1['datetime']).sort_index()

# Feature matrix and target vector for the production data.
y = np.squeeze(pred_OW1[["value"]].values)
X = pred_OW1[["WHP", "WHT", "Choke", "DHP", "GLR"]].values

In [44]:
# Apply the most recently fitted `model` to the production features.
# NOTE(review): that model was fitted on the columns (WHP, WHT, Choke, pbh, Qgl)
# while X here holds (WHP, WHT, Choke, DHP, GLR) — this relies on pbh/DHP and
# Qgl/GLR being the same quantities; confirm.
y_pred = model.predict(X)

In [45]:
# Compare the thresholded production values ('truth') with the predictions of
# the model applied above, over time.
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(pred_OW1['datetime'], pred_OW1['value'], "*--", label = 'truth')
ax.plot(pred_OW1['datetime'], y_pred, "*--", label = 'model')
ax.legend(loc='best')
ax.grid(True)
display(fig)

In [46]:
# Rebuild the joined frame from scratch: liquid rate since 2018-06-01 (value
# column renamed to "Qliq") plus one column per sensor feature, left-joined on
# the timestamp.
Prod_Qliq_Scale = OW1["LR"].where((OW1["LR"].datetime >= '2018-06-01')).withColumnRenamed("value", "Qliq")

for feature in ("WHP", "WHT", "Choke", "DHP", "GLR"):
  recent = OW1[feature].where((OW1[feature].datetime >= '2018-06-01'))
  avg_val = DataEng.avg_over_period(recent, "day")
  daily = avg_val.select(avg_val.datetime, avg_val.value.alias(feature))
  # Left join keeps every liquid-rate row; the joined frame's datetime copy is dropped.
  Prod_Qliq_Scale = Prod_Qliq_Scale.join(daily, Prod_Qliq_Scale.datetime == avg_val.datetime, how='left').drop(avg_val.datetime)


In [47]:
# Copy Qliq into the "value" column and threshold the frame (0.8).
Prod_Qliq_Scale = DataEng.threshold(
    Prod_Qliq_Scale.withColumn("value", Prod_Qliq_Scale["Qliq"]),
    0.8,
)

# Standardise every remaining column to (x - mean) / std. The column list is
# captured once up front because the frame is rebuilt inside the loop.
orig_heads = Prod_Qliq_Scale.schema.names

for head in orig_heads:
  # The timestamp, the well name and the target column are left untouched.
  if head in ("datetime", "WELLNAME", "value"):
    continue
  print("Standardising: ", head)
  mean, std = Prod_Qliq_Scale.select(F.mean(head), F.stddev(head)).first()
  scaled_name = "{}_".format(head)
  # Build the scaled copy under a temporary name, then swap it in for the original.
  Prod_Qliq_Scale = (
      Prod_Qliq_Scale
      .withColumn(scaled_name, (F.col(head) - mean) / std)
      .drop(head)
      .withColumnRenamed(scaled_name, head)
  )

# Collect the modelling columns (rows with nulls removed) to pandas, index by
# timestamp while keeping the datetime column, and sort chronologically.
pred_OW1 = (
    Prod_Qliq_Scale
    .select("datetime", "value", "WHP", "WHT", "Choke", "DHP", "GLR")
    .na.drop()
    .toPandas()
)
pred_OW1 = pred_OW1.set_index(pred_OW1['datetime']).sort_index()

X = pred_OW1[["WHP", "WHT", "Choke", "DHP", "GLR"]].values
y = np.squeeze(pred_OW1[["value"]].values)
date = np.squeeze(pred_OW1[["datetime"]].values)

# Fit on the production data itself and predict back on the same rows.
model = LinearRegression(fit_intercept=True)
model.fit(X, y)
y_ = model.predict(X)

print(model.coef_)

In [48]:
# Test-separator rows (nulls removed) collected to pandas for the comparison plot.
separator = test_thresh.select("datetime", "value", "WHP", "WHT", "Choke", "pbh", "Qgl").na.drop().toPandas()
# Keep only rows from 2018-06-01 onwards. Boolean indexing is the pandas way to
# filter rows: unlike Spark's DataFrame.where, pandas' DataFrame.where keeps
# every row and only blanks the non-matching ones out with NaN/NaT.
separator = separator[separator["datetime"] >= "2018-06-01"]
y_separator = np.squeeze(separator[["value"]].values)
date = np.squeeze(separator[["datetime"]].values)

# Production values, the linear-regression fit on them, and the test-separator
# values on one time axis.
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(pred_OW1['datetime'], pred_OW1['value'], "--", label = 'MIKON polynomial')
ax.plot(pred_OW1['datetime'], y_, "--", label = 'LR model')
ax.plot(date, y_separator, marker="+", label = "test separator")
ax.legend(loc='best')
ax.grid(True)
display(fig)