# This is for creating the training data with different numbers of adjacent wells
It will write 22 `csv` files to the directory, one for each number of adjacent wells tested. It is similar to `01_training_data_generation.ipynb`, but with more adjacent wells in the feature set.

These 300-well datasets can be downloaded from https://osf.io/a6cwh/, inside the `Training Datasets` folder.

In [None]:
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import numpy as np

Below we define the functions that build the dataset. Each function is descriptively named and documented with its own docstring.

In [None]:
def flatten(container):
    """
    Lazily flattens arbitrarily nested lists and tuples
    param container: an iterable whose items may themselves be lists or tuples
    return: a generator yielding every item that is not a list or tuple, in order
    """
    for element in container:
        if not isinstance(element, (list, tuple)):
            yield element
            continue
        # nested list/tuple: recurse and pass its items straight through
        yield from flatten(element)

def feature_list(no_of_neighbors):
    """
    Creates the ordered list of feature names for a given number of adjacent wells
    param no_of_neighbors: number of adjacent wells used in feature engineering
    return: a flat list of strings; each base feature is followed by its
        " neighbor 1" .. " neighbor N" variants, and the list ends with
        "x location", "y location" and "class"
    """
    print("Getting the features")
    base_features = ["thickness", "thickness natural log", "thickness power"]
    features = []
    for base in base_features:
        features.append(base)
        features.extend(
            f"{base} neighbor {k}" for k in range(1, no_of_neighbors + 1)
        )
    # extend (instead of appending a nested list) keeps the result flat,
    # so no separate flattening pass is needed
    features.extend(["x location", "y location", "class"])
    return features

def data_normalization(dataframe):
    """
    Min-max scales every column of the generated stratigraphic data
    param dataframe: a pandas dataframe that has been through feature engineering
    return: a new dataframe in which each column has its minimum subtracted
        and is divided by its own range (max - min)
    """
    print("normalizing")
    column_min = dataframe.min()
    # a constant column has zero range; swap in a tiny value so the division is defined
    column_range = (dataframe.max() - column_min).replace(0, 0.00001)
    return (dataframe - column_min) / column_range
    
def rotation(dataframe, j):
    """
    Creates spatial samples and transforms them in the xy plane
    param dataframe: dataframe output from stratigraphy generation; it is modified
        in place and must accept columns of len(np.arange(0, 40, 0.1)) values
    param j: number that sets the angle (j / 2 radians); the sign of the angle
        depends on whether j % 0.2 is greater than 0.1
    return: the same dataframe with the added "ex" and "ey" columns
    """
    x_coords = np.arange(0, 40, 0.1)
    y_coords = np.random.randint(0, 10, len(x_coords))
    # the angle is negated when j falls in the upper half of a 0.2 cycle
    angle = -j / 2 if j % 0.2 > 0.1 else j / 2
    cosine, sine = np.cos(angle), np.sin(angle)
    # NOTE(review): "ey" subtracts x * sin, so this is not a textbook rotation
    # matrix; kept as-is because the generated training data depends on it
    dataframe["ex"] = x_coords * cosine - y_coords * sine
    dataframe["ey"] = y_coords * cosine - x_coords * sine
    return dataframe

def missing(dataframe, number_of_layers):
    """
    Inserts missing data into the dataset at random by setting cells to 0
    param dataframe: the dataframe output from the rotation function; modified in place
    param number_of_layers: number of layers to evaluate; bounds the column
        positions that can be zeroed
    return: the same dataframe after 100 random cell draws have been set to 0
        (the same cell may be drawn more than once)
    """
    for _ in range(100):
        # the row is drawn before the column so the random stream is consumed
        # in a fixed order
        row = np.random.randint(0, 399)
        column = np.random.randint(0, number_of_layers - 1)
        # NOTE(review): randint excludes its upper bound, so row 399 and column
        # position number_of_layers - 1 are never selected -- confirm intended
        dataframe.iloc[row, column] = 0
    return dataframe

        
def adjacent_wells(dataframe, no_of_neighbors):
    """
    Finds each well's nearest neighbors and appends their depth columns to its own
    param dataframe: dataframe output from the missing function; the last two
        columns are used as the well coordinates, all other columns as depths
    param no_of_neighbors: number of adjacent wells used in feature engineering
    return: dataframe with one row per well holding the well's own depth columns
        followed by the depth columns of its neighbors, closest first
    """
    own_depths = dataframe.iloc[:, :-2]
    if no_of_neighbors == 0:
        # nothing to append, so skip the distance matrix entirely
        return own_depths.copy()
    # full pairwise distance matrix between the well locations
    hood = squareform(pdist(dataframe.iloc[:, -2:]))
    # sort once for all wells instead of once per well; position 0 is skipped
    # because it is normally the well itself (distance 0)
    nearest = hood.argsort()[:, 1 : no_of_neighbors + 1]
    neighbors = []
    for neighbor_rows in nearest:
        # lay the neighbors' depth columns side by side in a single row
        selected = dataframe.iloc[neighbor_rows, 0:-2].stack().to_frame().T
        # drop the row-label level so only the depth column names remain
        selected.columns = selected.columns.droplevel()
        neighbors.append(selected)
    frame = pd.concat(neighbors, sort=False)
    frame.index = range(len(frame))
    neighborhood = pd.concat([own_depths, frame], axis=1)
    return neighborhood
    
def depth_to_thickness(neighborhood, dataframe):
    """
    Converts the depth dataframe from the adjacent wells function to thicknesses
    param neighborhood: dataframe output from `adjacent_wells`
    param dataframe: dataframe output from function `missing`
    return: tuple (thicknesses, locations); thicknesses holds the column-to-column
        differences of `neighborhood` with negatives set to 0 and every "zero"
        column removed, locations is a copy of the last two columns of `dataframe`
    """
    # thickness = each depth column minus the column to its left
    thicknesses = neighborhood.diff(axis=1)
    # a negative difference is treated as zero thickness
    thicknesses = thicknesses.mask(thicknesses < 0, 0)
    # "zero" columns carry no thickness: the first one is NaN after diff, and any
    # later one is differenced against whatever column precedes it
    thicknesses = thicknesses.drop(columns="zero")
    # copy so the caller does not receive a slice tied to `dataframe`
    locations = dataframe.iloc[:, -2:].copy()
    return thicknesses, locations
  
    
def truncation(smallest, largest, step, names, number_of_layers, j):
    """
    Creates truncated stratal geometries: layers that follow a sinusoidal basement
    and are cut off by a roughly flat (constant plus noise) upper surface
    param smallest: lower bound of the random vertical offset drawn for each layer
    param largest: upper bound of the random vertical offset drawn for each layer
    param step: not used by this function
    param names: names of the layers as strings in a list
    param number_of_layers: number of layers to evaluate; number_of_layers - 1
        named layers are generated
    param j: number that, after rounding and adding 0.5, sets the wavelength
        of the sinusoid
    return: dataframe of 400 samples with a "zero" column and one column per layer
    """
    rolling = pd.DataFrame()
    j = np.round(j, decimals=3) + 0.5
    layer_offsets = sorted(
        np.random.uniform(smallest, largest, number_of_layers - 1)
    )
    for i, layer_name in enumerate(names[0 : number_of_layers - 1]):
        # noise-free sinusoid shared by the basement and the layer
        trend = 0.001 + 10 * np.sin(1 - np.arange(0, 40, 0.1) / (j * 2) + 0.001)
        basement = trend + np.random.rand(400) / 5
        # upper cut-off surface: a random level relative to the basement maximum
        cap_shift = np.random.uniform(basement.min() / 2, basement.max() / 64, 1)
        elevation = (
            np.full(400, basement.max() + cap_shift) + np.random.rand(400) / 5
        )
        # "zero" is rewritten on every pass, so the last pass's basement is kept
        rolling["zero"] = np.minimum(basement, elevation)

        layer_elevation = trend + abs(layer_offsets[i]) + np.random.rand(400) / 5
        # keep the layer between the basement and the cut-off surface
        layer_elevation = np.maximum(layer_elevation, basement)
        layer_elevation = np.minimum(layer_elevation, elevation)
        rolling[layer_name] = layer_elevation
    return rolling

def onlap(smallest, largest, step, names, number_of_layers, j):
    """
    Creates onlap stratal geometries: flat layers that follow the sinusoidal
    basement wherever the basement rises above them
    param smallest: lower bound of the random level drawn for each layer
    param largest: upper bound of the random level drawn for each layer
    param step: not used by this function
    param names: names of the layers as strings in a list
    param number_of_layers: number of layers to evaluate; number_of_layers - 1
        named layers are generated
    param j: number that, after rounding and adding 0.5, sets the wavelength
        of the basement sinusoid
    return: dataframe of 400 samples with a "zero" column and one column per layer
    """
    rolling = pd.DataFrame()
    j = np.round(j, decimals=3) + 0.5
    layer_levels = sorted(
        np.random.uniform(smallest, largest, number_of_layers - 1)
    )
    for i, layer_name in enumerate(names[0 : number_of_layers - 1]):
        basement = (
            0.001
            + 10 * np.sin(1 - np.arange(0, 40, 0.1) / (j * 2) + 0.001)
            + np.random.rand(400) / 5
        )
        # upper limiting surface: a random level relative to the basement maximum
        cap_shift = np.random.uniform(basement.min() / 2, basement.max() / 64, 1)
        elevation = (
            np.full(400, basement.max() + cap_shift) + np.random.rand(400) / 5
        )
        # "zero" is rewritten on every pass, so the last pass's basement is kept
        rolling["zero"] = np.minimum(basement, elevation)

        flat_top = np.full(400, layer_levels[i]) + np.random.rand(400) / 5
        # the layer sits on the basement where the basement is higher, and is
        # limited from above by the upper surface
        lapped = np.maximum(flat_top, basement)
        rolling[layer_name] = np.minimum(lapped, elevation)
    return rolling

def horizontal(smallest, largest, step, names, number_of_layers, j):
    """
    Creates horizontal stratal geometries using a min, max, names and number of layers
    param smallest: the smallest value for the random layer elevations
    param largest: the largest value for the random layer elevations
    param step: not used by this function
    param names: names of the layers as strings in a list
    param number_of_layers: number of layers to evaluate; number_of_layers - 1
        named layers are generated
    param j: not used by this function; accepted so the signature matches
        `truncation` and `onlap`
    return: dataframe of 400 samples with a "zero" column and one column per layer
    """
    rolling = pd.DataFrame()
    elevation_random = sorted(
        np.random.uniform(smallest, largest, number_of_layers - 1)
    )
    for i, layer_name in enumerate(names[0 : number_of_layers - 1]):
        strat_elevation = (
            np.full(400, elevation_random[i]) + np.random.rand(400) / 5
        )
        # draw a single offset (size 1) so it broadcasts against the 400 noise
        # samples for any number_of_layers; drawing number_of_layers - 1 values
        # here only broadcasts when number_of_layers == 2
        basement = strat_elevation - abs(
            np.random.uniform(smallest, largest, 1) + np.random.rand(400) / 5
        )
        elevation = (
            np.full(400, strat_elevation + elevation_random[i])
            + np.random.rand(400) / 5
        )
        topbasement = np.where(basement > elevation, elevation, basement)
        layer_elevation = np.where(
            strat_elevation > elevation, elevation, strat_elevation
        )
        # "zero" is rewritten on every pass, so the last pass's basement is kept
        rolling["zero"] = topbasement
        rolling[layer_name] = layer_elevation
    return rolling

def build_feature_engineered_dataset(thickness_df, locations_df):
    """
    Takes the generated thickness dataset and runs feature engineering
    param thickness_df: the generated thickness dataset
    param locations_df: the generated locations dataset
    return: dataframe holding, side by side, the thicknesses, their natural log,
        their 10th power and the locations; rows containing NaN are dropped and
        -inf values are replaced with 0
    """
    # log(0) yields -inf, which is replaced with 0 below, so only the
    # divide-by-zero warning is silenced
    with np.errstate(divide="ignore"):
        logged = np.log(thickness_df)
    powered = thickness_df ** 10
    feature_dataset = (
        pd.concat([thickness_df, logged, powered, locations_df], axis=1)
        .dropna()
        .replace(-np.inf, 0)
    )
    return feature_dataset
  

Next we define some global variables to build the dataset

In [None]:
# dummy names for the formations
NAMES = ["one", "two", "three"]
# the number of tops you want in your training data
NUMBER_OF_LAYERS = 2
# bounds for the random elevations, and the start, stop and increment of the
# j values swept when generating models
SMALLEST, LARGEST, STEP = -6, 12, 2
# how many adjacent wells to use for feature engineering; one dataset is
# generated per entry
NEIGHBORS_TO_TEST = [
    0, 1, 2, 3, 4, 5,
    10, 15, 20, 25, 35, 40, 50, 60, 75, 85, 95,
    125, 150, 200, 300, 399,
]

Last, we loop over the different numbers of neighbors, generate a dataset for each, and save it to disk.

In [None]:
# Build one labelled, normalized dataset per neighbor count.
for no_of_neighbors in NEIGHBORS_TO_TEST:

    # Re-seed on every pass: the generators do not depend on the neighbor count,
    # so each dataset is built from the same random stream.
    np.random.seed(19)
    # This draw is not used below, but it advances the random stream; removing
    # it would change every value generated afterwards.
    np.random.uniform(SMALLEST, LARGEST, NUMBER_OF_LAYERS - 1)

    flat_features = feature_list(no_of_neighbors)
    print(f"STARTING with {no_of_neighbors} neighbor")

    # (class label, generator) pairs, in the order they are generated and stacked
    generators = (
        ("truncation", truncation),
        ("onlap", onlap),
        ("horizontal", horizontal),
    )
    # per-class lists of frames; concatenated once at the end instead of
    # re-concatenating a growing frame on every pass
    pieces = {label: [] for label, _ in generators}

    for j in np.arange(SMALLEST, LARGEST, STEP):
        for label, generate in generators:
            # generate depths, add locations, then knock out random cells
            depths = generate(SMALLEST, LARGEST, STEP, NAMES, NUMBER_OF_LAYERS, j)
            rotated = rotation(depths, j)
            with_gaps = missing(rotated, NUMBER_OF_LAYERS)
            # attach neighbor depths and convert depths to thicknesses
            neighborhood = adjacent_wells(with_gaps, no_of_neighbors)
            thickness, well_locations = depth_to_thickness(neighborhood, with_gaps)
            # feature engineering, then our own min-max normalization
            engineered = build_feature_engineered_dataset(thickness, well_locations)
            normalized = data_normalization(engineered)
            # the class label is added after normalization so it is not scaled
            normalized["class"] = label
            pieces[label].append(normalized)

    # all truncation rows first, then onlap, then horizontal
    dataset = pd.concat(
        [frame for label, _ in generators for frame in pieces[label]]
    )
    dataset.columns = flat_features
    print(f"saving the training data for {no_of_neighbors}")
    # Writing to disk is disabled; uncomment the next line to save this dataset.
    # dataset.to_csv(str(no_of_neighbors) + "neighbors.csv")
    print(f"Done with {no_of_neighbors} neighbors")