# Initial model selection

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
def flatten(container):
    """
    Lazily yields the leaf items of a nested list/tuple structure, in order
    param container: iterable whose list or tuple members (at any depth) are expanded;
                     every other item, including strings, is yielded unchanged
    """
    for element in container:
        if not isinstance(element, (list, tuple)):
            yield element
            continue
        # Nested list/tuple: recurse and pass its leaves straight through
        yield from flatten(element)

def feature_list(no_of_neighbors):
    """
    Creates a list of features given number of adjacent wells

    For each base thickness feature the list holds the feature name followed by
    its "<feature> neighbor <i>" variants for i = 1..no_of_neighbors, and it
    ends with "x location", "y location" and "class".

    param no_of_neighbors: number of adjacent wells used in feature engineering
    return: flat list of column-name strings
    """
    print("Getting the features")
    initial = ["thickness", "thickness natural log", "thickness power"]
    features = []
    for item in initial:
        features.append(item)
        features.extend(
            item + " neighbor " + str(i) for i in range(1, no_of_neighbors + 1)
        )
    # extend (rather than append) keeps the list flat, so no flattening pass is needed
    features.extend(["x location", "y location", "class"])
    return features

### Read in the synthetic training data and select the feature columns for the wells in the local vicinity

In [3]:
# Load the wells table; the first CSV column is used as the row index
dataset = pd.read_csv("399neighbors.csv", index_col=[0])

# Number of neighboring wells whose engineered features are kept
wells_in_vicinity = 10
flat_features = feature_list(wells_in_vicinity)

# Keep only the selected feature columns (the list also contains the "class" label)
subset = dataset.loc[:, flat_features]

Getting the features


### split into train and test subsets

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    subset.drop(columns="class"),  # predictors: every selected column except the label
    subset["class"],               # target label
    test_size=0.2,
    random_state=86,
)

### train and score the SVM classifier

In [5]:
# SVM
# Support vector classifier with library defaults; fit() returns the fitted estimator
svmclf = svm.SVC().fit(X_train, y_train)
# Mean accuracy on the held-out test split
svmclf.score(X_test, y_test)

0.8523148148148149

### train and score the decision tree classifier

In [6]:
# Decision Tree
# random_state is pinned (same value as the train/test split seed): the tree
# permutes candidate features at each split, so unseeded runs can score differently
dtclf = DecisionTreeClassifier(random_state=86)
dtclf.fit(X_train, y_train)
# Mean accuracy on the held-out test split
dtclf.score(X_test, y_test)

0.9314814814814815

### train and score the random forest

In [7]:
# Random Forest
# random_state is pinned (same value as the train/test split seed): bootstrap
# sampling and per-split feature sampling are random, so unseeded scores vary run to run
rfclf = RandomForestClassifier(random_state=86)
rfclf.fit(X_train, y_train)
# Mean accuracy on the held-out test split
rfclf.score(X_test, y_test)

0.9842592592592593

### train and score adaboost

In [8]:
# AdaBoost
# random_state is pinned (same value as the train/test split seed) so the
# boosted base estimators are seeded identically on every run
abclf = AdaBoostClassifier(random_state=86)
abclf.fit(X_train, y_train)
# Mean accuracy on the held-out test split
abclf.score(X_test, y_test)

0.6731481481481482

### train and score the KNN classifier

In [9]:
# KNN
# k-nearest-neighbors classifier with library defaults; fit() returns the fitted estimator
knclf = KNeighborsClassifier().fit(X_train, y_train)
# Mean accuracy on the held-out test split
knclf.score(X_test, y_test)

0.8652777777777778