# Initial model selection

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import jaccard_score

In [11]:
def flatten(container):
    """
    Lazily flattens arbitrarily nested lists and tuples
    param container: iterable whose list/tuple items are expanded recursively
    return: generator yielding the leaf items in their original order

    Only list and tuple items are descended into; any other item (strings
    included) is yielded unchanged.
    """
    for item in container:
        if isinstance(item, (list, tuple)):
            # Delegate to the recursive generator instead of re-yielding by hand
            yield from flatten(item)
        else:
            yield item

def feature_list(no_of_neighbors):
    """
    Creates a list of features given number of adjacent wells
    param no_of_neighbors: number of adjacent wells used in feature engineering
    return: flat list of column names - each thickness feature followed by its
        "<feature> neighbor <i>" variants for i = 1..no_of_neighbors, then
        "x location", "y location" and "class"
    """
    print("Getting the features")
    initial = ["thickness", "thickness natural log", "thickness power"]
    features = []
    for item in initial:
        features.append(item)
        # Neighbor columns are numbered from 1; the range is empty (no columns
        # added) when no_of_neighbors is 0 or negative
        features.extend(f"{item} neighbor {i}" for i in range(1, no_of_neighbors + 1))
    # extend keeps the list flat, so no separate flattening pass is required
    features.extend(["x location", "y location", "class"])
    return features

### Read in the synthetic training data and select the feature columns for the wells in the local vicinity

In [12]:
# Number of adjacent wells whose engineered columns are kept
# (0 keeps only the columns without a "neighbor" suffix)
wells_in_vicinity = 0

# Load the training table, using the first CSV column as the index
dataset = pd.read_csv(r'399neighbors.csv', index_col=[0])

# Column names for the chosen neighborhood size; the list includes "class"
flat_features = feature_list(wells_in_vicinity)
subset = dataset.loc[:, flat_features]

Getting the features


### split into train and test subsets

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# "class" is the target column, every other selected column is a predictor.
X_train, X_test, y_train, y_test = train_test_split(
    subset.drop(columns="class"),
    subset["class"],
    test_size=0.2,
    random_state=86,
)

### train and score the SVM classifier

In [14]:
# SVM
# Fit a support vector classifier with library defaults and predict the held-out rows
svmclf = svm.SVC().fit(X_train, y_train)
y_pred = svmclf.predict(X_test)

# NOTE: the metric reported here is the Jaccard score, which the printed
# labels refer to as "accuracy".
print(f'Accuracy for each class is {jaccard_score(y_test, y_pred, average=None)}')
weighted_jc_score = jaccard_score(y_test, y_pred, average='weighted')
print(f'Average weighted accuracy is {weighted_jc_score:.2f}')

Accuracy for each class is [0.75636212 0.48589126 0.30998249]
Average weighted accuracy is 0.52


### train and score the decision tree classifier

In [15]:
# Decision Tree
# Seeded so the fitted tree (and therefore the scores below) is the same on
# every run; the seed matches the one used for the train/test split.
dtclf = DecisionTreeClassifier(random_state=86)
dtclf.fit(X_train, y_train)
y_pred = dtclf.predict(X_test)
# NOTE: the metric reported here is the Jaccard score, which the printed
# labels refer to as "accuracy".
weighted_jc_score = jaccard_score(y_test, y_pred, average='weighted')
print(f'Accuracy for each class is {jaccard_score(y_test, y_pred, average=None)}')
print(f'Average weighted accuracy is {weighted_jc_score:.2f}')

Accuracy for each class is [0.92240493 0.72088966 0.73125565]
Average weighted accuracy is 0.79


### train and score the random forest

In [16]:
# Random Forest
# Seeded so the randomly built ensemble (and therefore the scores below) is
# the same on every run; the seed matches the one used for the train/test split.
rfclf = RandomForestClassifier(random_state=86)
rfclf.fit(X_train, y_train)
y_pred = rfclf.predict(X_test)
# NOTE: the metric reported here is the Jaccard score, which the printed
# labels refer to as "accuracy".
weighted_jc_score = jaccard_score(y_test, y_pred, average='weighted')
print(f'Accuracy for each class is {jaccard_score(y_test, y_pred, average=None)}')
print(f'Average weighted accuracy is {weighted_jc_score:.2f}')

Accuracy for each class is [0.93062693 0.76422018 0.77320054]
Average weighted accuracy is 0.82


### train and score the AdaBoost classifier

In [17]:
# AdaBoost
# Seeded so the boosted ensemble (and therefore the scores below) is the same
# on every run; the seed matches the one used for the train/test split.
abclf = AdaBoostClassifier(random_state=86)
abclf.fit(X_train, y_train)
y_pred = abclf.predict(X_test)
# NOTE: the metric reported here is the Jaccard score, which the printed
# labels refer to as "accuracy".
weighted_jc_score = jaccard_score(y_test, y_pred, average='weighted')
print(f'Accuracy for each class is {jaccard_score(y_test, y_pred, average=None)}')
print(f'Average weighted accuracy is {weighted_jc_score:.2f}')

Accuracy for each class is [0.7580574  0.36306672 0.39484199]
Average weighted accuracy is 0.50


### train and score the KNN classifier

In [18]:
# KNN
# Fit a k-nearest-neighbors classifier with library defaults and predict the held-out rows
knclf = KNeighborsClassifier().fit(X_train, y_train)
y_pred = knclf.predict(X_test)

# NOTE: the metric reported here is the Jaccard score, which the printed
# labels refer to as "accuracy".
print(f'Accuracy for each class is {jaccard_score(y_test, y_pred, average=None)}')
weighted_jc_score = jaccard_score(y_test, y_pred, average='weighted')
print(f'Average weighted accuracy is {weighted_jc_score:.2f}')

Accuracy for each class is [0.72285464 0.58974359 0.53508412]
Average weighted accuracy is 0.61
