In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

# Apply seaborn's default plot styling to the matplotlib figures drawn below.
sns.set()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
def flatten(container):
    """
    Lazily yields the leaf items of a nested list/tuple structure
    param container: iterable whose list or tuple members are expanded
        recursively; any other item (strings included) is yielded unchanged
    """
    for element in container:
        if not isinstance(element, (list, tuple)):
            yield element
        else:
            # Nested list/tuple: delegate to a recursive generator.
            yield from flatten(element)

def feature_list(no_of_neighbors):
    """
    Builds the flat list of column names used for a given number of adjacent wells.

    For each of the three thickness measures the list holds the measure itself
    followed by "<measure> neighbor k" for k = 1..no_of_neighbors; the list ends
    with "x location", "y location" and "class". A progress message is printed
    on every call.

    param no_of_neighbors: number of adjacent wells used in feature engineering
    return: list of column-name strings
    """
    print("Getting the features")
    base_measures = ("thickness", "thickness natural log", "thickness power")
    # One [measure, [neighbor columns...]] group per measure; flattened below.
    grouped = [
        [measure, [measure + " neighbor " + str(k) for k in range(1, no_of_neighbors + 1)]]
        for measure in base_measures
    ]
    grouped.append(["x location", "y location", "class"])
    return list(flatten(grouped))

In [None]:
# Load the well table from the working directory; the first CSV column is used as the index.
TRAINING_DATA = pd.read_csv(
    "399neighbors.csv",
    index_col=[0],
)

In [None]:
# For every neighbor count: select the matching feature columns, tune a random
# forest with an exhaustive grid search on the training split, then record the
# 5-fold cross-validated accuracy of a forest built with the best parameters.
ACCURACY_MEASURED = []  # per-fold accuracy arrays, one entry per neighbor count
VICINITY = []  # the number of adjacent wells matching each ACCURACY_MEASURED entry
RANDOM_STATE = 86  # one seed for the split and the forests so reruns are repeatable
# NOTE(review): the integer max_features values (100, 500) can exceed the number
# of feature columns for small neighbor counts (3 * (i + 1) + 2 columns) --
# confirm the installed scikit-learn accepts them instead of failing those fits.
grid_params = {
    'bootstrap': [True, False],
    'max_depth': [1, 10, 100],
    'max_features': [1, 100, 500],
    'min_samples_leaf': [10, 100, 1000],
    'min_samples_split': [10, 100, 1000],
    'n_estimators': [10, 100, 1000],
    'criterion': ['gini', 'entropy']
}


for i in range(400):
    print(f'{i} number of wells in vicinity')
    wells_in_vicinity = i
    flat_features = feature_list(wells_in_vicinity)
    subset = TRAINING_DATA[flat_features]
    le = preprocessing.LabelEncoder()
    le_class = le.fit_transform(subset['class'])
    # Build a new frame instead of writing into the column selection: assigning
    # to / dropping in place on a selection of TRAINING_DATA triggers pandas'
    # SettingWithCopyWarning. Column order is unchanged (features, then le_class).
    subset = subset.drop(columns='class').assign(le_class=le_class)
    X_train, X_test, y_train, y_test = train_test_split(
        subset.drop(columns="le_class"),
        subset["le_class"],
        test_size=0.2,
        random_state=RANDOM_STATE,
    )
    # Seed the forests as well; otherwise the selected parameters and the
    # recorded accuracies change from run to run.
    gs = GridSearchCV(
        RandomForestClassifier(random_state=RANDOM_STATE),
        grid_params,
        verbose=8,
        cv=5,
        n_jobs=7,
    )
    gs_results = gs.fit(X_train, y_train)
    rtclf = RandomForestClassifier(random_state=RANDOM_STATE, **gs.best_params_)

    # NOTE(review): this re-fits a fresh forest inside a 5-fold CV on the 20 %
    # hold-out only; the estimator GridSearchCV fitted on X_train is not reused.
    # Confirm this is the intended evaluation.
    cved = cross_val_score(
        rtclf,
        X_test,
        y_test,
        cv=5,
        scoring="accuracy",
    )
    ACCURACY_MEASURED.append(cved)
    VICINITY.append(wells_in_vicinity)

In [None]:
# Draw one per-fold accuracy curve per neighbor count and collect each curve's
# mean (rounded to 4 decimals) in MEAN_ACCURACY.
MEAN_ACCURACY = []
for neighbor_count, fold_scores in zip(VICINITY, ACCURACY_MEASURED):
    plt.plot(fold_scores, label=str(neighbor_count) + " Neighbors")
    MEAN_ACCURACY.append(fold_scores.mean().round(4))
if ACCURACY_MEASURED:
    # Build the legend once, after every curve exists, rather than rebuilding
    # it on each loop iteration.
    plt.legend()
plt.xlabel("Fold Number")
plt.ylabel("Accuracy")

In [None]:
# Mean cross-validated accuracy as a function of the number of adjacent wells.
mean_accuracy_axes = plt.gca()
mean_accuracy_axes.plot(VICINITY, MEAN_ACCURACY)
mean_accuracy_axes.set_xlabel("number of adjacent wells")
mean_accuracy_axes.set_ylabel("mean cross-validated accuracy")