In [31]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

#reading in the data file
wellData = pd.read_csv("Data/well production.csv")
wellData.head()

Unnamed: 0,well name,average pressure (Pa),recovery factor,formation volume factor,oil 1,oil 2,oil 3,oil 4,oil 5,oil 6,...,water 3,water 4,water 5,water 6,water 7,water 8,water 9,water 10,water 11,water 12
0,Peak 6-217H,35352874,0.092554,1.6,862.0,824.0,759.0,728.0,661.0,601.0,...,180.0,211.0,277.0,338.0,397.0,445.0,547.0,562.0,639.0,668.0
1,Tarragon 4-119H,34882173,0.107706,1.6,228.0,249.0,214.0,210.0,161.0,163.0,...,22.0,27.0,75.0,74.0,59.0,90.0,124.0,119.0,126.0,157.0
2,Fennel 10-129H,36064538,0.07915,1.6,67.0,85.0,73.0,73.0,57.0,58.0,...,15.0,15.0,31.0,30.0,33.0,31.0,20.0,49.0,30.0,41.0
3,Federal 14-113H,35817881,0.103748,1.6,256.0,242.0,267.0,263.0,199.0,191.0,...,9.0,13.0,78.0,86.0,119.0,134.0,139.0,162.0,136.0,183.0
4,King 7-184H,38442406,0.084675,1.6,23.0,29.0,31.0,50.0,72.0,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0


In [32]:
# general dataframe with information about all wells
genData = pd.DataFrame(columns = ['name', 'length', 'stageCount', 'propWeight', 
                                  'pumpRate', 'oilProd', 'easting', 'northing'])

# for every well logged
for i in range(len(wellData.index)):
    name = wellData.iloc[i,0]
    # creating file name to be read
    fileName = f"Data/{name}.csv"
    tempDF = pd.read_csv(fileName)
    # storing csv of current well
    tempDF.columns = ['east', 'north', 'por', 'perm', 'poisson', 'young', 'waterSat', 
                      'oilSat', 'thick', 'propWeight', 'pumpRate']
    # finds total well length
    wellLen = tempDF.iloc[len(tempDF.index) - 1, 0] - tempDF.iloc[0, 0]
    # number of stages, used for averages
    stageCount = tempDF['propWeight'].count()
    # calculates average proppant weight per stage
    avgProp = tempDF['propWeight'].sum() / stageCount
    # calculates average pump rate for stage
    avgPumpRate = tempDF['pumpRate'].sum() / stageCount
    # calculates total yearly well production
    prod = wellData.iloc[i, 4 : 16].sum()
    #calculate locations of wells 
    easting = tempDF['east'].mean()
    northing = tempDF['north'].mean()
    #appends well-specific information into general dataframe for plotting
    genData = genData.append(pd.Series([name, wellLen, stageCount, avgProp, avgPumpRate, prod,
                                       easting, northing], 
                             index = genData.columns), ignore_index = True)

In [33]:
def scatter(df, xCol, yCol):
    x = df[xCol]
    y = df[yCol]
    
    fig, ax = plt.subplots(figsize = (10, 5))
    ax.scatter(x, y)
    ax.set_title(f"{yCol} vs {xCol}")
    
    ax.set_xlabel(xCol)
    ax.set_ylabel(yCol)
    
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    
    plt.show()
    
def pow(list, power):
    return [x**power for x in list]

    

In [34]:
genData.head()

Unnamed: 0,name,length,stageCount,propWeight,pumpRate,oilProd,easting,northing
0,Peak 6-217H,7565.0,100,910422.941447,294.800448,6808.0,81782.32,1600.0
1,Tarragon 4-119H,9643.0,50,728628.013775,288.362177,1968.0,93221.44,32200.0
2,Fennel 10-129H,10963.0,25,988860.773545,290.68517,736.0,13681.29,200.0
3,Federal 14-113H,10664.0,34,690947.011162,298.807597,2204.0,84831.86,31000.0
4,King 7-184H,7276.0,50,832067.579729,295.943915,490.0,9038.0,79200.0


In [35]:
noName = genData[['length', 'stageCount', 'propWeight', 'pumpRate', 'easting', 'northing']]
y = genData['oilProd'].values
x_train, x_test, y_train, y_test = train_test_split(noName, y, test_size = .01)

n_estimators = [int(x) for x in np.linspace(start=100, stop=900, num=100)]
# the number of features to use at each split
max_features = ["auto", "sqrt"]
# max number of levels in each tree
max_depth = [int(x) for x in np.linspace(10, 220, num=11)]
max_depth.append(None)
# minimum samples needed to split a tree
min_samples_split = [2, 5, 10, 15, 20]
# minimum samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8, 16]
# method for selecting samples
bootstrap = [True, False]

# create the grid
random_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}

In [36]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=19,
    n_jobs=-1,
)
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   12.1s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [159]:
def compare(model, test_features, test_actual):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_actual)
    mape = 100 * np.mean(errors / test_actual)
    accuracy = 100 - mape
    print("Model Performance")
    print("Average Error: {:0.4f}".format(np.mean(errors)))
    print("Accuracy = {:0.2f}%.".format(accuracy))

    return accuracy

base_model = RandomForestRegressor(n_estimators=10)
base_model.fit(x_train, y_train)
base_accuracy = compare(base_model, x_train, y_train)

best_random = rf_random.best_estimator_
random_accuracy = compare(best_random, x_train, y_train)

Model Performance
Average Error: 217.0767
Accuracy = 85.80%.
Model Performance
Average Error: 0.0000
Accuracy = 100.00%.


In [160]:
feat_labels = noName.columns.values  # get the feature labels
feature = list(
    zip(feat_labels, best_random.feature_importances_)
)  # make a list of the feature labels and the importance values
sorted(
    feature, key=lambda tup: tup[1], reverse=True
)  # sort from most to least important feature in predicting production

[('stageCount', 0.38513674584359237),
 ('northing', 0.23361960577692542),
 ('easting', 0.19398105915475686),
 ('length', 0.06546742717104917),
 ('pumpRate', 0.062157220201362856),
 ('propWeight', 0.05963794185231324)]

In [161]:
from sklearn.model_selection import cross_val_score, GridSearchCV

noName.drop(['propWeight', 'pumpRate', 'length'], axis = 1)
x_train, x_test, y_train, y_test = train_test_split(noName, y, test_size = .1) 
newRf = RandomForestRegressor(n_estimators = 100)
newRf.fit(x_train, y_train)
scores = cross_val_score(newRf, x_test, y_test, cv=10, scoring='neg_mean_absolute_error')
print(scores)

[  -91.4   -275.56  -566.56  -782.92  -558.97 -1873.38 -1344.58  -762.93
 -1148.18 -1195.91]
