In [4]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import math
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

#reading in the data file
wellData = pd.read_csv("Data/well production.csv")
wellData.head()

Unnamed: 0,well name,average pressure (Pa),recovery factor,formation volume factor,oil 1,oil 2,oil 3,oil 4,oil 5,oil 6,...,water 3,water 4,water 5,water 6,water 7,water 8,water 9,water 10,water 11,water 12
0,Peak 6-217H,35352874,0.092554,1.6,862.0,824.0,759.0,728.0,661.0,601.0,...,180.0,211.0,277.0,338.0,397.0,445.0,547.0,562.0,639.0,668.0
1,Tarragon 4-119H,34882173,0.107706,1.6,228.0,249.0,214.0,210.0,161.0,163.0,...,22.0,27.0,75.0,74.0,59.0,90.0,124.0,119.0,126.0,157.0
2,Fennel 10-129H,36064538,0.07915,1.6,67.0,85.0,73.0,73.0,57.0,58.0,...,15.0,15.0,31.0,30.0,33.0,31.0,20.0,49.0,30.0,41.0
3,Federal 14-113H,35817881,0.103748,1.6,256.0,242.0,267.0,263.0,199.0,191.0,...,9.0,13.0,78.0,86.0,119.0,134.0,139.0,162.0,136.0,183.0
4,King 7-184H,38442406,0.084675,1.6,23.0,29.0,31.0,50.0,72.0,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0


In [12]:
# general dataframe with information about all wells
genData = pd.DataFrame(columns = ['name', 'length', 'stageCount', 'propWeight', 
                                  'pumpRate', 'oilProd', 'easting', 'northing', 'ooip'])

# for every well logged
for i in range(len(wellData.index)):
    name = wellData.iloc[i,0]
    # creating file name to be read
    fileName = f"Data/{name}.csv"
    tempDF = pd.read_csv(fileName)
    # storing csv of current well
    tempDF.columns = ['east', 'north', 'por', 'perm', 'poisson', 'young', 'waterSat', 
                      'oilSat', 'thick', 'propWeight', 'pumpRate']
    # finds total well length
    wellLen = tempDF.iloc[len(tempDF.index) - 1, 0] - tempDF.iloc[0, 0]
    # number of stages, used for averages
    stageCount = tempDF['propWeight'].count()
    # calculates average proppant weight per stage
    avgProp = tempDF['propWeight'].sum() / stageCount
    # calculates average pump rate for stage
    avgPumpRate = tempDF['pumpRate'].sum() / stageCount
    # calculates total yearly well production
    prod = wellData.iloc[i, 4 : 16].sum()
    #calculate locations of wells 
    easting = tempDF['east'].mean()
    northing = tempDF['north'].mean()
    
    # calculate reservoir area
    b = 10
    res_area = wellLen * 2 * b + math.pi * (b ** 2) / 43650
    
    # calculate thickness of reservoir
    avg_thickness = tempDF['thick'].mean()
    
    # calculate avg porosity
    avg_porosity = tempDF['por'].mean()
    
    # calculate avg water saturation
    avg_waterSat = tempDF['waterSat'].mean()
    
    # calculate formation volume factor
    fvf = 1.6
    
    # calculate ooip
    ooip = 7758 * res_area * avg_thickness * avg_porosity * (1 - avg_waterSat) / fvf
    
    
    #appends well-specific information into general dataframe for plotting
    genData = genData.append(pd.Series([name, wellLen, stageCount, avgProp, avgPumpRate, prod,
                                       easting, northing, ooip], 
                             index = genData.columns), ignore_index = True)

In [13]:
genData.head()

Unnamed: 0,name,length,stageCount,propWeight,pumpRate,oilProd,easting,northing,ooip
0,Peak 6-217H,7565.0,100,910422.941447,294.800448,6808.0,81782.32,1600.0,1454513000.0
1,Tarragon 4-119H,9643.0,50,728628.013775,288.362177,1968.0,93221.44,32200.0,1708612000.0
2,Fennel 10-129H,10963.0,25,988860.773545,290.68517,736.0,13681.29,200.0,188249900.0
3,Federal 14-113H,10664.0,34,690947.011162,298.807597,2204.0,84831.86,31000.0,3947821000.0
4,King 7-184H,7276.0,50,832067.579729,295.943915,490.0,9038.0,79200.0,24547210.0


In [14]:
locData = genData[["easting", "northing"]]
locData.head()

Unnamed: 0,easting,northing
0,81782.32,1600.0
1,93221.44,32200.0
2,13681.29,200.0
3,84831.86,31000.0
4,9038.0,79200.0


In [15]:
y = genData['ooip'].values
x_train, x_test, y_train, y_test = train_test_split(locData, y, test_size = .1)

In [16]:
# fit the random forest regressor
rf = RandomForestRegressor()
rf.fit(x_train, y_train)

print(rf.score(x_test, y_test))

0.8338312899392174


In [None]:
# plug in 10 well locations