In [193]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from scipy.stats import pearsonr

# Load the per-well production table from disk, then preview its first rows.
wellData = pd.read_csv(filepath_or_buffer="Data/well production.csv")
wellData.head(n=5)

Unnamed: 0,well name,average pressure (Pa),recovery factor,formation volume factor,oil 1,oil 2,oil 3,oil 4,oil 5,oil 6,...,water 3,water 4,water 5,water 6,water 7,water 8,water 9,water 10,water 11,water 12
0,Peak 6-217H,35352874,0.092554,1.6,862.0,824.0,759.0,728.0,661.0,601.0,...,180.0,211.0,277.0,338.0,397.0,445.0,547.0,562.0,639.0,668.0
1,Tarragon 4-119H,34882173,0.107706,1.6,228.0,249.0,214.0,210.0,161.0,163.0,...,22.0,27.0,75.0,74.0,59.0,90.0,124.0,119.0,126.0,157.0
2,Fennel 10-129H,36064538,0.07915,1.6,67.0,85.0,73.0,73.0,57.0,58.0,...,15.0,15.0,31.0,30.0,33.0,31.0,20.0,49.0,30.0,41.0
3,Federal 14-113H,35817881,0.103748,1.6,256.0,242.0,267.0,263.0,199.0,191.0,...,9.0,13.0,78.0,86.0,119.0,134.0,139.0,162.0,136.0,183.0
4,King 7-184H,38442406,0.084675,1.6,23.0,29.0,31.0,50.0,72.0,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0


In [194]:
# Build `genData`: one summary row per well listed in `wellData`.
#
# Rows are collected in a plain list and converted to a DataFrame once, after
# the loop. DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0, and appending row-by-row copies the whole frame on every
# iteration. Building the frame in one go also lets pandas infer numeric
# dtypes for the numeric columns.
genColumns = ['name', 'length', 'stageCount', 'propWeight', 'pumpRate', 'oilProd']

# Column names assigned to every per-well CSV (the files are read positionally,
# so each one must contain exactly these 11 columns in this order).
stageColumns = ['east', 'north', 'por', 'perm', 'poisson', 'young', 'waterSat',
                'oilSat', 'thick', 'propWeight', 'pumpRate']

wellRows = []

# for every well logged
for i in range(len(wellData.index)):
    # the first column of wellData holds the well name, which is also the
    # stem of that well's CSV file
    name = wellData.iloc[i, 0]
    fileName = f"Data/{name}.csv"
    tempDF = pd.read_csv(fileName)
    tempDF.columns = stageColumns

    # Well length is taken as last-row easting minus first-row easting.
    # NOTE(review): assumes rows are ordered along the lateral and that the
    # well runs east-west -- confirm against the data.
    wellLen = tempDF['east'].iloc[-1] - tempDF['east'].iloc[0]

    # number of stages = rows with a non-null proppant weight; used as the
    # denominator for both averages below
    stageCount = int(tempDF['propWeight'].count())
    # average proppant weight per stage
    avgProp = tempDF['propWeight'].sum() / stageCount
    # average pump rate per stage
    avgPumpRate = tempDF['pumpRate'].sum() / stageCount

    # Total production: sum of columns 4..15 of this well's row.
    # presumably the twelve monthly oil columns -- TODO confirm column layout
    prod = wellData.iloc[i, 4 : 16].sum()

    wellRows.append([name, wellLen, stageCount, avgProp, avgPumpRate, prod])

# general dataframe with information about all wells
genData = pd.DataFrame(wellRows, columns = genColumns)

In [204]:
def scatter(df, xCol, yCol):
    """Display a scatter plot of ``df[yCol]`` against ``df[xCol]``.

    Parameters
    ----------
    df : object indexable by column name (e.g. a pandas DataFrame)
    xCol : column plotted on the horizontal axis; also used as its label
    yCol : column plotted on the vertical axis; also used as its label

    The figure is titled "<yCol> vs <xCol>", drawn without the top and right
    frame lines, and shown immediately. Nothing is returned.
    """
    fig, ax = plt.subplots(figsize = (10, 5))
    ax.scatter(df[xCol], df[yCol])

    ax.set_title(f"{yCol} vs {xCol}")
    ax.set_xlabel(xCol)
    ax.set_ylabel(yCol)

    # hide the top and right frame lines
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    plt.show()
    
def pow(list, power):
    """Return a new list holding each element of ``list`` raised to ``power``.

    Note: this function's name and its first parameter shadow the Python
    builtins ``pow`` and ``list`` within their respective scopes.
    """
    raised = []
    for value in list:
        raised.append(value ** power)
    return raised

    

In [209]:
# For each predictor column of genData, raise it to a range of integer powers
# and report every power whose Pearson correlation with oil production
# exceeds the threshold.
oil = 'oilProd'
minPower, maxPower = -6, 5   # inclusive range of exponents tried
corrThreshold = .6           # minimum Pearson r for a transform to be printed

for name in ['length', 'stageCount', 'propWeight', 'pumpRate']:
    currList = genData[name]
    for i in range(minPower, maxPower + 1):
        if i == 0:
            # x**0 is 1 for every well, and Pearson r is undefined for a
            # constant series (scipy returns NaN and emits a warning). NaN can
            # never pass the threshold, so skip it.
            continue
        newList = pow(currList, i)
        linCo = pearsonr(newList, genData[oil])[0]
        # NOTE(review): the comparison is signed, so strong *negative*
        # correlations (as inverse powers of a positively correlated
        # predictor would give) are not reported -- confirm whether
        # abs(linCo) was intended.
        if linCo > corrThreshold:
            print(f"{name} to the power of {i}")
            print(linCo)
    # Correlations of the untransformed predictor. Not used in this cell;
    # kept in case later cells read them -- confirm, otherwise remove.
    pearson = pearsonr(genData[name], genData[oil])
    spearman = spearmanr(genData[name], genData[oil])

    

stageCount to the power of 1
0.6564443734959791
stageCount to the power of 2
0.6633735776609535
stageCount to the power of 3
0.643127022481601
stageCount to the power of 4
0.6268549260623296
stageCount to the power of 5
0.617601149132221
