# **<h1><center> Production EDA </center></h1>**

## Import Packages

In [4]:
import pandas as pd
import utm
import sqlite3
import random
import numpy as np
from statistics import mean 
import plotly.express as px
import plotly.graph_objects as go
pd.set_option("display.max_rows",1000); pd.options.display.precision = 1
import warnings
warnings.filterwarnings("ignore")

### Connect to SQlite3 DataBase

In [5]:
%load_ext sql
%sql sqlite:////Users/john.odonnell/Python/Web_Scraping/NDIC/Well_DataBase.db

### View Tables

In [15]:
%%sql 

SELECT 
    name 
FROM sqlite_master 
WHERE type ='table';

 * sqlite:////Users/john.odonnell/Python/Web_Scraping/NDIC/Well_DataBase.db
Done.


name
prod_table
header_table
prod_table_clean
header_table_clean


## Import Production data

In [7]:
query = %sql SELECT UWI,Date,Days,Oil,Water,Gas FROM prod_table

df_production = query.DataFrame()
df_production.set_index('UWI',inplace=True)

 * sqlite:////Users/john.odonnell/Python/Web_Scraping/NDIC/Well_DataBase.db
Done.


## Prepare Production Data
- Cast `Date` to datetime
- Sort data by date
- Calculate total fluid, WOR, GOR

In [8]:
df_production['Date'] = pd.to_datetime(df_production['Date'], infer_datetime_format=True)
df_production.sort_values('Date',inplace=True)
df_production['Fluid'] = df_production['Oil'] + df_production['Water']
df_production['WOR'] = round(df_production['Water'] / df_production['Oil'],3)
df_production['GOR'] = round(df_production['Gas'] / df_production['Oil'],3)
print(df_production.shape)
df_production.describe().transpose()

(1250870, 8)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Days,1200000.0,26.0,8.9,0.0,27.0,30.0,31.0,31.0
Oil,1200000.0,3026.7,4615.6,-1.0,761.0,1588.0,3269.0,140000.0
Water,1200000.0,2805.4,5373.5,-53.0,498.0,1297.0,2883.0,650000.0
Gas,1200000.0,4938.2,8549.1,0.0,928.0,2275.0,4986.0,200000.0
Fluid,1200000.0,5832.1,9095.8,-53.0,1575.0,3113.0,6228.0,650000.0
WOR,1200000.0,,,-inf,0.4,0.8,1.5,inf
GOR,1200000.0,inf,,0.0,0.9,1.4,2.2,inf


## Create Clean Production Table

In [10]:
cnx = sqlite3.connect(r"/Users/john.odonnell/Python/Web_Scraping/NDIC/Well_DataBase.db")
df_production.to_sql("prod_table_clean", cnx, if_exists='replace')

## Import Header data

In [11]:
query = %sql SELECT * FROM header_table
df_header = query.DataFrame()
df_header.set_index('UWI',inplace=True)

 * sqlite:////Users/john.odonnell/Python/Web_Scraping/NDIC/Well_DataBase.db
Done.


### Get utm X and Y for lat / long for each well

In [12]:
for uwi in df_header.index.tolist():
    x,y,z,z = utm.from_latlon(df_header.loc[uwi,'Latitude'], df_header.loc[uwi,'Longitude'])
    df_header.loc[uwi,'SHL_X'] = x
    df_header.loc[uwi,'SHL_Y'] = y
print('Done',df_header[['Latitude','Longitude','SHL_X','SHL_Y']].sample(5))

Done                 Latitude  Longitude     SHL_X    SHL_Y
UWI                                                   
33053066570000      47.7     -103.0  653080.9  5.3e+06
33053028850000      47.8     -102.8  665439.6  5.3e+06
33105028570000      48.3     -103.6  606529.9  5.3e+06
33105026870000      48.3     -103.0  650134.8  5.4e+06
33061033100000      48.2     -102.7  673102.2  5.3e+06


### Feature Manipulation and Cleaning
- Cast Completion and Spud date to datetime
- Calculate vintage year
- Get top perf and bottom perf
- Get Perforated Length
- Get Township, Range, Section, and Block

In [13]:
df_header[['Completion_Date','Spud_Date']] = df_header[['Completion_Date','Spud_Date']].apply(pd.to_datetime, errors='coerce')
df_header['Vintage_Year'] = pd.DatetimeIndex(df_header['Completion_Date']).year
# Get top and bottom perf
df_header[['Top_Perf','Bottom_Perf']] = df_header['Perf_Interval'].str.split('-',n=1,expand=True)
df_header['Top_Perf'] = df_header['Top_Perf'].str.replace('[^0-9]','').replace('',np.nan)
df_header['Bottom_Perf'] = df_header['Bottom_Perf'].str.replace('[^0-9]','').replace('',np.nan)
df_header[['Operator','delete']] = df_header['Current_Operator'].str.split(" ",n=1,expand=True).replace(',','')

# Clean
df_header[['Top_Perf','Bottom_Perf']] = df_header[['Top_Perf','Bottom_Perf']].astype(float)
df_header = df_header[df_header['Top_Perf'].between(5000,13000)]

# Get perforated length
df_header['Perforated_Length'] = df_header['Bottom_Perf'] - df_header['Top_Perf']

# Get the Township/Range/Section Identifiers
df_header[['Direction','TRS']] = df_header['Location'].str.split(" ",n=1,expand=True)
df_header[['Section','Township','Range']] = df_header['TRS'].str.split("-",n=2,expand=True)
df_header['Block'] = (df_header['Township'] + df_header['Range']).astype(int)

In [14]:
cnx = sqlite3.connect(r"/Users/john.odonnell/Python/Web_Scraping/NDIC/Well_DataBase.db")
df_header.to_sql("header_table_clean",cnx, if_exists='replace')