# Export/Import script for SQL Server to MySQL
From 2018-current, CalGEM has changed the database structure, reporting style and some of the nomenclature. This script is attempting to convert the well and monthly data into a composite format that can be used with historical data

In [111]:
# import os.path

import pyodbc
#import mysql.connector
from sqlalchemy import create_engine, text
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm # as of 2/1/2022
from ipywidgets import FloatProgress
from IPython.display import display

pd.options.display.max_columns = 999

In [112]:
# Progress bar
progress = FloatProgress(min=0.0, max=100.0)
display(progress)

FloatProgress(value=0.0)

In [113]:
# Pandas functions progress bar
tqdm.pandas()


## Loading the Data
There are 2 data tables provided by the regulatory body of interest to me
The Wells table, which now has API12 (so wellbore) and all the pertinent annual codes and attributes.
Then the monthly volumes tables (actually 2, one for production and one for injection, but I've already joined them in a query)

In [114]:
load_year = 2017

# these are a different set up than the 2018-current databases, which are SQL Server BAK files.
# The 2015-2017 data came as Access files, and I linked them to a SQL Server db so easier access
# under the 'WellProductionInjectionLegacy' database. They also contain a PWT__ID index which greatly
# simplifies the linking of well records to monthly volumes.
sql_query_filename = f"C:/Stuff/doggr_2024/SQLQuery_{load_year}_Legacy_Prod_Inj.sql"
mssql_db_name = f"WellProductionInjectionLegacy"
mssql_tbl_name = f"{load_year}CaliforniaOilAndGasWells"


In [115]:
# check if the pickle data file exists
# it can take minutes to read the SQL data into a dataframe, so pickling it will save a lot time on a restart of the notebook
# note the pickle file is saved in the current working directory, not the sql directory mentioned below (obv)
pickle_file_name = f"{load_year}_prod_inj_data.pkl"
if os.path.exists(pickle_file_name) :
    print("Reading existing mv pickle...")
    df = pd.read_pickle(pickle_file_name)
else:
    # Connect to the SQL Server to read the data tables
    # Starting in v18, they default encryption, which we definately don't need for this and locally
    # ;Database=WellProductionInjection2019
    conn_str = "Driver={ODBC Driver 18 for Sql Server};Server=WhiteFractal-i7\\SQLEXPRESS;Trusted_Connection=yes;Encrypt=no;"
    conn = pyodbc.connect(conn_str)

    print("Reading SQL monthly volume data...")
    with open(sql_query_filename) as sqlfile:
        query_result = sqlfile.read()

    df = pd.read_sql_query(query_result, conn)

    # Some of the INT types in the query return here as Float b/c the Pandas default NULL is a float type.
    # So I'll convert those columns back to Int64 (which also includes a NULL option)
    df['casing_psi'] = df['casing_psi'].astype('Int64')
    df['tubing_psi'] = df['tubing_psi'].astype('Int64')
    df['gas_btu']    = df['gas_btu'].astype('Int64')
    df['well_mo']    = df['well_mo'].astype('Int64')
    df['surf_inj_press_psi'] = df['surf_inj_press_psi'].astype('Int64')
    df['water_disposition'] = df['water_disposition'].astype('Int64')
    df['water_source'] = df['water_source'].astype('Int64')
    df['water_kind'] = df['water_kind'].astype('Int64')

    # broken API number
    df['api_no'].replace('04053-223300', '040532233800', inplace=True)

    df.to_pickle(pickle_file_name)

progress.value += 5
# print out the loaded dataframe
df


Reading SQL monthly volume data...


  df = pd.read_sql_query(query_result, conn)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['api_no'].replace('04053-223300', '040532233800', inplace=True)


Unnamed: 0,api_no,api_no_int,prod_inj_date,vol_month,vol_year,well_status_no,well_status,prod_or_inj,well_type_cd,well_status_cd,casing_psi,tubing_psi,gas_btu,well_mo,oil_api_grav,water_disposition,oil_prod_vol_bbl,prod_days,gas_prod_vol_mcf,water_prod_vol_bbl,gas_inj_vol_mcf,water_inj_vol_bbl,inj_days,surf_inj_press_psi,water_source,water_kind,field_code,area_code,pool_code,rep_or_est,rep_or_est_cd,PWT__ID
0,040010000100,40010000100,2017-01-01,1,2017,00,Active,0,OG,A,10,50,1,4,0.0,5,9.0,31.0,1.0,0.0,,,,,,,404,00,00,Reported,1,100097216
1,040010000100,40010000100,2017-02-01,2,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,9.0,28.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216
2,040010000100,40010000100,2017-03-01,3,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,3.0,7.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216
3,040010000100,40010000100,2017-04-01,4,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,2.0,20.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216
4,040010000100,40010000100,2017-05-01,5,2017,00,Active,0,OG,A,10,50,2,4,0.0,5,0.0,31.0,2.0,0.0,,,,,,,404,00,00,Reported,1,100097216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1270377,042832029000,42832029000,2017-05-01,5,2017,06,Idle,0,OG,A,10,14,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895
1270378,042832029000,42832029000,2017-06-01,6,2017,06,Idle,0,OG,A,0,14,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895
1270379,042832029000,42832029000,2017-07-01,7,2017,06,Idle,0,OG,A,0,18,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895
1270380,042832029000,42832029000,2017-08-01,8,2017,06,Idle,0,OG,A,0,18,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895


In [116]:
pickle_file_name = f"{load_year}_well_records.pkl"
if os.path.exists(pickle_file_name) :
    print("Reading existing wells pickle...")
    df_wells = pd.read_pickle(pickle_file_name)
else:
    # Connect to the SQL Server to read the data tables
    # Starting in v18, they default encryption, which we definately don't need for this and locally
    conn_str = f"Driver={{ODBC Driver 18 for Sql Server}};Server=WhiteFractal-i7\\SQLEXPRESS;Database={mssql_db_name};Trusted_Connection=yes;Encrypt=no;"
    conn = pyodbc.connect(conn_str)

    print(f"Reading SQL well data... {mssql_db_name}.dbo.{mssql_tbl_name}")

    # This query composites the well tables together to remove duplicate rows by SystemEntryDate
    sql_str = f"SELECT * FROM [{mssql_db_name}].[dbo].[{mssql_tbl_name}]"

    col_rename = {
        'APINumber':'api_no',
        'FieldCode':'field_code',
        'AreaCode':'area_code',
        'PoolCode':'pool_code',
        'WellTypeCode':'well_type_cd',
        'Section':'loc_section',
        'Subsection':'loc_subsection',
        'Range':'loc_range',
        'BaseMeridian':'loc_bm',
        'Township':'loc_township',
        'CountyName':'county',
        'LeaseName':'lease_name',
        'FieldName':'field_name',
        'AreaName':'area_name',
        'PoolName':'pool_name',
        'OperatorName':'operator_name',
        'OperatorStatus':'operator_status',
        'OperatorReportingMethod':'report_type',
        'WellNumber':'well_number',
        'WellStatus':'well_status',
        'DistrictNumber':'district',
        'OperatorCode':'operator_cd' }

    df_wells = pd.read_sql_query(sql_str, conn)

    # cleanup the column names from the SQL Server db - getting them into mysql format right off the bat
    df_wells.rename(columns=col_rename, inplace=True)

    # convert the section to string and drop the <NA> strings
    df_wells['loc_section'] = df_wells['loc_section'].astype('Int64').astype(str)
    df_wells.replace({'loc_section':'<NA>'}, None, inplace=True)

    # set up the 12 digit API number
    df_wells['api_no'] = "04" + df_wells['api_no'] + "00"

    # broken API number
    df_wells.replace({'api_no':'04053-223300'}, '040532233800', inplace=True)

    # save the pickle for the next run
    df_wells.to_pickle(pickle_file_name)

progress.value += 5
# print out the loaded dataframe
df_wells

Reading SQL well data... WellProductionInjectionLegacy.dbo.2017CaliforniaOilAndGasWells


  df_wells = pd.read_sql_query(sql_str, conn)


Unnamed: 0,district,field_code,area_code,api_no,well_status,loc_section,loc_subsection,loc_township,loc_range,loc_bm,operator_cd,lease_name,well_number,field_name,area_name,operator_name,operator_status,report_type,county,pool_code,well_type_cd,PoolWellTypeStatus,SystemEntryDate,pool_name,PWT__ID
0,1,000,00,040370049400,I,4,,1N,17W,SB,K2100,Knapp,3,Any Field,Any Area,Frank Knapp,I,50,Los Angeles,00,OG,I,1987-09-01,No Pool Breakdown,100000001
1,1,000,00,040370116700,B,4,,1N,17W,SB,L2150,Lucky Star,1,Any Field,Any Area,Liu Cheng and Lin,I,50,Los Angeles,00,OG,B,1976-04-01,No Pool Breakdown,100000002
2,1,000,00,040712006500,P,29,,30S,41E,MD,K1855,Kitchens Oil Baron,1,Any Field,Any Area,Charles L. Kitchens,I,50,San Bernardino,00,OG,P,1993-04-01,No Pool Breakdown,100000003
3,1,000,00,040712006000,A,26,,3N,6W,SB,U0515,Federal,2-26,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,I,1986-07-01,No Pool Breakdown,100000004
4,1,000,00,040712006100,P,28,,31S,41E,SB,U0515,Kitchen's,101,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,P,1986-07-01,No Pool Breakdown,100000005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277292,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,OG,N,2018-04-05,Kern River,100320508
277293,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,SC,N,2018-04-05,Kern River,100320509
277294,4,464,00,040296988200,A,24,,11N,23W,SB,A0610,Metson,SWD 4-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1983-10-01,Tulare-San Joaquin,100320510
277295,4,464,00,040296696000,A,24,,11N,23W,SB,A0610,Metson,SWD 3-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1982-05-01,Tulare-San Joaquin,100320511


In [117]:
df_wells['loc_section'].value_counts()

loc_section
29    13227
2     12989
33    11887
34    10767
31    10674
3     10582
36     9830
4      9263
19     8979
1      8760
22     8632
35     8528
32     8284
26     7855
28     7558
6      7442
11     7281
30     7085
21     7044
14     6945
18     6600
12     6566
5      6523
27     6519
13     6366
15     6343
20     6234
8      6219
25     6196
16     6156
24     6140
17     5081
23     4877
7      4850
9      4850
10     4064
Name: count, dtype: int64

In [118]:
original_cols = df.columns
original_cols


Index(['api_no', 'api_no_int', 'prod_inj_date', 'vol_month', 'vol_year',
       'well_status_no', 'well_status', 'prod_or_inj', 'well_type_cd',
       'well_status_cd', 'casing_psi', 'tubing_psi', 'gas_btu', 'well_mo',
       'oil_api_grav', 'water_disposition', 'oil_prod_vol_bbl', 'prod_days',
       'gas_prod_vol_mcf', 'water_prod_vol_bbl', 'gas_inj_vol_mcf',
       'water_inj_vol_bbl', 'inj_days', 'surf_inj_press_psi', 'water_source',
       'water_kind', 'field_code', 'area_code', 'pool_code', 'rep_or_est',
       'rep_or_est_cd', 'PWT__ID'],
      dtype='object')

In [119]:
# setting a column type as category can allow us to organize/sort the categories manually
# i.e. df["Status"].cat.set_categories(["won","pending","presented","declined"],inplace=True)
# I'm adding a new category 'Inactive' later in my calculations, so would need to add it to the category set here
# df['well_status'] = df['well_status'].astype('category')

# Set the index of the df_wells to the API number - actually doing this towards the end, just before the merge call
# Turns out the unique index is API / field_code / area_code / pool_code...
#    they didn't bother suffixing the API for a different pool for many of the old wells
#df_wells.set_index(['API','field_code','pool_code','well_type_cd'], drop=True, inplace=True, verify_integrity=True)
#print(f"Index of df_wells_from_mv: {df_wells.index}")


In [120]:
# df['prod_inj_date'] = df['prod_inj_date'].astype('date') # python dataframe calls this an object, but it seems to be correctly assinging Date datatype under the hood
print(f"datatype of {type(df['prod_inj_date'][0])}")

datatype of <class 'datetime.date'>


In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1270382 entries, 0 to 1270381
Data columns (total 32 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   api_no              1270382 non-null  object 
 1   api_no_int          1270382 non-null  int64  
 2   prod_inj_date       1270382 non-null  object 
 3   vol_month           1270382 non-null  int64  
 4   vol_year            1270382 non-null  int64  
 5   well_status_no      1270382 non-null  object 
 6   well_status         1270382 non-null  object 
 7   prod_or_inj         1270382 non-null  int64  
 8   well_type_cd        1270382 non-null  object 
 9   well_status_cd      1270382 non-null  object 
 10  casing_psi          1024107 non-null  Int64  
 11  tubing_psi          1023947 non-null  Int64  
 12  gas_btu             1026596 non-null  Int64  
 13  well_mo             949034 non-null   Int64  
 14  oil_api_grav        1021856 non-null  float64
 15  water_dispositi

In [122]:
df_wells.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277297 entries, 0 to 277296
Data columns (total 25 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   district            277297 non-null  object        
 1   field_code          277297 non-null  object        
 2   area_code           277297 non-null  object        
 3   api_no              277297 non-null  object        
 4   well_status         277297 non-null  object        
 5   loc_section         277196 non-null  object        
 6   loc_subsection      193829 non-null  object        
 7   loc_township        277196 non-null  object        
 8   loc_range           277196 non-null  object        
 9   loc_bm              277195 non-null  object        
 10  operator_cd         277297 non-null  object        
 11  lease_name          228409 non-null  object        
 12  well_number         277297 non-null  object        
 13  field_name          277297 no

## Examine the categorical fields - these have discrete codes or strings

### investigate categoricals from the monthly volumes table

In [123]:
# Examine the report type category
aggfunc = {
    'api_no'                : pd.Series.nunique,
    'prod_days'             :'sum',
    'inj_days'              :'sum',
    'oil_prod_vol_bbl'      :'sum',
    'gas_prod_vol_mcf'      :'sum',
    'gas_inj_vol_mcf'       :'sum',
    'water_inj_vol_bbl'     :'sum'
}

df.pivot_table(index=['rep_or_est'], values=['api_no','prod_days','inj_days','oil_prod_vol_bbl','gas_prod_vol_mcf','gas_inj_vol_mcf','water_inj_vol_bbl'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no,gas_inj_vol_mcf,gas_prod_vol_mcf,inj_days,oil_prod_vol_bbl,prod_days,water_inj_vol_bbl
rep_or_est,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Reported,99853,1859618000.0,2020869000.0,3810208.0,173948017.0,16594158.0,2822416000.0


In [124]:
# Examine the well status category
df.pivot_table(index='well_status', values=['api_no','prod_days','inj_days','oil_prod_vol_bbl','gas_prod_vol_mcf','gas_inj_vol_mcf','water_inj_vol_bbl'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no,gas_inj_vol_mcf,gas_prod_vol_mcf,inj_days,oil_prod_vol_bbl,prod_days,water_inj_vol_bbl
well_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Active,62424,1628976000.0,2013686000.0,3761986.0,173945363.0,16588910.0,2821904000.0
Idle,41772,217284800.0,7183102.0,46220.0,2653.0,5177.0,482352.0
New,252,0.0,0.0,0.0,0.0,7.0,0.0
Observ,9660,13356620.0,0.0,2002.0,0.0,28.0,29476.0
Plugged,119,0.0,0.0,0.0,1.0,36.0,0.0


In [125]:
# Examine the well type category
df.pivot_table(index='well_type_cd', values=['api_no','prod_days','inj_days','oil_prod_vol_bbl','gas_prod_vol_mcf','gas_inj_vol_mcf','water_inj_vol_bbl'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no,gas_inj_vol_mcf,gas_prod_vol_mcf,inj_days,oil_prod_vol_bbl,prod_days,water_inj_vol_bbl
well_type_cd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AI,8,0.0,0.0,0.0,0.0,0.0,0.0
DG,2240,0.0,20190380.0,0.0,36842.0,357264.0,0.0
GD,96,3532029.0,0.0,19751.0,0.0,0.0,25112.0
GS,382,1816170000.0,1818788000.0,371724.0,195999.0,254524.0,0.0
OB,3721,0.0,0.0,0.0,557.0,608.0,0.0
OG,76888,0.0,181891400.0,0.0,173714619.0,15975475.0,0.0
PM,131,39885360.0,0.0,35347.0,0.0,0.0,3673.0
SC,12297,4040.0,0.0,208630.0,0.0,0.0,133161400.0
SF,8455,0.0,0.0,1366238.0,0.0,0.0,394157800.0
WD,1592,21111.0,0.0,252810.0,0.0,0.0,687035200.0


In [126]:
df[df['well_type_cd'] == 'INJ']

Unnamed: 0,api_no,api_no_int,prod_inj_date,vol_month,vol_year,well_status_no,well_status,prod_or_inj,well_type_cd,well_status_cd,casing_psi,tubing_psi,gas_btu,well_mo,oil_api_grav,water_disposition,oil_prod_vol_bbl,prod_days,gas_prod_vol_mcf,water_prod_vol_bbl,gas_inj_vol_mcf,water_inj_vol_bbl,inj_days,surf_inj_press_psi,water_source,water_kind,field_code,area_code,pool_code,rep_or_est,rep_or_est_cd,PWT__ID


### Well Type Conversion Code cleanup

<a id='well_type_cleanup'></a>

<a href='#well_type_codes'>Link to Well Type Codes</a>

| No. | Well Type | Definition                             |
|:---:|:---------:|:---------------------------------------|
|  0  |    AI     | Air Injector                           |
|  1  |    DG     | Dry Gas Production                     |
|  C  |    GD     | Gas Disposal Injector                  |
|  3  |    GS     | Gas Storage Injector/Producer          |
|  5  |    LG     | Liquid Petroleum Gas Injector/Producer |
|  B  |    OB     | Observation Well                       |
|  2  |    OG     | Oil & Gas Production                   |
|  4  |    PM     | Pressure Maintenance Injector          |
|  A  |    SC     | Steam Flood Cyclic(?)                  |
|  8  |    SF     | Steam Flood Injector                   |
|  6  |    WD     | Water Disposal Injector                |
|  7  |    WF     | Water Flood Injector                   |
|  9  |    WS     | Water Source Injector                  |

In [127]:
# Destination table has well type code of 2 chars, so fix the "Multi" ones
df.loc[df['well_type_cd'] == 'Multi', 'well_type_cd'] = 'ML'
df.loc[df['well_type_cd'] == 'GAS', 'well_type_cd'] = 'DG'

# For well types Unknown, try to set it by imputation later
df.loc[df['well_type_cd'] == 'UNK', 'well_type_cd'] = None

# I'm going to call all the INJ well types Cyclic Steam for now, they were corrected in later years (ex. 040192614700 has INJ in 2019, then SF in 2024)
df.loc[df['well_type_cd'] == 'INJ', 'well_type_cd'] = 'SC'
df.pivot_table(index='well_type_cd', values=['api_no','prod_days','inj_days','oil_prod_vol_bbl','gas_prod_vol_mcf','gas_inj_vol_mcf','water_inj_vol_bbl'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no,gas_inj_vol_mcf,gas_prod_vol_mcf,inj_days,oil_prod_vol_bbl,prod_days,water_inj_vol_bbl
well_type_cd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AI,8,0.0,0.0,0.0,0.0,0.0,0.0
DG,2240,0.0,20190380.0,0.0,36842.0,357264.0,0.0
GD,96,3532029.0,0.0,19751.0,0.0,0.0,25112.0
GS,382,1816170000.0,1818788000.0,371724.0,195999.0,254524.0,0.0
OB,3721,0.0,0.0,0.0,557.0,608.0,0.0
OG,76888,0.0,181891400.0,0.0,173714619.0,15975475.0,0.0
PM,131,39885360.0,0.0,35347.0,0.0,0.0,3673.0
SC,12297,4040.0,0.0,208630.0,0.0,0.0,133161400.0
SF,8455,0.0,0.0,1366238.0,0.0,0.0,394157800.0
WD,1592,21111.0,0.0,252810.0,0.0,0.0,687035200.0


In [128]:
# Examine the well method of operation category
df.pivot_table(index='well_mo', values=['api_no','prod_days','inj_days','oil_prod_vol_bbl','gas_prod_vol_mcf','gas_inj_vol_mcf','water_inj_vol_bbl'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no,gas_inj_vol_mcf,gas_prod_vol_mcf,inj_days,oil_prod_vol_bbl,prod_days,water_inj_vol_bbl
well_mo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,6668,240028647.0,12964280.0,44820.0,251945.0,33474.0,0.0
1,4778,928179093.0,1106318000.0,131868.0,12491438.0,1064591.0,0.0
2,14,0.0,0.0,0.0,3727.0,757.0,0.0
3,61994,0.0,139205600.0,0.0,134245607.0,13750743.0,0.0
4,606,0.0,399650.0,0.0,1048502.0,139103.0,0.0
5,523,0.0,1553398.0,0.0,2435226.0,121429.0,0.0
6,2273,0.0,13868980.0,0.0,15635552.0,653933.0,0.0
7,177,0.0,1192985.0,0.0,163214.0,16297.0,0.0
8,3850,0.0,2045472.0,0.0,6312819.0,611251.0,0.0


In [129]:
# Examine the water disposal category
df.pivot_table(index='water_disposition', values=['api_no','prod_days','inj_days','oil_prod_vol_bbl','gas_prod_vol_mcf','gas_inj_vol_mcf','water_inj_vol_bbl'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no,gas_inj_vol_mcf,gas_prod_vol_mcf,inj_days,oil_prod_vol_bbl,prod_days,water_inj_vol_bbl
water_disposition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,5324,628798791.0,351223425.0,95621.0,1339310.0,127594.0,0.0
1,1912,32917577.0,7327006.0,1753.0,3519289.0,359134.0,0.0
2,96,0.0,0.0,0.0,141952.0,25512.0,0.0
3,2412,0.0,541428.0,0.0,5053990.0,694611.0,0.0
4,635,5841173.0,5881179.0,7783.0,924969.0,121147.0,0.0
5,52974,334649587.0,571469661.0,45634.0,138398401.0,12832238.0,0.0
6,7941,174166040.0,352019010.0,27128.0,16799509.0,1711116.0,0.0
8,13,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
# Examine the water kind category
df.pivot_table(index='water_kind', values=['api_no','prod_days','inj_days','oil_prod_vol_bbl','gas_prod_vol_mcf','gas_inj_vol_mcf','water_inj_vol_bbl'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no,gas_inj_vol_mcf,gas_prod_vol_mcf,inj_days,oil_prod_vol_bbl,prod_days,water_inj_vol_bbl
water_kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,919,946517116.0,814127013.0,165085.0,195980.0,74711.0,2367776.0
1,12816,39669732.0,0.0,2265598.0,0.0,0.0,2100605000.0
2,2318,4040.0,0.0,299678.0,0.0,0.0,245070100.0
3,9477,0.0,0.0,627634.0,0.0,0.0,333828900.0
4,1046,430873.0,0.0,113181.0,0.0,0.0,33891200.0


In [131]:
progress.value += 1

### investigate categoricals in the wells table

In [132]:
aggfunc = {'api_no':pd.Series.nunique}
df_wells.pivot_table(index='well_type_cd', values=['api_no'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no
well_type_cd,Unnamed: 1_level_1
AI,201
DG,6025
GD,165
GS,561
LG,7
OB,5722
OG,156574
PM,353
SC,62727
SF,18441


In [133]:
# Same fix for wells as the MV data above, Destination table has well type code of 2 chars, so fix the "Multi" ones
df_wells.loc[df_wells['well_type_cd'] == 'Multi', 'well_type_cd'] = 'ML'
df_wells.loc[df_wells['well_type_cd'] == 'GAS', 'well_type_cd'] = 'DG'

# I'm going to call all the INJ well types Cyclic Steam for now, they were corrected in later years (ex. 040192614700 has INJ in 2019, then SF in 2024)
df_wells.loc[df_wells['well_type_cd'] == 'INJ', 'well_type_cd'] = 'SC'
df_wells.loc[df_wells['well_type_cd'] == 'UNK', 'well_type_cd'] = None
df_wells.pivot_table(index='well_type_cd', values=['api_no'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no
well_type_cd,Unnamed: 1_level_1
AI,201
DG,6025
GD,165
GS,561
LG,7
OB,5722
OG,156574
PM,353
SC,62727
SF,18441


In [134]:
df_wells.pivot_table(index='PoolWellTypeStatus', values=['api_no'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no
PoolWellTypeStatus,Unnamed: 1_level_1
A,85339
B,166
C,9405
I,13954
N,4443
O,2313
P,90515


In [135]:
df_wells.pivot_table(index='well_status', values=['api_no'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no
well_status,Unnamed: 1_level_1
A,84219
B,163
C,8077
I,14395
N,3232
P,78794


In [136]:
df_wells.pivot_table(index='district', values=['api_no'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no
district,Unnamed: 1_level_1
1,19400
2,10426
3,8981
4,134163
5,10573
6,5336


In [137]:
df_wells.pivot_table(index='loc_bm', values=['api_no'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no
loc_bm,Unnamed: 1_level_1
H,69
MD,145150
SB,43622


In [138]:
df_wells.pivot_table(index='operator_status', values=['api_no'], aggfunc=aggfunc,  observed=False)

Unnamed: 0_level_0,api_no
operator_status,Unnamed: 1_level_1
A,161440
I,27439


In [139]:
progress.value += 1


## Add helper columns

### First up is the calculated `IsActive` column which is going to be based on volumes
Note that Estimated volumes ar ealso valid, particularly on injectors

**TODO**: the wells can have  production volumes across multiple api_key values, and the record type 'Estimated', so it would not be correct to have multiple well completions all reporting production days - that should really stay at the well head, aka the top level API. So need a check at the end for active_days > dats in the record month and set them to the record month day count

In [140]:
# Creating our own Active flag based on Reported volumes and activity
df['IsActive'] = (df['water_disposition']>0) | (df['oil_prod_vol_bbl']>0) | (df['gas_prod_vol_mcf']>0) | (df['water_inj_vol_bbl']>0) | (df['gas_inj_vol_mcf']>0) | (df['water_inj_vol_bbl']>0) | (df['water_source']>0)
df

Unnamed: 0,api_no,api_no_int,prod_inj_date,vol_month,vol_year,well_status_no,well_status,prod_or_inj,well_type_cd,well_status_cd,casing_psi,tubing_psi,gas_btu,well_mo,oil_api_grav,water_disposition,oil_prod_vol_bbl,prod_days,gas_prod_vol_mcf,water_prod_vol_bbl,gas_inj_vol_mcf,water_inj_vol_bbl,inj_days,surf_inj_press_psi,water_source,water_kind,field_code,area_code,pool_code,rep_or_est,rep_or_est_cd,PWT__ID,IsActive
0,040010000100,40010000100,2017-01-01,1,2017,00,Active,0,OG,A,10,50,1,4,0.0,5,9.0,31.0,1.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True
1,040010000100,40010000100,2017-02-01,2,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,9.0,28.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True
2,040010000100,40010000100,2017-03-01,3,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,3.0,7.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True
3,040010000100,40010000100,2017-04-01,4,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,2.0,20.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True
4,040010000100,40010000100,2017-05-01,5,2017,00,Active,0,OG,A,10,50,2,4,0.0,5,0.0,31.0,2.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1270377,042832029000,42832029000,2017-05-01,5,2017,06,Idle,0,OG,A,10,14,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,
1270378,042832029000,42832029000,2017-06-01,6,2017,06,Idle,0,OG,A,0,14,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,
1270379,042832029000,42832029000,2017-07-01,7,2017,06,Idle,0,OG,A,0,18,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,
1270380,042832029000,42832029000,2017-08-01,8,2017,06,Idle,0,OG,A,0,18,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,


### Then add the actual `active_days` field, which is a corrected Production + Injection prod_days field

In [141]:
df['active_days'] = df['prod_days'].where(df['IsActive'] == True, 0).fillna(0) + df['inj_days'].where(df['IsActive'] == True, 0).fillna(0)
df

Unnamed: 0,api_no,api_no_int,prod_inj_date,vol_month,vol_year,well_status_no,well_status,prod_or_inj,well_type_cd,well_status_cd,casing_psi,tubing_psi,gas_btu,well_mo,oil_api_grav,water_disposition,oil_prod_vol_bbl,prod_days,gas_prod_vol_mcf,water_prod_vol_bbl,gas_inj_vol_mcf,water_inj_vol_bbl,inj_days,surf_inj_press_psi,water_source,water_kind,field_code,area_code,pool_code,rep_or_est,rep_or_est_cd,PWT__ID,IsActive,active_days
0,040010000100,40010000100,2017-01-01,1,2017,00,Active,0,OG,A,10,50,1,4,0.0,5,9.0,31.0,1.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True,31.0
1,040010000100,40010000100,2017-02-01,2,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,9.0,28.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True,28.0
2,040010000100,40010000100,2017-03-01,3,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,3.0,7.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True,7.0
3,040010000100,40010000100,2017-04-01,4,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,2.0,20.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True,20.0
4,040010000100,40010000100,2017-05-01,5,2017,00,Active,0,OG,A,10,50,2,4,0.0,5,0.0,31.0,2.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1270377,042832029000,42832029000,2017-05-01,5,2017,06,Idle,0,OG,A,10,14,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,,0.0
1270378,042832029000,42832029000,2017-06-01,6,2017,06,Idle,0,OG,A,0,14,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,,0.0
1270379,042832029000,42832029000,2017-07-01,7,2017,06,Idle,0,OG,A,0,18,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,,0.0
1270380,042832029000,42832029000,2017-08-01,8,2017,06,Idle,0,OG,A,0,18,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,,0.0


### Create new status field that will be used going forward since the provided one is frequently inaccurate

In [142]:
# There are some wells that have an incorrect Idle status set while they are still injecting so overwrite that with the correct status of 'Active'
df['well_status2'] = df['well_status']
df.loc[df['IsActive'], 'well_status2'] = 'Active'
df.loc[df['IsActive']==False, 'well_status2'] = 'Inactive'
df

Unnamed: 0,api_no,api_no_int,prod_inj_date,vol_month,vol_year,well_status_no,well_status,prod_or_inj,well_type_cd,well_status_cd,casing_psi,tubing_psi,gas_btu,well_mo,oil_api_grav,water_disposition,oil_prod_vol_bbl,prod_days,gas_prod_vol_mcf,water_prod_vol_bbl,gas_inj_vol_mcf,water_inj_vol_bbl,inj_days,surf_inj_press_psi,water_source,water_kind,field_code,area_code,pool_code,rep_or_est,rep_or_est_cd,PWT__ID,IsActive,active_days,well_status2
0,040010000100,40010000100,2017-01-01,1,2017,00,Active,0,OG,A,10,50,1,4,0.0,5,9.0,31.0,1.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True,31.0,Active
1,040010000100,40010000100,2017-02-01,2,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,9.0,28.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True,28.0,Active
2,040010000100,40010000100,2017-03-01,3,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,3.0,7.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True,7.0,Active
3,040010000100,40010000100,2017-04-01,4,2017,00,Active,0,OG,A,10,50,0,4,0.0,5,2.0,20.0,0.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True,20.0,Active
4,040010000100,40010000100,2017-05-01,5,2017,00,Active,0,OG,A,10,50,2,4,0.0,5,0.0,31.0,2.0,0.0,,,,,,,404,00,00,Reported,1,100097216,True,31.0,Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1270377,042832029000,42832029000,2017-05-01,5,2017,06,Idle,0,OG,A,10,14,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,,0.0,Idle
1270378,042832029000,42832029000,2017-06-01,6,2017,06,Idle,0,OG,A,0,14,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,,0.0,Idle
1270379,042832029000,42832029000,2017-07-01,7,2017,06,Idle,0,OG,A,0,18,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,,0.0,Idle
1270380,042832029000,42832029000,2017-08-01,8,2017,06,Idle,0,OG,A,0,18,0,6,0.0,,0.0,0.0,0.0,0.0,,,,,,,236,06,03,Reported,1,100017895,,0.0,Idle


In [143]:
progress.value += 1

# Wells table: Resolve some of the NULL values
## Use imputation if there are multiple entries for an api_no, otherwise set it to '00'

In [144]:
well_row_count = len(df_wells)
print(f"well_type_cd NULL row count {sum(df_wells['well_type_cd'].isna()):,} ({sum(df_wells['well_type_cd'].isna())/well_row_count:.0%})")

fn_well_type = lambda x: x.mode()[0] if not x.mode().empty else np.nan # This is a lambda function to be used with the transform method on the pandas dataframe

# First imputation, we can skip 'well_status' as a groupby since they appear to have the same NULL occurances
if sum(df_wells['well_type_cd'].isna()) > 0:
    df_wells['well_type_cd'] = df_wells['well_type_cd'].fillna(df_wells.groupby(['api_no','field_code','area_code'])['well_type_cd'].transform(fn_well_type))
    print(f"well_type_cd NULL row count {sum(df_wells['well_type_cd'].isna()):,} ({sum(df_wells['well_type_cd'].isna())/well_row_count:.1%})")
progress.value += 1

well_type_cd NULL row count 0 (0%)


In [145]:
if sum(df_wells['well_type_cd'].isna()) > 0:
    df_wells['well_type_cd'] = df_wells['well_type_cd'].fillna(df_wells.groupby(['api_no','field_code'])['well_type_cd'].transform(fn_well_type))
    print(f"well_type_cd NULL row count {sum(df_wells['well_type_cd'].isna()):,} ({sum(df_wells['well_type_cd'].isna())/well_row_count:.1%})")
progress.value += 1

In [146]:
if sum(df_wells['well_type_cd'].isna()) > 0:
    df_wells['well_type_cd'] = df_wells['well_type_cd'].fillna(df_wells.groupby(['api_no'])['well_type_cd'].transform(fn_well_type))
    print(f"well_type_cd NULL row count {sum(df_wells['well_type_cd'].isna()):,} ({sum(df_wells['well_type_cd'].isna())/well_row_count:.1%})")
progress.value += 1

In [147]:
# Fill any remaining NULLs with '00'
if sum(df_wells['well_type_cd'].isna()) > 0:
    df_wells['well_type_cd'] = df_wells['well_type_cd'].fillna('00')
    print(f"well_type_cd NULL row count {sum(df_wells['well_type_cd'].isna()):,} ({sum(df_wells['well_type_cd'].isna())/well_row_count:.1%})")
progress.value += 1

In [148]:
# Finally, make sure everything is a 2 letter code, or it will fail later b/c destination table has CHAR(2)
assert(len(df_wells[df_wells['well_type_cd'].str.len() > 2]) == 0)

# Monthly Values:  Resolve some of the NULL values by imputation

In [149]:
# There are some records that go null in the middle of the year, not sure why - going to fill NAs with the most common value for the well table
mv_row_count = len(df['api_no'])
print(f"monthly volume total row count {mv_row_count:,}")
print(f"well_type_cd NULL row count {sum(df['well_type_cd'].isna()):,} ({sum(df['well_type_cd'].isna())/mv_row_count:.0%})")
print(f"well_status NULL count {sum(df['well_status'].isna()):,} ({sum(df['well_status'].isna())/mv_row_count:.0%})")
print(f"water_disposition NULL count {sum(df['water_disposition'].isna()):,} ({sum(df['water_disposition'].isna())/mv_row_count:.0%})")
print(f"MO NULL count {sum(df['well_mo'].isna()):,} ({sum(df['well_mo'].isna())/mv_row_count:.0%})")
print(f"water_kind NULL count {sum(df['water_kind'].isna()):,} ({sum(df['water_kind'].isna())/mv_row_count:.0%})")
print(f"water_source NULL count {sum(df['water_source'].isna()):,} ({sum(df['water_source'].isna())/mv_row_count:.0%})")

monthly volume total row count 1,270,382
well_type_cd NULL row count 0 (0%)
well_status NULL count 0 (0%)
water_disposition NULL count 445,886 (35%)
MO NULL count 321,348 (25%)
water_kind NULL count 1,045,545 (82%)
water_source NULL count 1,027,639 (81%)


### Monthly Values: Impute with well_type_cd : group by api_no/field_code/area_code/pool_code
While looking through the data (when trying to see if I could do a set_index on the DataFrame),
I found that this grouping mainly identified common well data, though not 100% it reduces a lot of the monthly data noise

In [150]:
fn_well_type = lambda x: x.mode()[0] if not x.mode().empty else np.nan # This is a lambda function to be used with the transform method on the pandas dataframe

# First imputation, we can skip 'well_status' as a groupby since they appear to have the same NULL occurances
if df['well_type_cd'].isna().sum() > 0:
    df['well_type_cd'] = df['well_type_cd'].fillna(df.groupby(['api_no','field_code','area_code','pool_code'])['well_type_cd'].transform(fn_well_type))
    print(f"well_type_cd NULL row count {sum(df['well_type_cd'].isna()):,} ({sum(df['well_type_cd'].isna())/mv_row_count:.0%})")

progress.value += 1

In [151]:
# This is the best it gets with this grouping by api_no and attributes, going all the way down to api_no only gives the same result
if df['well_type_cd'].isna().sum() > 0:
    df['well_type_cd'] = df['well_type_cd'].fillna(df.groupby(['api_no','field_code','area_code'])['well_type_cd'].transform(fn_well_type))
    print(f"well_type_cd NULL row count {sum(df['well_type_cd'].isna()):,} ({sum(df['well_type_cd'].isna())/mv_row_count:.0%})")

progress.value += 1

In [152]:
# Finally, make sure everything is a 2 letter code, or it will fail later b/c destination table has CHAR(2)
assert(len(df[df['well_type_cd'].str.len() > 2]) == 0)

### Monthly Values: Impute with well_status : group by api_no/field_code/area_code/pool_code

In [153]:
print(f"Current well status NULL counts: {df['well_status'].isna().sum()}")
print(f"Current well status 2 NULL counts: {df['well_status2'].isna().sum()}")
df['well_status'].value_counts()


Current well status NULL counts: 0
Current well status 2 NULL counts: 0


well_status
Active     744723
Idle       418765
Observ     104572
New          2021
Plugged       301
Name: count, dtype: int64

In [154]:
fn_well_type = lambda x: x.mode()[0] if not x.mode().empty else np.nan # This is a lambda function to be used with the transform method on the pandas dataframe

# I don't want to impute well_status from other records like I did for well_type, since this is really just an indicator of whether the well is active or not
# First imputation
if df['well_status'].isna().sum() > 0:
    df['well_status'] = df['well_status'].fillna(np.where(df['IsActive'] == True, 'Active', None))
    print(f"well_status NULL row count {sum(df['well_status'].isna()):,} ({sum(df['well_status'].isna())/mv_row_count:.0%})")

progress.value += 1

### Monthly Values: Impute with water_disposition : group by api_no/field_code/area_code/pool_code

In [155]:
fn_well_type = lambda x: x.mode()[0] if not x.mode().empty else np.nan # This is a lambda function to be used with the transform method on the pandas dataframe

# First imputation
df['water_disposition'] = df['water_disposition'].fillna(df.groupby(['api_no','field_code','area_code','pool_code'])['water_disposition'].transform(fn_well_type))
print(f"water_disposition NULL row count {sum(df['water_disposition'].isna()):,} ({sum(df['water_disposition'].isna())/mv_row_count:.0%})")

progress.value += 1

water_disposition NULL row count 380,848 (30%)


In [156]:
# this is as good as it gets
df['water_disposition'] = df['water_disposition'].fillna(df.groupby(['api_no','field_code','area_code'])['water_disposition'].transform(fn_well_type))
print(f"water_disposition NULL row count {sum(df['water_disposition'].isna()):,} ({sum(df['water_disposition'].isna())/mv_row_count:.0%})")
progress.value += 1

water_disposition NULL row count 380,085 (30%)


### Monthly Values: Impute with MO : group by api_no/field_code/area_code/pool_code

In [157]:
fn_well_type = lambda x: x.mode()[0] if not x.mode().empty else np.nan # This is a lambda function to be used with the transform method on the pandas dataframe

# imputation
df['well_mo'] = df['well_mo'].fillna(df.groupby(['PWT__ID'])['well_mo'].transform(fn_well_type))
print(f"MO NULL row count {sum(df['PWT__ID'].isna()):,} ({sum(df['well_mo'].isna())/mv_row_count:.0%})")
progress.value += 1

MO NULL row count 0 (24%)


### Monthly Values: Impute with WATKIND : group by api_no/field_code/area_code/pool_code

In [158]:
fn_well_type = lambda x: x.mode()[0] if not x.mode().empty else np.nan # This is a lambda function to be used with the transform method on the pandas dataframe

# imputation
df['water_kind'] = df['water_kind'].fillna(df.groupby(['PWT__ID'])['water_kind'].transform(fn_well_type))
print(f"water_kind NULL row count {sum(df['water_kind'].isna()):,} ({sum(df['water_kind'].isna())/mv_row_count:.0%})")
progress.value += 1

water_kind NULL row count 1,040,844 (82%)


### Monthly Values: Impute with water_source : group by api_no/field_code/area_code/pool_code

In [159]:
fn_well_type = lambda x: x.mode()[0] if not x.mode().empty else np.nan # This is a lambda function to be used with the transform method on the pandas dataframe

# imputation
df['water_source'] = df['water_source'].fillna(df.groupby(['PWT__ID'])['water_source'].transform(fn_well_type))
print(f"water_source NULL row count {sum(df['water_source'].isna()):,} ({sum(df['water_source'].isna())/mv_row_count:.0%})")
progress.value += 1

water_source NULL row count 1,025,884 (81%)


In [160]:
# This idea fell by the wayside since CalGEM actually just aggregates all the api_no data together for their charts etc.
# I thought I could add an api_key helper column to the wells table, to use this as a DataFrame index later but there's too much noise
# Note that the monthly vol table already has this created from the SQL
# the api_key is api_no+field_code+area_code+pool_code
#df_wells['api_key'] = df_wells['api_no'] + "-" + df_wells['field_code'] +"-" + df_wells['area_code'] + "-" + df_wells['pool_code'] + "-" + df_wells['well_type_cd']
#df_wells


In [161]:
# after we've filled in the blanks at the well level, some categorical fields with NaN can be set to zero, which means "Not Applicable"
# I need all the categoricals to not have NULLs before I do the CHANGED column calculations below, NaN throws off the CHANGED calc
df['well_status']       = df['well_status'].fillna("00")
df['well_type_cd']      = df['well_type_cd'].fillna("00") # this needs to be a string to match the rest of the
df['water_disposition'] = df['water_disposition'].fillna(0)
df['well_mo']           = df['well_mo'].fillna(0)
df['water_source']      = df['water_source'].fillna(0)
df['water_kind']        = df['water_kind'].fillna(0)

# This field will always either be Reported or Estimated. If it's empty, then Estimated.
df['rep_or_est'] = df['rep_or_est'].fillna('Estimated')

# All the categoricals should be non-null at this point, which is import for checking for changes in the next section
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1270382 entries, 0 to 1270381
Data columns (total 35 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   api_no              1270382 non-null  object 
 1   api_no_int          1270382 non-null  int64  
 2   prod_inj_date       1270382 non-null  object 
 3   vol_month           1270382 non-null  int64  
 4   vol_year            1270382 non-null  int64  
 5   well_status_no      1270382 non-null  object 
 6   well_status         1270382 non-null  object 
 7   prod_or_inj         1270382 non-null  int64  
 8   well_type_cd        1270382 non-null  object 
 9   well_status_cd      1270382 non-null  object 
 10  casing_psi          1024107 non-null  Int64  
 11  tubing_psi          1023947 non-null  Int64  
 12  gas_btu             1026596 non-null  Int64  
 13  well_mo             1270382 non-null  Int64  
 14  oil_api_grav        1021856 non-null  float64
 15  water_dispositi

In [162]:
# Add some helper columns, these will calculate when the monthly attributes change during the year
# note - first the data needs to be sorted by well ID and then date, and then we're effectively doing a Partition style calculation using python
df.sort_values(by=['PWT__ID','prod_inj_date'], inplace=True)

# a_itr should be a groupby iterator for a specific col to pivot and a col to iterate over
# a_col_name should be an existing column in the original pre-groupby DataFrame to populate, similar to what a Transform method would do
# a_df is the dataframe to make the changes into
# also everything should be sorted in the order it's going to be grouped and checked for changes; in this case, api_no and prod_inj_date
# NOTE: this methodology is obsolete
#   this takes about 20 minutes to run all the changed categories with a full year's data - using vectorized transforms takes about 4 min
def flag_changes( a_itr, a_col_name, a_df):
    idx = 0
    for i in a_itr:
        last_x = None
        s = f"{i[0]}: "
        n = 0
        for x in i[1]:
            s = f"{s} {x}"

            if n == 0:
                # first row of the group is always False
                a_df.loc[idx, a_col_name] = False
            else:
                # now do the change test
                a_df.loc[idx, a_col_name] = (x != last_x)

            # save this one for the next loop
            last_x = x
            n = n + 1
            idx = idx + 1

        if idx < 10: print(s)


df['well_status_CHANGED']   = None
df['well_type_cd_CHANGED']  = None
df['rep_or_est_CHANGED']    = None

# it's worth noting that if any of these are numeric data types, I can do the following to more quickly process:
# test_df['measure_change'] = test_df.groupby('item')['measure'].diff().fillna(0) != 0

#df['report_type_CHANGED']    = None
#it = df.groupby('api_no')['report_type']
#FlagChanges( it, 'report_type_CHANGED', df)
#df['report_type_CHANGED'] = it.transform(lambda x: (x != x.shift()) & (x.shift().notna()))

#df['pool_code_CHANGED']      = None
#it = df.groupby('api_no')['pool_code']
#FlagChanges( it, 'pool_code_CHANGED', df)
#df['pool_code_CHANGED'] = it.transform(lambda x: (x != x.shift()) & (x.shift().notna()))

it = df.groupby('PWT__ID')['well_status2'] # use the imputed/updated well_status2
#FlagChanges( it, 'well_status_CHANGED', df)
df['well_status_CHANGED'] = it.transform(lambda x: (x != x.shift()) & (x.shift().notna()))
progress.value += 2

it = df.groupby('PWT__ID')['well_type_cd']
#FlagChanges( it, 'well_type_cd_CHANGED', df)
df['well_type_cd_CHANGED'] = it.transform(lambda x: (x != x.shift()) & (x.shift().notna()))
progress.value += 2

#df['water_disposition_CHANGED']       = None
#it = df.groupby('api_no')['water_disposition']
#FlagChanges( it, 'water_disposition_CHANGED', df)
#df['water_disposition_CHANGED'] = it.transform(lambda x: (x != x.shift()) & (x.shift().notna()))

#df['well_mo_CHANGED']            = None
#it = df.groupby('api_no')['well_mo']
#FlagChanges( it, 'well_mo_CHANGED', df)
#df['well_mo_CHANGED'] = it.transform(lambda x: (x != x.shift()) & (x.shift().notna()))

#df['water_source_CHANGED']        = None
#it = df.groupby('api_no')['water_source']
#FlagChanges( it, 'water_source_CHANGED', df)
#df['water_source_CHANGED'] = it.transform(lambda x: (x != x.shift()) & (x.shift().notna()))

#df['water_kind_CHANGED']       = None
#it = df.groupby('api_no')['water_kind']
#FlagChanges( it, 'water_kind_CHANGED', df)
#df['water_kind_CHANGED'] = it.transform(lambda x: (x != x.shift()) & (x.shift().notna()))

it = df.groupby('PWT__ID')['rep_or_est']
#FlagChanges( it, 'rep_or_est_CHANGED', df)
df['rep_or_est_CHANGED'] = it.transform(lambda x: (x != x.shift()) & (x.shift().notna()))
progress.value += 2

df

Unnamed: 0,api_no,api_no_int,prod_inj_date,vol_month,vol_year,well_status_no,well_status,prod_or_inj,well_type_cd,well_status_cd,casing_psi,tubing_psi,gas_btu,well_mo,oil_api_grav,water_disposition,oil_prod_vol_bbl,prod_days,gas_prod_vol_mcf,water_prod_vol_bbl,gas_inj_vol_mcf,water_inj_vol_bbl,inj_days,surf_inj_press_psi,water_source,water_kind,field_code,area_code,pool_code,rep_or_est,rep_or_est_cd,PWT__ID,IsActive,active_days,well_status2,well_status_CHANGED,well_type_cd_CHANGED,rep_or_est_CHANGED
1011366,040370080500,40370080500,2017-01-01,1,2017,00,Active,0,OG,A,22,20,0,3,39.0,4,157.0,31.0,0.0,170.0,,,,,0,0,034,00,05,Reported,1,100000007,True,31.0,Active,False,False,False
1011367,040370080500,40370080500,2017-02-01,2,2017,00,Active,0,OG,A,22,20,,3,39.0,4,142.0,28.0,,153.0,,,,,0,0,034,00,05,Reported,1,100000007,True,28.0,Active,False,False,False
1011368,040370080500,40370080500,2017-03-01,3,2017,00,Active,0,OG,A,20,18,,3,39.0,4,159.0,31.0,,170.0,,,,,0,0,034,00,05,Reported,1,100000007,True,31.0,Active,False,False,False
1011369,040370080500,40370080500,2017-05-01,5,2017,00,Active,0,OG,A,20,18,,3,39.0,4,159.0,31.0,,170.0,,,,,0,0,034,00,05,Reported,1,100000007,True,31.0,Active,False,False,False
1011370,040370080500,40370080500,2017-06-01,6,2017,00,Active,0,OG,A,20,18,,3,39.0,4,155.0,30.0,,164.0,,,,,0,0,034,00,05,Reported,1,100000007,True,30.0,Active,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319027,040296696000,40296696000,2017-08-01,8,2017,06,Idle,1,WD,A,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False
319028,040296696000,40296696000,2017-09-01,9,2017,06,Idle,1,WD,A,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False
319029,040296696000,40296696000,2017-10-01,10,2017,06,Idle,1,WD,A,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False
319030,040296696000,40296696000,2017-11-01,11,2017,06,Idle,1,WD,A,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False


In [163]:
df['well_type_cd_CHANGED'].value_counts()


well_type_cd_CHANGED
False    1270382
Name: count, dtype: int64

In [164]:
# remember this is well_status2
df['well_status_CHANGED'].value_counts()


well_status_CHANGED
False    1249788
True       20594
Name: count, dtype: int64

In [165]:
df['rep_or_est_CHANGED'].value_counts()

rep_or_est_CHANGED
False    1270382
Name: count, dtype: int64

### Original DOGRR data was either 0 for prod or 1 for inj record
Current data doesn't have this field, but the records are either from the Production table or the Injection table

| Code | Desc    | Explanation         |
|------|---------|---------------------|
| 0    | PROD    | Production record   |
| 1    | INJ     | Injection record    |
| 2    | Unknown | Unknown record type |

In [166]:
## Report Type Code conversion
# 0 = Production record
# 1 = Injection record
# 2 = unknown (maybe shut-in)
df['prod_inj_cd'] = "2"
df.loc[((df['water_disposition']>0) | (df['gas_inj_vol_mcf']>0) | (df['water_inj_vol_bbl']>0)) & (df['rep_or_est'] == 'Reported'), 'prod_inj_cd'] = "1"
df.loc[((df['oil_prod_vol_bbl']>0) | (df['gas_prod_vol_mcf']>0) | (df['water_inj_vol_bbl']>0) | (df['water_source']>0)) & (df['rep_or_est'] == 'Reported'), 'prod_inj_cd'] = "0"
df

Unnamed: 0,api_no,api_no_int,prod_inj_date,vol_month,vol_year,well_status_no,well_status,prod_or_inj,well_type_cd,well_status_cd,casing_psi,tubing_psi,gas_btu,well_mo,oil_api_grav,water_disposition,oil_prod_vol_bbl,prod_days,gas_prod_vol_mcf,water_prod_vol_bbl,gas_inj_vol_mcf,water_inj_vol_bbl,inj_days,surf_inj_press_psi,water_source,water_kind,field_code,area_code,pool_code,rep_or_est,rep_or_est_cd,PWT__ID,IsActive,active_days,well_status2,well_status_CHANGED,well_type_cd_CHANGED,rep_or_est_CHANGED,prod_inj_cd
1011366,040370080500,40370080500,2017-01-01,1,2017,00,Active,0,OG,A,22,20,0,3,39.0,4,157.0,31.0,0.0,170.0,,,,,0,0,034,00,05,Reported,1,100000007,True,31.0,Active,False,False,False,0
1011367,040370080500,40370080500,2017-02-01,2,2017,00,Active,0,OG,A,22,20,,3,39.0,4,142.0,28.0,,153.0,,,,,0,0,034,00,05,Reported,1,100000007,True,28.0,Active,False,False,False,0
1011368,040370080500,40370080500,2017-03-01,3,2017,00,Active,0,OG,A,20,18,,3,39.0,4,159.0,31.0,,170.0,,,,,0,0,034,00,05,Reported,1,100000007,True,31.0,Active,False,False,False,0
1011369,040370080500,40370080500,2017-05-01,5,2017,00,Active,0,OG,A,20,18,,3,39.0,4,159.0,31.0,,170.0,,,,,0,0,034,00,05,Reported,1,100000007,True,31.0,Active,False,False,False,0
1011370,040370080500,40370080500,2017-06-01,6,2017,00,Active,0,OG,A,20,18,,3,39.0,4,155.0,30.0,,164.0,,,,,0,0,034,00,05,Reported,1,100000007,True,30.0,Active,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319027,040296696000,40296696000,2017-08-01,8,2017,06,Idle,1,WD,A,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False,0
319028,040296696000,40296696000,2017-09-01,9,2017,06,Idle,1,WD,A,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False,0
319029,040296696000,40296696000,2017-10-01,10,2017,06,Idle,1,WD,A,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False,0
319030,040296696000,40296696000,2017-11-01,11,2017,06,Idle,1,WD,A,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False,0


### WELL STATUS CODE conversion
in the new format, they use a whole word like "IDLE" where the older format used a single char like "I"

This is saved in `doggr.wells.well_status`

There is another field `doggr.wells.operator_status_cd` with codes `A`, `I`, `O` of which `O` only has 2 wells. Will have to come back to this but, that could be Active and Idle/Inactive also. I'm not sure why there are 2 fields in the original table.

| Well Status | Definition | Explanation                                                                                |
|-------------|------------|--------------------------------------------------------------------------------------------|
| N           | New        | Recently permitted, the well has not been drilled or completed.                            |
| B           | Buried     | Older Well, not abandoned to today’s standards, location of well may be approximate.       |
| U           | Unknown    | Status not yet entered from hard copy file. Wells are mostly older, pre-1976.              |
| A           | Active     | Well has been drilled and completed                                                        |
| C           | Cancelled  | Well permit was cancelled prior to drilling                                                |
| P           | Plugged    | Well gas been plugged and abandoned                                                        |
| I           | Idle       | Idle Well. An idle well has not produced or injected for 6 consecutive months for 2 years. |


In [167]:
# unknown status types found in the data:
# Abeyance
# PluggedOnly
# also they misspelled Cancelled...

# Create an overall Well Status single character code for the 'wells' table, which is one of the codes above
# there may be a faster way to do this, but this is easy to read
df_wells['well_status_cd'] = 'U' # start with default of Unknown
df_wells.loc[df_wells['well_status'] == 'Plugged',  'well_status_cd'] = 'P'
df_wells.loc[df_wells['well_status'] == 'Idle',     'well_status_cd'] = 'I'
df_wells.loc[df_wells['well_status'] == 'Buried',   'well_status_cd'] = 'B'
df_wells.loc[df_wells['well_status'] == 'Active',   'well_status_cd'] = 'A'
df_wells.loc[df_wells['well_status'] == 'Canceled', 'well_status_cd'] = 'C'
df_wells.loc[df_wells['well_status'] == 'New',      'well_status_cd'] = 'N'
df_wells

Unnamed: 0,district,field_code,area_code,api_no,well_status,loc_section,loc_subsection,loc_township,loc_range,loc_bm,operator_cd,lease_name,well_number,field_name,area_name,operator_name,operator_status,report_type,county,pool_code,well_type_cd,PoolWellTypeStatus,SystemEntryDate,pool_name,PWT__ID,well_status_cd
0,1,000,00,040370049400,I,4,,1N,17W,SB,K2100,Knapp,3,Any Field,Any Area,Frank Knapp,I,50,Los Angeles,00,OG,I,1987-09-01,No Pool Breakdown,100000001,U
1,1,000,00,040370116700,B,4,,1N,17W,SB,L2150,Lucky Star,1,Any Field,Any Area,Liu Cheng and Lin,I,50,Los Angeles,00,OG,B,1976-04-01,No Pool Breakdown,100000002,U
2,1,000,00,040712006500,P,29,,30S,41E,MD,K1855,Kitchens Oil Baron,1,Any Field,Any Area,Charles L. Kitchens,I,50,San Bernardino,00,OG,P,1993-04-01,No Pool Breakdown,100000003,U
3,1,000,00,040712006000,A,26,,3N,6W,SB,U0515,Federal,2-26,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,I,1986-07-01,No Pool Breakdown,100000004,U
4,1,000,00,040712006100,P,28,,31S,41E,SB,U0515,Kitchen's,101,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,P,1986-07-01,No Pool Breakdown,100000005,U
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277292,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,OG,N,2018-04-05,Kern River,100320508,U
277293,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,SC,N,2018-04-05,Kern River,100320509,U
277294,4,464,00,040296988200,A,24,,11N,23W,SB,A0610,Metson,SWD 4-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1983-10-01,Tulare-San Joaquin,100320510,U
277295,4,464,00,040296696000,A,24,,11N,23W,SB,A0610,Metson,SWD 3-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1982-05-01,Tulare-San Joaquin,100320511,U


In [168]:
# Create an overall Well Status single character code for the 'monthly volumes' table, which is one of the codes above
# there may be a faster way to do this, but this is easy to read
df['well_status_cd'] = 'U' # start with default of Unknown
df.loc[df['well_status'] == 'Plugged',  'well_status_cd'] = 'P'
df.loc[df['well_status'] == 'Idle',     'well_status_cd'] = 'I'
df.loc[df['well_status'] == 'Buried',   'well_status_cd'] = 'B'
df.loc[df['well_status'] == 'Active',   'well_status_cd'] = 'A'
df.loc[df['well_status'] == 'Canceled', 'well_status_cd'] = 'C'
df.loc[df['well_status'] == 'New',      'well_status_cd'] = 'N'
df

Unnamed: 0,api_no,api_no_int,prod_inj_date,vol_month,vol_year,well_status_no,well_status,prod_or_inj,well_type_cd,well_status_cd,casing_psi,tubing_psi,gas_btu,well_mo,oil_api_grav,water_disposition,oil_prod_vol_bbl,prod_days,gas_prod_vol_mcf,water_prod_vol_bbl,gas_inj_vol_mcf,water_inj_vol_bbl,inj_days,surf_inj_press_psi,water_source,water_kind,field_code,area_code,pool_code,rep_or_est,rep_or_est_cd,PWT__ID,IsActive,active_days,well_status2,well_status_CHANGED,well_type_cd_CHANGED,rep_or_est_CHANGED,prod_inj_cd
1011366,040370080500,40370080500,2017-01-01,1,2017,00,Active,0,OG,A,22,20,0,3,39.0,4,157.0,31.0,0.0,170.0,,,,,0,0,034,00,05,Reported,1,100000007,True,31.0,Active,False,False,False,0
1011367,040370080500,40370080500,2017-02-01,2,2017,00,Active,0,OG,A,22,20,,3,39.0,4,142.0,28.0,,153.0,,,,,0,0,034,00,05,Reported,1,100000007,True,28.0,Active,False,False,False,0
1011368,040370080500,40370080500,2017-03-01,3,2017,00,Active,0,OG,A,20,18,,3,39.0,4,159.0,31.0,,170.0,,,,,0,0,034,00,05,Reported,1,100000007,True,31.0,Active,False,False,False,0
1011369,040370080500,40370080500,2017-05-01,5,2017,00,Active,0,OG,A,20,18,,3,39.0,4,159.0,31.0,,170.0,,,,,0,0,034,00,05,Reported,1,100000007,True,31.0,Active,False,False,False,0
1011370,040370080500,40370080500,2017-06-01,6,2017,00,Active,0,OG,A,20,18,,3,39.0,4,155.0,30.0,,164.0,,,,,0,0,034,00,05,Reported,1,100000007,True,30.0,Active,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319027,040296696000,40296696000,2017-08-01,8,2017,06,Idle,1,WD,I,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False,0
319028,040296696000,40296696000,2017-09-01,9,2017,06,Idle,1,WD,I,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False,0
319029,040296696000,40296696000,2017-10-01,10,2017,06,Idle,1,WD,I,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False,0
319030,040296696000,40296696000,2017-11-01,11,2017,06,Idle,1,WD,I,,,,0,,0,,,,,0.0,0.0,0.0,0,7,1,464,00,23,Reported,1,100320511,True,0.0,Active,False,False,False,0


### WELL TYPE Conversion Codes

| No. | Well Type | Definition                             |
|:---:|:---------:|:---------------------------------------|
|  0  |    AI     | Air Injector                           |
|  1  |    DG     | Dry Gas Production                     |
|  C  |    GD     | Gas Disposal Injector                  |
|  3  |    GS     | Gas Storage Injector/Producer          |
|  5  |    LG     | Liquid Petroleum Gas Injector/Producer |
|  B  |    OB     | Observation Well                       |
|  2  |    OG     | Oil & Gas Production                   |
|  4  |    PM     | Pressure Maintenance Injector          |
|  A  |    SC     | Steam Flood Cyclic(?)                  |
|  8  |    SF     | Steam Flood Injector                   |
|  6  |    WD     | Water Disposal Injector                |
|  7  |    WF     | Water Flood Injector                   |
|  9  |    WS     | Water Source Injector                  |

In [169]:

# Unknown codes
#  DH - could be Dry Hole?
#  Multi - ???
#  UNK - literally...
#  GAS - really?
#  INJ - really?

# Type Code conversions
df_wells['well_type_no'] = 'X'  # start with default of unknown
df_wells.loc[df_wells['well_type_cd'] == 'AI', 'well_type_no'] = '0'
df_wells.loc[df_wells['well_type_cd'] == 'DG', 'well_type_no'] = '1'
df_wells.loc[df_wells['well_type_cd'] == 'GD', 'well_type_no'] = 'C'
df_wells.loc[df_wells['well_type_cd'] == 'GS', 'well_type_no'] = '3'
df_wells.loc[df_wells['well_type_cd'] == 'LG', 'well_type_no'] = '5'
df_wells.loc[df_wells['well_type_cd'] == 'OB', 'well_type_no'] = 'B'
df_wells.loc[df_wells['well_type_cd'] == 'OG', 'well_type_no'] = '2'
df_wells.loc[df_wells['well_type_cd'] == 'PM', 'well_type_no'] = '4'
df_wells.loc[df_wells['well_type_cd'] == 'SC', 'well_type_no'] = 'A'
df_wells.loc[df_wells['well_type_cd'] == 'SF', 'well_type_no'] = '8'
df_wells.loc[df_wells['well_type_cd'] == 'WD', 'well_type_no'] = '6'
df_wells.loc[df_wells['well_type_cd'] == 'WF', 'well_type_no'] = '7'
df_wells.loc[df_wells['well_type_cd'] == 'WS', 'well_type_no'] = '9'
df_wells

Unnamed: 0,district,field_code,area_code,api_no,well_status,loc_section,loc_subsection,loc_township,loc_range,loc_bm,operator_cd,lease_name,well_number,field_name,area_name,operator_name,operator_status,report_type,county,pool_code,well_type_cd,PoolWellTypeStatus,SystemEntryDate,pool_name,PWT__ID,well_status_cd,well_type_no
0,1,000,00,040370049400,I,4,,1N,17W,SB,K2100,Knapp,3,Any Field,Any Area,Frank Knapp,I,50,Los Angeles,00,OG,I,1987-09-01,No Pool Breakdown,100000001,U,2
1,1,000,00,040370116700,B,4,,1N,17W,SB,L2150,Lucky Star,1,Any Field,Any Area,Liu Cheng and Lin,I,50,Los Angeles,00,OG,B,1976-04-01,No Pool Breakdown,100000002,U,2
2,1,000,00,040712006500,P,29,,30S,41E,MD,K1855,Kitchens Oil Baron,1,Any Field,Any Area,Charles L. Kitchens,I,50,San Bernardino,00,OG,P,1993-04-01,No Pool Breakdown,100000003,U,2
3,1,000,00,040712006000,A,26,,3N,6W,SB,U0515,Federal,2-26,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,I,1986-07-01,No Pool Breakdown,100000004,U,2
4,1,000,00,040712006100,P,28,,31S,41E,SB,U0515,Kitchen's,101,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,P,1986-07-01,No Pool Breakdown,100000005,U,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277292,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,OG,N,2018-04-05,Kern River,100320508,U,2
277293,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,SC,N,2018-04-05,Kern River,100320509,U,A
277294,4,464,00,040296988200,A,24,,11N,23W,SB,A0610,Metson,SWD 4-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1983-10-01,Tulare-San Joaquin,100320510,U,6
277295,4,464,00,040296696000,A,24,,11N,23W,SB,A0610,Metson,SWD 3-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1982-05-01,Tulare-San Joaquin,100320511,U,6


In [170]:
# post-2018 operator status codes, translating them to single char for wells table
# Active
# Bankruptcy/Receivership
# Inactive
# Unknown

df_wells['operator_status_cd'] = None
df_wells.loc[df_wells['operator_status'] == 'Active', 'operator_status_cd'] = 'A'
df_wells.loc[df_wells['operator_status'] == 'Bankruptcy/Receivership', 'operator_status_cd'] = 'B'
df_wells.loc[df_wells['operator_status'] == 'Inactive', 'operator_status_cd'] = 'I'
df_wells.loc[df_wells['operator_status'] == 'Unknown', 'operator_status_cd'] = 'U'
df_wells

Unnamed: 0,district,field_code,area_code,api_no,well_status,loc_section,loc_subsection,loc_township,loc_range,loc_bm,operator_cd,lease_name,well_number,field_name,area_name,operator_name,operator_status,report_type,county,pool_code,well_type_cd,PoolWellTypeStatus,SystemEntryDate,pool_name,PWT__ID,well_status_cd,well_type_no,operator_status_cd
0,1,000,00,040370049400,I,4,,1N,17W,SB,K2100,Knapp,3,Any Field,Any Area,Frank Knapp,I,50,Los Angeles,00,OG,I,1987-09-01,No Pool Breakdown,100000001,U,2,
1,1,000,00,040370116700,B,4,,1N,17W,SB,L2150,Lucky Star,1,Any Field,Any Area,Liu Cheng and Lin,I,50,Los Angeles,00,OG,B,1976-04-01,No Pool Breakdown,100000002,U,2,
2,1,000,00,040712006500,P,29,,30S,41E,MD,K1855,Kitchens Oil Baron,1,Any Field,Any Area,Charles L. Kitchens,I,50,San Bernardino,00,OG,P,1993-04-01,No Pool Breakdown,100000003,U,2,
3,1,000,00,040712006000,A,26,,3N,6W,SB,U0515,Federal,2-26,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,I,1986-07-01,No Pool Breakdown,100000004,U,2,
4,1,000,00,040712006100,P,28,,31S,41E,SB,U0515,Kitchen's,101,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,P,1986-07-01,No Pool Breakdown,100000005,U,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277292,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,OG,N,2018-04-05,Kern River,100320508,U,2,
277293,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,SC,N,2018-04-05,Kern River,100320509,U,A,
277294,4,464,00,040296988200,A,24,,11N,23W,SB,A0610,Metson,SWD 4-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1983-10-01,Tulare-San Joaquin,100320510,U,6,
277295,4,464,00,040296696000,A,24,,11N,23W,SB,A0610,Metson,SWD 3-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1982-05-01,Tulare-San Joaquin,100320511,U,6,


In [171]:
# district codes - CalGEM redid the districts into 4, where DOGGR used to have 6
df_wells['dist_no'] = 0
df_wells.loc[df_wells['district'] == 'Coastal', 'dist_no'] = 3
df_wells.loc[df_wells['district'] == 'Inland', 'dist_no'] = 4
df_wells.loc[df_wells['district'] == 'Northern', 'dist_no'] = 6
df_wells.loc[df_wells['district'] == 'Southern', 'dist_no'] = 1
df_wells


Unnamed: 0,district,field_code,area_code,api_no,well_status,loc_section,loc_subsection,loc_township,loc_range,loc_bm,operator_cd,lease_name,well_number,field_name,area_name,operator_name,operator_status,report_type,county,pool_code,well_type_cd,PoolWellTypeStatus,SystemEntryDate,pool_name,PWT__ID,well_status_cd,well_type_no,operator_status_cd,dist_no
0,1,000,00,040370049400,I,4,,1N,17W,SB,K2100,Knapp,3,Any Field,Any Area,Frank Knapp,I,50,Los Angeles,00,OG,I,1987-09-01,No Pool Breakdown,100000001,U,2,,0
1,1,000,00,040370116700,B,4,,1N,17W,SB,L2150,Lucky Star,1,Any Field,Any Area,Liu Cheng and Lin,I,50,Los Angeles,00,OG,B,1976-04-01,No Pool Breakdown,100000002,U,2,,0
2,1,000,00,040712006500,P,29,,30S,41E,MD,K1855,Kitchens Oil Baron,1,Any Field,Any Area,Charles L. Kitchens,I,50,San Bernardino,00,OG,P,1993-04-01,No Pool Breakdown,100000003,U,2,,0
3,1,000,00,040712006000,A,26,,3N,6W,SB,U0515,Federal,2-26,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,I,1986-07-01,No Pool Breakdown,100000004,U,2,,0
4,1,000,00,040712006100,P,28,,31S,41E,SB,U0515,Kitchen's,101,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,P,1986-07-01,No Pool Breakdown,100000005,U,2,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277292,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,OG,N,2018-04-05,Kern River,100320508,U,2,,0
277293,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,SC,N,2018-04-05,Kern River,100320509,U,A,,0
277294,4,464,00,040296988200,A,24,,11N,23W,SB,A0610,Metson,SWD 4-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1983-10-01,Tulare-San Joaquin,100320510,U,6,,0
277295,4,464,00,040296696000,A,24,,11N,23W,SB,A0610,Metson,SWD 3-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1982-05-01,Tulare-San Joaquin,100320511,U,6,,0


In [172]:
progress.value += 2

In [173]:
df_wells.columns

Index(['district', 'field_code', 'area_code', 'api_no', 'well_status',
       'loc_section', 'loc_subsection', 'loc_township', 'loc_range', 'loc_bm',
       'operator_cd', 'lease_name', 'well_number', 'field_name', 'area_name',
       'operator_name', 'operator_status', 'report_type', 'county',
       'pool_code', 'well_type_cd', 'PoolWellTypeStatus', 'SystemEntryDate',
       'pool_name', 'PWT__ID', 'well_status_cd', 'well_type_no',
       'operator_status_cd', 'dist_no'],
      dtype='object')

In [174]:
df.columns

Index(['api_no', 'api_no_int', 'prod_inj_date', 'vol_month', 'vol_year',
       'well_status_no', 'well_status', 'prod_or_inj', 'well_type_cd',
       'well_status_cd', 'casing_psi', 'tubing_psi', 'gas_btu', 'well_mo',
       'oil_api_grav', 'water_disposition', 'oil_prod_vol_bbl', 'prod_days',
       'gas_prod_vol_mcf', 'water_prod_vol_bbl', 'gas_inj_vol_mcf',
       'water_inj_vol_bbl', 'inj_days', 'surf_inj_press_psi', 'water_source',
       'water_kind', 'field_code', 'area_code', 'pool_code', 'rep_or_est',
       'rep_or_est_cd', 'PWT__ID', 'IsActive', 'active_days', 'well_status2',
       'well_status_CHANGED', 'well_type_cd_CHANGED', 'rep_or_est_CHANGED',
       'prod_inj_cd'],
      dtype='object')

In [175]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1270382 entries, 1011366 to 319031
Data columns (total 39 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   api_no                1270382 non-null  object 
 1   api_no_int            1270382 non-null  int64  
 2   prod_inj_date         1270382 non-null  object 
 3   vol_month             1270382 non-null  int64  
 4   vol_year              1270382 non-null  int64  
 5   well_status_no        1270382 non-null  object 
 6   well_status           1270382 non-null  object 
 7   prod_or_inj           1270382 non-null  int64  
 8   well_type_cd          1270382 non-null  object 
 9   well_status_cd        1270382 non-null  object 
 10  casing_psi            1024107 non-null  Int64  
 11  tubing_psi            1023947 non-null  Int64  
 12  gas_btu               1026596 non-null  Int64  
 13  well_mo               1270382 non-null  Int64  
 14  oil_api_grav          1021856 non-

In [176]:
print(f"NULL count for df_wells['well_type_cd']: {df_wells['well_type_cd'].isna().sum():,}")

NULL count for df_wells['well_type_cd']: 0


In [177]:
# At this point, there shouldn't be any NULLs in these features
print(f"NULL PWT__ID: {df_wells['PWT__ID'].isna().sum():,}")
print(f"NULL api_no: {df_wells['api_no'].isna().sum():,}")
print(f"NULL field_code: {df_wells['field_code'].isna().sum():,}")
print(f"NULL area_code: {df_wells['area_code'].isna().sum():,}")
print(f"NULL pool_code: {df_wells['pool_code'].isna().sum():,}")
print(f"NULL well_type_cd: {df_wells['well_type_cd'].isna().sum():,}")
print(f"NULL well_status_cd: {df_wells['well_status_cd'].isna().sum():,}")

NULL PWT__ID: 0
NULL api_no: 0
NULL field_code: 0
NULL area_code: 0
NULL pool_code: 0
NULL well_type_cd: 0
NULL well_status_cd: 0


In [178]:
# At this point, there shouldn't be any NULLs in these features
print(f"NULL PWT__ID: {df['PWT__ID'].isna().sum():,}")
print(f"NULL api_no: {df['api_no'].isna().sum():,}")
print(f"NULL field_code: {df['field_code'].isna().sum():,}")
print(f"NULL area_code: {df['area_code'].isna().sum():,}")
print(f"NULL pool_code: {df['pool_code'].isna().sum():,}")
print(f"NULL well_type_cd: {df['well_type_cd'].isna().sum():,}")
print(f"NULL well_status_cd: {df['well_status'].isna().sum():,}")

NULL PWT__ID: 0
NULL api_no: 0
NULL field_code: 0
NULL area_code: 0
NULL pool_code: 0
NULL well_type_cd: 0
NULL well_status_cd: 0


## Imputation & NULL checks finished, now create the indexes

In [179]:
# Figure out what combination of attributes identifies a well uniquely between the 2 tables
# I'm not convinced this is necessary, I may be trying to preserve too much information

# NOTE to future self - there are well table entries that have duplicate 'SystemEntryDate' at different times. so the partition query in the original load removes those
# Otherwise, for 2019 data at least, this gives a unique set of well rows for inserting, and combining with the monthly data
df_wells['APIKey1'] = df_wells['api_no']  + "-" + df_wells['field_code']  + "-" + df_wells['area_code']  + "-" + df_wells['pool_code']  + "-" + df_wells['well_type_cd']
#df_wells['APIKey2'] = df_wells['APIKey1'] + "-" + df_wells['well_status_cd']

# Same for the monthly values, although these won't be unique row indices because of the dates
# This will be used to pivot and then join some data together later, as well as receive the UNIQUE ID/PRIMARY KEY from MySQL on insert of the well
df['APIKey1'] = df['api_no']  + "-" + df['field_code']  + "-" + df['area_code']  + "-" + df['pool_code']  + "-" + df['well_type_cd']

progress.value += 2

In [180]:
# Confirm we don't have any NULL keys/indexes
print(f"Count of NULL df['PWT__ID'] values: {df['PWT__ID'].isna().sum():,}")
assert(df['PWT__ID'].isna().sum() == 0)

Count of NULL df['PWT__ID'] values: 0


In [181]:
# Show the rows where APIKey1 != APIKey2, I can probably consolidate these somehow
assert(df_wells['PWT__ID'].isna().sum() == 0)
df_wells.set_index('PWT__ID', verify_integrity=True, inplace=True)

In [182]:
# Set this index, this confirms that we have a unique set of wells
df_wells

Unnamed: 0_level_0,district,field_code,area_code,api_no,well_status,loc_section,loc_subsection,loc_township,loc_range,loc_bm,operator_cd,lease_name,well_number,field_name,area_name,operator_name,operator_status,report_type,county,pool_code,well_type_cd,PoolWellTypeStatus,SystemEntryDate,pool_name,well_status_cd,well_type_no,operator_status_cd,dist_no,APIKey1
PWT__ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
100000001,1,000,00,040370049400,I,4,,1N,17W,SB,K2100,Knapp,3,Any Field,Any Area,Frank Knapp,I,50,Los Angeles,00,OG,I,1987-09-01,No Pool Breakdown,U,2,,0,040370049400-000-00-00-OG
100000002,1,000,00,040370116700,B,4,,1N,17W,SB,L2150,Lucky Star,1,Any Field,Any Area,Liu Cheng and Lin,I,50,Los Angeles,00,OG,B,1976-04-01,No Pool Breakdown,U,2,,0,040370116700-000-00-00-OG
100000003,1,000,00,040712006500,P,29,,30S,41E,MD,K1855,Kitchens Oil Baron,1,Any Field,Any Area,Charles L. Kitchens,I,50,San Bernardino,00,OG,P,1993-04-01,No Pool Breakdown,U,2,,0,040712006500-000-00-00-OG
100000004,1,000,00,040712006000,A,26,,3N,6W,SB,U0515,Federal,2-26,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,I,1986-07-01,No Pool Breakdown,U,2,,0,040712006000-000-00-00-OG
100000005,1,000,00,040712006100,P,28,,31S,41E,SB,U0515,Kitchen's,101,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,P,1986-07-01,No Pool Breakdown,U,2,,0,040712006100-000-00-00-OG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100320508,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,OG,N,2018-04-05,Kern River,U,2,,0,040306375500-340-00-05-OG
100320509,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,SC,N,2018-04-05,Kern River,U,A,,0,040306375500-340-00-05-SC
100320510,4,464,00,040296988200,A,24,,11N,23W,SB,A0610,Metson,SWD 4-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1983-10-01,Tulare-San Joaquin,U,6,,0,040296988200-464-00-23-WD
100320511,4,464,00,040296696000,A,24,,11N,23W,SB,A0610,Metson,SWD 3-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1982-05-01,Tulare-San Joaquin,U,6,,0,040296696000-464-00-23-WD


In [183]:
df[df['PWT__ID'] == 100000006]

Unnamed: 0,api_no,api_no_int,prod_inj_date,vol_month,vol_year,well_status_no,well_status,prod_or_inj,well_type_cd,well_status_cd,casing_psi,tubing_psi,gas_btu,well_mo,oil_api_grav,water_disposition,oil_prod_vol_bbl,prod_days,gas_prod_vol_mcf,water_prod_vol_bbl,gas_inj_vol_mcf,water_inj_vol_bbl,inj_days,surf_inj_press_psi,water_source,water_kind,field_code,area_code,pool_code,rep_or_est,rep_or_est_cd,PWT__ID,IsActive,active_days,well_status2,well_status_CHANGED,well_type_cd_CHANGED,rep_or_est_CHANGED,prod_inj_cd,APIKey1


In [184]:
df_wells.loc[100000006]

district                                      1
field_code                                  012
area_code                                    00
api_no                             040370080000
well_status                                   P
loc_section                                  22
loc_subsection                                 
loc_township                                 3S
loc_range                                   14W
loc_bm                                       SB
operator_cd                               G2810
lease_name                          Park Bodger
well_number                                   1
field_name                        Alondra (ABD)
area_name                              Any Area
operator_name             Grayson Service, Inc.
operator_status                               A
report_type                                  50
county                              Los Angeles
pool_code                                    00
well_type_cd                            

In [185]:
df['PWT__ID'].nunique()

113468

In [186]:
df_wells.index.nunique()

277297

In [187]:
progress.value += 1

# Do 2 Joins, one left to df_wells with pivotted df, and one left to df from df_wells.
The `df_wells_from_mv` will be the used one, and combines monthly total data from `df` and then is added to `df_wells` for inserting into the db. The other join `df_from_wells` is a test join to see if any monthly records don't have an equivalent `df_wells` entry, so those would be orphaned monthly records.

In [188]:
# Original code for 2018-current datasets uses a homebrew api_key which I went to a lot of trouble to make unique.
# since we have PWT__ID already as a unique ID for 2015-2017, this is a lot simpler
#df_from_wells = pd.merge(df, df_wells, how='left', left_on='PWT__ID', right_index=True)
progress.value += 1
#df_from_wells

In [189]:
#start_null_key_count = df_from_wells['PWT__ID'].isnull().sum()

In [190]:
#print(f"Missing rows of monthly data from wells join: {start_null_key_count:,} ({start_null_key_count/len(df_from_wells):.1%})")

## Solved some of the missing data in the SQL query, but the rest needs to be imputed
Going to set `PWT__ID` to NULL for rows missing joined `df_wells` data and then impute the correct `PWT__ID`
Note: Since this behavior was set up originally for the APIKey, it shouldn't apply for `PWT__ID` since there are no NULLs there

In [191]:
#missing_pwtid_list = list(df_from_wells[df_from_wells['PWT__ID'].isnull()]['PWT__ID'].unique())
progress.value += 1
#print(f"Missing unique keys: {len(missing_pwtid_list):,}")

In [192]:
# set those missing APIKey1 to NULL in df so we can easily impute the correct APIKey1:
#if len(missing_pwtid_list) > 0 :
#    df['PWT__ID'] = df['PWT__ID'].replace(missing_pwtid_list, pd.NA)
#    impute_null_key_count = df['PWT__ID'].isnull().sum()
#    print(f"NULL Count of PWT__ID, should be the same number of rows above: {impute_null_key_count:,}")
#    assert(impute_null_key_count == start_null_key_count)
progress.value += 3

In [193]:
# The simplest way to do an Excel style XLOOKUP is with a pd.merge() join call
#df = pd.merge(df, df_wells.reset_index()[['api_no','field_code','area_code','pool_code','APIKey1']],
#         how='left',
#         left_on=['api_no','field_code','area_code','pool_code'],
#         right_on=['api_no','field_code','area_code','pool_code'])
#
#df


### `df` is now ready for imputing `PWT__ID` to achieve full match to df_wells
Note that here I'm imputing the missing `PWT__ID` values in `df` directly from `df_wells`

In [194]:
#fn_well_type = lambda x: x.mode()[0] if not x.mode().empty else np.nan # This is a lambda function to be used with the transform method on the pandas dataframe

# First imputation the usuall way, this will fill in api_key values where there was a change in the monthly data
#df['PWT__ID2'] = df['PWT__ID'].fillna(df.groupby(['api_no','field_code','area_code','pool_code'])['PWT__ID'].transform(fn_well_type))
#print(f"PWT__ID2 NULL row count {sum(df['PWT__ID2'].isna()):,}")
progress.value += 5

In [195]:
# Then second imputation looking up api_key from df_wells, using where() method
# The simplest way to do an Excel style XLOOKUP is with a pd.merge() join call
#df = pd.merge(df, df_wells.reset_index()[['api_no','field_code','area_code','pool_code','APIKey1']],
#         how='left',
#         left_on=['api_no','field_code','area_code','pool_code'],
#         right_on=['api_no','field_code','area_code','pool_code'])
#
#assert(df['APIKey1_y'].isna().sum() == 0)
progress.value += 1

In [196]:
# now we can cleanup the APIKey1
#df['APIKey1'] = np.where(df['APIKey2'].isnull(), df['APIKey1_y'], df['APIKey2'])

In [197]:
#df.columns

In [198]:
#assert(df['APIKey1'].isna().sum()==0)

# if we've cleaned up all the NULL keys, then drop the helper columns
#df.drop(columns=['APIKey1_x', 'APIKey2', 'APIKey1_y'], inplace=True)
progress.value += 1

In [199]:
# Create the final loadable "wells" dataset - a wells set with 1 row per well and the monthly dataset, which is basically "df" without the helper columns
# recreate the df_wells dataframe for this purpose
# in the situation of categoricals, I'm going to take the last one ordered by date (so typically whatever was in Dec of the year)

# Create the aggregate data from the monthly volumes table for adding to the wells table
# The api_key created and aligned earlier will now be the index for joining
df_wells_from_mv = (df.groupby(['PWT__ID'], dropna=False, observed=True).agg(
    oil_prod_vol_bbl=('oil_prod_vol_bbl','sum'),
    gas_prod_vol_mcf=('gas_prod_vol_mcf','sum'),
    gas_inj_vol_mcf=('gas_inj_vol_mcf','sum'),
    water_inj_vol_bbl=('water_inj_vol_bbl','sum'),
    active_days=('active_days','sum'),
    last_well_mo=('well_mo','last'),
    prod_inj_cd=('prod_inj_cd', 'last'),
    well_status_cng=('well_status_CHANGED','max'),  # max here will return True if any value is True, otherwise False if all False
    well_type_cng=('well_type_cd_CHANGED','max'))      # max here will return True if any value is True, otherwise False if all False
)

# Since there is one file/database per year, we can just take the vol_year and set it for all the records
df_wells_from_mv['YEAR_INT'] = df['vol_year'].max()
progress.value += 1

df_wells_from_mv


Unnamed: 0_level_0,oil_prod_vol_bbl,gas_prod_vol_mcf,gas_inj_vol_mcf,water_inj_vol_bbl,active_days,last_well_mo,prod_inj_cd,well_status_cng,well_type_cng,YEAR_INT
PWT__ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100000007,1718.0,0.0,0.0,0.0,335.0,3,0,False,False,2017
100000008,159.0,0.0,0.0,0.0,111.0,4,0,False,False,2017
100000009,1877.0,0.0,0.0,0.0,334.0,3,0,False,False,2017
100000010,201.0,0.0,0.0,0.0,120.0,4,1,True,False,2017
100000011,1564.0,0.0,0.0,0.0,335.0,3,0,False,False,2017
...,...,...,...,...,...,...,...,...,...,...
100320442,0.0,0.0,0.0,0.0,0.0,0,0,False,False,2017
100320446,0.0,0.0,0.0,0.0,0.0,0,0,False,False,2017
100320447,0.0,0.0,0.0,0.0,0.0,0,0,False,False,2017
100320510,0.0,0.0,0.0,570700.0,364.0,0,0,False,False,2017


In [200]:
print(f"Index of df_wells_from_mv: {df_wells_from_mv.index.names}")
list(df_wells_from_mv.columns)

Index of df_wells_from_mv: ['PWT__ID']


['oil_prod_vol_bbl',
 'gas_prod_vol_mcf',
 'gas_inj_vol_mcf',
 'water_inj_vol_bbl',
 'active_days',
 'last_well_mo',
 'prod_inj_cd',
 'well_status_cng',
 'well_type_cng',
 'YEAR_INT']

In [201]:
# Confirm that the pivot table now has unique rows per APIKey1 (which is now the index)
print(f"Number of non-unique APIs : {len(df_wells_from_mv) - df_wells_from_mv.index.nunique():,}")
assert(len(df_wells_from_mv) == df_wells_from_mv.index.nunique())

Number of non-unique APIs : 0


In [202]:
# now join the summary stats from the mv table for insert into the wells table
df_wells_sql = pd.merge(df_wells, df_wells_from_mv, how='left', left_on='PWT__ID', right_index=True)
df_wells_sql

Unnamed: 0_level_0,district,field_code,area_code,api_no,well_status,loc_section,loc_subsection,loc_township,loc_range,loc_bm,operator_cd,lease_name,well_number,field_name,area_name,operator_name,operator_status,report_type,county,pool_code,well_type_cd,PoolWellTypeStatus,SystemEntryDate,pool_name,well_status_cd,well_type_no,operator_status_cd,dist_no,APIKey1,oil_prod_vol_bbl,gas_prod_vol_mcf,gas_inj_vol_mcf,water_inj_vol_bbl,active_days,last_well_mo,prod_inj_cd,well_status_cng,well_type_cng,YEAR_INT
PWT__ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
100000001,1,000,00,040370049400,I,4,,1N,17W,SB,K2100,Knapp,3,Any Field,Any Area,Frank Knapp,I,50,Los Angeles,00,OG,I,1987-09-01,No Pool Breakdown,U,2,,0,040370049400-000-00-00-OG,,,,,,,,,,
100000002,1,000,00,040370116700,B,4,,1N,17W,SB,L2150,Lucky Star,1,Any Field,Any Area,Liu Cheng and Lin,I,50,Los Angeles,00,OG,B,1976-04-01,No Pool Breakdown,U,2,,0,040370116700-000-00-00-OG,,,,,,,,,,
100000003,1,000,00,040712006500,P,29,,30S,41E,MD,K1855,Kitchens Oil Baron,1,Any Field,Any Area,Charles L. Kitchens,I,50,San Bernardino,00,OG,P,1993-04-01,No Pool Breakdown,U,2,,0,040712006500-000-00-00-OG,,,,,,,,,,
100000004,1,000,00,040712006000,A,26,,3N,6W,SB,U0515,Federal,2-26,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,I,1986-07-01,No Pool Breakdown,U,2,,0,040712006000-000-00-00-OG,,,,,,,,,,
100000005,1,000,00,040712006100,P,28,,31S,41E,SB,U0515,Kitchen's,101,Any Field,Any Area,U. S. Geological Survey,I,50,San Bernardino,00,OG,P,1986-07-01,No Pool Breakdown,U,2,,0,040712006100-000-00-00-OG,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100320508,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,OG,N,2018-04-05,Kern River,U,2,,0,040306375500-340-00-05-OG,,,,,,,,,,
100320509,4,340,00,040306375500,N,5,,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,A,99,Kern,05,SC,N,2018-04-05,Kern River,U,A,,0,040306375500-340-00-05-SC,,,,,,,,,,
100320510,4,464,00,040296988200,A,24,,11N,23W,SB,A0610,Metson,SWD 4-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1983-10-01,Tulare-San Joaquin,U,6,,0,040296988200-464-00-23-WD,0.0,0.0,0.0,570700.0,364.0,0,0,False,False,2017.0
100320511,4,464,00,040296696000,A,24,,11N,23W,SB,A0610,Metson,SWD 3-24,Midway-Sunset,Any Area,Aera Energy LLC,A,99,Kern,23,WD,A,1982-05-01,Tulare-San Joaquin,U,6,,0,040296696000-464-00-23-WD,0.0,0.0,0.0,0.0,0.0,0,0,False,False,2017.0


In [203]:
# The non-nonunique should now be zero
print(f"Number of non-unique APIs : {len(df_wells_sql) - df_wells_sql.index.nunique():,}")
assert(len(df_wells_sql) == df_wells_sql.index.nunique())

Number of non-unique APIs : 0


In [204]:
# Purging unnecessary columns from the wells table
# district: We don't need a string version of
# SubSection : is NaN for all the original data anyway
# operator_status : Don't need full text, will rename operator_status_cd to 'operator_status' next
# well_status : We don't need a string version of
# PoolWellTypeStatus : We don't need pool status... where it's populated, it's basically meaningless
# SystemEntryDate : probably when the operator uploaded to CalGEM, don't care
# row_num : is an artefact from the SQL query
df_wells_sql.reset_index(inplace=True)
df_wells_sql.drop(columns=['district','operator_status','loc_subsection','well_status', 'PoolWellTypeStatus','SystemEntryDate'], inplace=True)
progress.value += 1

In [205]:
list(df_wells_sql.columns)

['PWT__ID',
 'field_code',
 'area_code',
 'api_no',
 'loc_section',
 'loc_township',
 'loc_range',
 'loc_bm',
 'operator_cd',
 'lease_name',
 'well_number',
 'field_name',
 'area_name',
 'operator_name',
 'report_type',
 'county',
 'pool_code',
 'well_type_cd',
 'pool_name',
 'well_status_cd',
 'well_type_no',
 'operator_status_cd',
 'dist_no',
 'APIKey1',
 'oil_prod_vol_bbl',
 'gas_prod_vol_mcf',
 'gas_inj_vol_mcf',
 'water_inj_vol_bbl',
 'active_days',
 'last_well_mo',
 'prod_inj_cd',
 'well_status_cng',
 'well_type_cng',
 'YEAR_INT']

In [206]:
# set up the monthly volumes tables as well. lots of helper columns we don't need to INSERT
df_sql = df.drop(columns=['prod_or_inj','prod_inj_cd','area_code','field_code','pool_code','IsActive','well_status_CHANGED','well_type_cd_CHANGED','rep_or_est_CHANGED','well_status2'])

df_sql = df_sql.rename(columns={
    'APIKey1':'api_key'
})

progress.value += 1
list(df_sql.columns)

['api_no',
 'api_no_int',
 'prod_inj_date',
 'vol_month',
 'vol_year',
 'well_status_no',
 'well_status',
 'well_type_cd',
 'well_status_cd',
 'casing_psi',
 'tubing_psi',
 'gas_btu',
 'well_mo',
 'oil_api_grav',
 'water_disposition',
 'oil_prod_vol_bbl',
 'prod_days',
 'gas_prod_vol_mcf',
 'water_prod_vol_bbl',
 'gas_inj_vol_mcf',
 'water_inj_vol_bbl',
 'inj_days',
 'surf_inj_press_psi',
 'water_source',
 'water_kind',
 'rep_or_est',
 'rep_or_est_cd',
 'PWT__ID',
 'active_days',
 'api_key']

In [207]:
#df_sql.drop(columns=['well_id_x','well_id_y'], inplace=True)

# Finished building `df_wells` table, SQL time
Now I'll do a few checks to make sure there aren't duplicate rows, and then it can inserted into the `wells` table (or `wells_test` if it's a test run)

In [208]:
# Check the df_wells dataframe for unique row count on the API number
print(f"Number of unique APIs from wells table: {len(df_wells_sql):,}")
assert(len(df_wells_sql) == df_wells.index.nunique())

Number of unique APIs from wells table: 277,297


In [209]:
# rename some of the CalGEM fields to the original well table field names
# these mainly come from the SELECT * in the beginning, updated some names for consistency
# first we'll put the api_key index back to a column so it gets picked up in the SQL insert later
df_wells_sql.rename(columns={
    'APIKey1':'api_key',
    'Section':'loc_section',
#    'loc_subsection':'loc_subsec',
    'Range':'loc_range',
    'field_code':'field_cd',
    'area_code':'area_cd',
    'pool_code':'pool_cd'
}, inplace=True)

progress.value += 1
list(df_wells_sql.columns)


['PWT__ID',
 'field_cd',
 'area_cd',
 'api_no',
 'loc_section',
 'loc_township',
 'loc_range',
 'loc_bm',
 'operator_cd',
 'lease_name',
 'well_number',
 'field_name',
 'area_name',
 'operator_name',
 'report_type',
 'county',
 'pool_cd',
 'well_type_cd',
 'pool_name',
 'well_status_cd',
 'well_type_no',
 'operator_status_cd',
 'dist_no',
 'api_key',
 'oil_prod_vol_bbl',
 'gas_prod_vol_mcf',
 'gas_inj_vol_mcf',
 'water_inj_vol_bbl',
 'active_days',
 'last_well_mo',
 'prod_inj_cd',
 'well_status_cng',
 'well_type_cng',
 'YEAR_INT']

In [210]:
print(f"Number of unique APIs from monthly volumes: {df_wells_from_mv.index.nunique():,}")

Number of unique APIs from monthly volumes: 113,468


In [211]:
# api_no is now the 1st index of the multi-index created above
# checking number of rows equals number of unique wells, so we don't have any double entries
assert(len(df_wells_from_mv) == df_wells_from_mv.index.nunique())

In [212]:
df_wells_sql

Unnamed: 0,PWT__ID,field_cd,area_cd,api_no,loc_section,loc_township,loc_range,loc_bm,operator_cd,lease_name,well_number,field_name,area_name,operator_name,report_type,county,pool_cd,well_type_cd,pool_name,well_status_cd,well_type_no,operator_status_cd,dist_no,api_key,oil_prod_vol_bbl,gas_prod_vol_mcf,gas_inj_vol_mcf,water_inj_vol_bbl,active_days,last_well_mo,prod_inj_cd,well_status_cng,well_type_cng,YEAR_INT
0,100000001,000,00,040370049400,4,1N,17W,SB,K2100,Knapp,3,Any Field,Any Area,Frank Knapp,50,Los Angeles,00,OG,No Pool Breakdown,U,2,,0,040370049400-000-00-00-OG,,,,,,,,,,
1,100000002,000,00,040370116700,4,1N,17W,SB,L2150,Lucky Star,1,Any Field,Any Area,Liu Cheng and Lin,50,Los Angeles,00,OG,No Pool Breakdown,U,2,,0,040370116700-000-00-00-OG,,,,,,,,,,
2,100000003,000,00,040712006500,29,30S,41E,MD,K1855,Kitchens Oil Baron,1,Any Field,Any Area,Charles L. Kitchens,50,San Bernardino,00,OG,No Pool Breakdown,U,2,,0,040712006500-000-00-00-OG,,,,,,,,,,
3,100000004,000,00,040712006000,26,3N,6W,SB,U0515,Federal,2-26,Any Field,Any Area,U. S. Geological Survey,50,San Bernardino,00,OG,No Pool Breakdown,U,2,,0,040712006000-000-00-00-OG,,,,,,,,,,
4,100000005,000,00,040712006100,28,31S,41E,SB,U0515,Kitchen's,101,Any Field,Any Area,U. S. Geological Survey,50,San Bernardino,00,OG,No Pool Breakdown,U,2,,0,040712006100-000-00-00-OG,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277292,100320508,340,00,040306375500,5,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,99,Kern,05,OG,Kern River,U,2,,0,040306375500-340-00-05-OG,,,,,,,,,,
277293,100320509,340,00,040306375500,5,29S,28E,MD,C5640,San Joaquin,216R,Kern River,Any Area,Chevron U.S.A. Inc.,99,Kern,05,SC,Kern River,U,A,,0,040306375500-340-00-05-SC,,,,,,,,,,
277294,100320510,464,00,040296988200,24,11N,23W,SB,A0610,Metson,SWD 4-24,Midway-Sunset,Any Area,Aera Energy LLC,99,Kern,23,WD,Tulare-San Joaquin,U,6,,0,040296988200-464-00-23-WD,0.0,0.0,0.0,570700.0,364.0,0,0,False,False,2017.0
277295,100320511,464,00,040296696000,24,11N,23W,SB,A0610,Metson,SWD 3-24,Midway-Sunset,Any Area,Aera Energy LLC,99,Kern,23,WD,Tulare-San Joaquin,U,6,,0,040296696000-464-00-23-WD,0.0,0.0,0.0,0.0,0.0,0,0,False,False,2017.0


### Add some extra information to the query - this is data that's the same for all wells
in the original data set `REPORT_MO` was used for the Reporting method (50 series = hard copy, 90 series = computerized) - so this will all be 99 now to represent the WellStar data submissions

In [213]:
# add the extra helper columns the destination is expecting
# Add the year of the dataset to all the columns
# This is also used later to pull all the new WELL_ID records
# the item() call converts the pandas or numpy int64 type to basic python int which is needed for MySQL later
#   (MySQL connector doesn't understand pandas or numpy datatypes)
query_year = df['vol_year'].max().item()
df_wells_sql['YEAR_INT'] = query_year
print(f"Query year dataset={query_year}; type={type(query_year)}")

df_wells_sql['report_mo'] = 99
df_wells_sql['api_no_int'] = df_wells_sql['api_no'].astype(int)
progress.value += 1


Query year dataset=2017; type=<class 'int'>


In [214]:
# this is the list of columns to insert into the wells table
wells_cols = list(df_wells_sql.columns)
wells_cols

['PWT__ID',
 'field_cd',
 'area_cd',
 'api_no',
 'loc_section',
 'loc_township',
 'loc_range',
 'loc_bm',
 'operator_cd',
 'lease_name',
 'well_number',
 'field_name',
 'area_name',
 'operator_name',
 'report_type',
 'county',
 'pool_cd',
 'well_type_cd',
 'pool_name',
 'well_status_cd',
 'well_type_no',
 'operator_status_cd',
 'dist_no',
 'api_key',
 'oil_prod_vol_bbl',
 'gas_prod_vol_mcf',
 'gas_inj_vol_mcf',
 'water_inj_vol_bbl',
 'active_days',
 'last_well_mo',
 'prod_inj_cd',
 'well_status_cng',
 'well_type_cng',
 'YEAR_INT',
 'report_mo',
 'api_no_int']

In [215]:
mv_cols = list(df_sql.columns)
mv_cols

['api_no',
 'api_no_int',
 'prod_inj_date',
 'vol_month',
 'vol_year',
 'well_status_no',
 'well_status',
 'well_type_cd',
 'well_status_cd',
 'casing_psi',
 'tubing_psi',
 'gas_btu',
 'well_mo',
 'oil_api_grav',
 'water_disposition',
 'oil_prod_vol_bbl',
 'prod_days',
 'gas_prod_vol_mcf',
 'water_prod_vol_bbl',
 'gas_inj_vol_mcf',
 'water_inj_vol_bbl',
 'inj_days',
 'surf_inj_press_psi',
 'water_source',
 'water_kind',
 'rep_or_est',
 'rep_or_est_cd',
 'PWT__ID',
 'active_days',
 'api_key']

In [216]:
# After inserting a new well/year row into "wells". use "mycursor.lastrowid" or "connection.insert_id()" to get the Last inserted ID for the monthly_well_volumes table
mysql_dbname = "doggr"
mysql_well_tablename ='wells'
mysql_mv_tablename = 'monthly_well_volume'

# Connect to the mysql server to add the data
# SQLAlchemy uses a database driver to connect, in this case I'll use mysql-connector-python
conn_str = f"mysql+mysqlconnector://pythonuser:pythonuser@localhost/{mysql_dbname}"
engine = create_engine(conn_str, echo=True)
progress.value += 1

### Add the master record to `doggr_file` table
### Run the MYSQL Insert queries for the well records in `df_wells`
This helps track which original data sources each set originated from

In [217]:
query_year

2017

In [218]:
sql = f"INSERT INTO doggr_file (file_name, well_count) VALUES('{mssql_db_name}','{df_wells['api_no'].nunique()}')"

# use `begin()` to start a transaction, which is then used in all the chained queries below.
with engine.begin() as conn:

    try:
        # insert the source file record
        doggr_file_id = conn.execute(text(sql)).lastrowid
        df_wells_sql['doggr_file_id'] = doggr_file_id
        progress.value += 2

        # insert all the well records
        df_wells_sql.to_sql(name=mysql_well_tablename, if_exists='append', index=False, index_label='WELL_ID', chunksize=10000, con=conn )
        progress.value += 5

        result = conn.execute(
            text(f"SELECT well_id, PWT__ID FROM {mysql_well_tablename} WHERE year_int=:year_int"), {"year_int":query_year})

        # Get the well_id list by api_key
        wellid_pwtid_df = pd.DataFrame(result.fetchall(), columns=result.keys())
        progress.value += 5

        # attach the well_id to the monthly volumes
        df_sql = pd.merge( df_sql, wellid_pwtid_df, how='left', on='PWT__ID')

        # insert all the monthly volume data
        df_sql.to_sql(name=mysql_mv_tablename, if_exists='append', index=False, index_label='well_id', chunksize=10000, con=conn )

        conn.commit()
        progress.value += 5

    except Exception as e:
        conn.rollback()
        print(f"SQL error: {e}")
        raise



2025-02-05 12:17:06,833 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2025-02-05 12:17:06,833 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-02-05 12:17:06,835 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2025-02-05 12:17:06,835 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-02-05 12:17:06,836 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2025-02-05 12:17:06,837 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-02-05 12:17:06,838 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-02-05 12:17:06,840 INFO sqlalchemy.engine.Engine INSERT INTO doggr_file (file_name, well_count) VALUES('WellProductionInjectionLegacy','188879')
2025-02-05 12:17:06,840 INFO sqlalchemy.engine.Engine [generated in 0.00095s] {}
2025-02-05 12:17:07,069 INFO sqlalchemy.engine.Engine DESCRIBE `doggr`.`wells`
2025-02-05 12:17:07,070 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-02-05 12:17:07,889 INFO sqlalchemy.engine.Engine INSERT INTO wells (`PWT__ID`, field_cd, area_cd, api_no, loc_sectio

In [219]:
progress.value = 100.0

In [220]:
wellid_pwtid_df

Unnamed: 0,well_id,PWT__ID
0,13555180,100000001
1,13555181,100000002
2,13555182,100000003
3,13555183,100000004
4,13555184,100000005
...,...,...
277292,13832686,100320508
277293,13832687,100320509
277294,13832688,100320510
277295,13832689,100320511
