In [14]:
import camelot
import pandas as pd
# Path to the PDF file
pdf_path = "https://www.dmr.nd.gov/oilgas/stats/historicaloilprodstats.pdf"

# Extract tables from all pages
tables = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')

combine_tables = pd.concat([table.df for table in tables])
header = ["Year", "Month", "BBLS Oil", "Daily Oil", "Wells Producing", "BBLS Per Well", "Daily Oil Per Well"]
combine_tables.rename(columns={col: header[i] for i, col in enumerate(combine_tables.columns)}, inplace = True)
combine_tables["Date"] = pd.to_datetime(combine_tables["Year"].astype(str) + "-" + combine_tables["Month"].astype(str) + "-01")
del combine_tables["Year"]
del combine_tables["Month"]
for key, col in combine_tables.items():
    if key in ["Date"]:
        continue
    combine_tables[key] = pd.to_numeric(combine_tables[key], errors='coerce')

In [18]:
combine_tables

Unnamed: 0,BBLS Oil,Daily Oil,Wells Producing,BBLS Per Well,Daily Oil Per Well,Date
0,3092,103,1,3092,103,1951-04-01
1,2350,76,1,2350,76,1951-07-01
2,6259,202,1,6259,202,1951-08-01
3,3508,117,1,3508,117,1951-09-01
4,4129,133,1,4129,133,1951-10-01
...,...,...,...,...,...,...
31,36428455,1256154,18347,1986,68,2024-02-01
32,38211822,1232639,18456,2070,67,2024-03-01
33,37324157,1244139,18561,2011,67,2024-04-01
34,37140657,1198086,18599,1997,64,2024-05-01


In [20]:
mean_tables = combine_tables.set_index("Date").resample("Y").mean()
annual = combine_tables.set_index("Date").resample("Y").sum()
annual["Daily Oil"] = mean_tables["Daily Oil"]
annual["Wells Producing"] = mean_tables["Wells Producing"]
annual["BBLS Per Well"] = annual["BBLS Oil"] / annual["Wells Producing"]
annual["Daily Oil Per Well"] = annual["Daily Oil"] / annual["Wells Producing"]
annual

Unnamed: 0_level_0,BBLS Oil,Daily Oil,Wells Producing,BBLS Per Well,Daily Oil Per Well
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1951-12-31,26196,1.220000e+02,1.000000,26196.000000,122.000000
1952-12-31,1603555,4.369333e+03,34.416667,46592.397094,126.953995
1953-12-31,5275831,1.447767e+04,184.333333,28621.144665,78.540687
1954-12-31,6024947,1.649000e+04,350.583333,17185.491799,47.035893
1955-12-31,11183607,3.064517e+04,525.666667,21275.092581,58.297717
...,...,...,...,...,...
2020-12-31,438683034,1.199372e+06,14711.833333,29818.379808,81.524328
2021-12-31,409104098,1.120669e+06,16306.750000,25088.021709,68.724214
2022-12-31,390326798,1.069470e+06,16881.083333,23122.141529,63.353132
2023-12-31,435294884,1.192620e+06,17697.916667,24595.826288,67.387597


In [23]:
import plotly.express as px
plot_df = annual.loc["2005":]
fig = px.line(plot_df, x=plot_df.index, y="Daily Oil", title="Daily Oil Production in North Dakota")
fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result

