# Advanced Data Processing for DCA

This notebook demonstrates advanced data processing techniques for preparing production data for decline curve analysis.

## What You'll Learn
- Clean and filter production data
- Handle multi-well datasets
- Calculate derived metrics (water cut, GOR, days online)
- Detect production anomalies
- Prepare data for forecasting
- Forecast with the Arps model as a baseline for comparing other models (ARIMA, TimesFM)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from decline_curve import dca
from decline_curve.utils import data_processing as dp

# Configure logging
import logging
from decline_curve.logging_config import configure_logging, get_logger

# Initialise logging at INFO level through the project's helper; handler and
# format details live in decline_curve.logging_config and are not visible here.
configure_logging(level=logging.INFO)
# Logger used by every cell below in place of print().
logger = get_logger(__name__)

# Apply one style sheet to all figures created in this notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require
# matplotlib >= 3.6 — confirm against the pinned environment.
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline


## 1. Load and Explore Real Bakken Data

In [None]:
# Load the Bakken well data and parse the report date so that date arithmetic
# and chronological plotting work in the cells below.
df = pd.read_csv('data/bakken_well_production.csv')
df['ReportDate'] = pd.to_datetime(df['ReportDate'])

logger.info("Raw Data Overview:")
logger.info(f"Total records: {len(df)}")
logger.info(f"Columns: {df.columns.tolist()}")
# dtypes and the null counts render as multi-line tables; start them on their
# own line so the first row is not glued to the label.
logger.info(f"Data types:\n{df.dtypes}")
logger.info(f"Missing values:\n{df.isnull().sum()}")

# Last expression of the cell: rich preview of the raw frame.
df.head()

## 2. Data Cleaning and Filtering

In [None]:
# Remove records with zero or null oil production.
# .copy() makes df_clean an independent frame: later cells add columns to it,
# and if the helper returns a filtered slice of df (not visible from here),
# those assignments would trigger pandas' SettingWithCopyWarning.
df_clean = dp.remove_nan_and_zeroes(df, 'Oil').copy()

records_before = len(df)
records_after = len(df_clean)
logger.info(f"Records before cleaning: {records_before}")
logger.info(f"Records after cleaning: {records_after}")
logger.info(f"Records removed: {records_before - records_after}")

# Check for any remaining data quality issues: negative volumes are reported
# here but not removed.
logger.info("Data Quality Checks:")
logger.info(f"  Negative oil values: {(df_clean['Oil'] < 0).sum()}")
logger.info(f"  Negative water values: {(df_clean['Wtr'] < 0).sum()}")
logger.info(f"  Negative gas values: {(df_clean['Gas'] < 0).sum()}")

## 3. Calculate Derived Metrics

In [None]:
# Days online: minimum ReportDate grouped by API_WELLNO becomes the online
# date, which calculate_days_online then reads back from the frame.
online_dates = dp.get_grouped_min_max(df_clean, 'API_WELLNO', 'ReportDate', 'min')
df_clean['Online_Date'] = online_dates
days_online = dp.calculate_days_online(df_clean, 'ReportDate', 'Online_Date')
df_clean['Days_Online'] = days_online

# Daily rates: normalise each volume column by the 'Days' column.
rate_sources = (
    ('oil_rate', 'Oil'),
    ('water_rate', 'Wtr'),
    ('gas_rate', 'Gas'),
)
for rate_column, volume_column in rate_sources:
    df_clean[rate_column] = dp.normalize_production_to_daily(
        df_clean, volume_column, 'Days'
    )

# Fluid ratios, computed from the reported volumes.
df_clean['water_cut'] = dp.calculate_water_cut(df_clean, 'Oil', 'Wtr')
df_clean['gor'] = dp.calculate_gor(df_clean, 'Gas', 'Oil')

# Running oil total, grouped by API_WELLNO.
cumulative_oil = dp.calculate_cumulative_production(df_clean, 'Oil', 'API_WELLNO')
df_clean['cum_oil'] = cumulative_oil

# Summary statistics for the new columns, rounded to two decimals.
summary_columns = ['oil_rate', 'water_cut', 'gor', 'Days_Online', 'cum_oil']
metrics_summary = df_clean[summary_columns].describe().round(2)
logger.info("Derived Metrics Summary:")
logger.info(metrics_summary)

df_clean.head()

## 4. Visualize Derived Metrics

In [None]:
# 2x2 dashboard of the derived metrics.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(rate_ax, water_cut_ax), (gor_ax, cum_ax) = axes

# Top-left: oil rate against days online (scatter).
rate_ax.scatter(df_clean['Days_Online'], df_clean['oil_rate'], alpha=0.6)
rate_ax.set_xlabel('Days Online')
rate_ax.set_ylabel('Oil Rate (bbl/day)')
rate_ax.set_title('Oil Rate Decline')

# The other three panels all plot one column against the report date:
# (axes, column, line format, line kwargs, y-label, title).
report_dates = df_clean['ReportDate']
date_panels = [
    (water_cut_ax, 'water_cut', 'o-', {'markersize': 4},
     'Water Cut (%)', 'Water Cut Trend'),
    (gor_ax, 'gor', 'o-', {'markersize': 4, 'color': 'red'},
     'GOR (mcf/bbl)', 'Gas-Oil Ratio'),
    (cum_ax, 'cum_oil', '-', {'linewidth': 2, 'color': 'green'},
     'Cumulative Oil (bbl)', 'Cumulative Production'),
]
for panel, column, line_format, line_kwargs, y_label, panel_title in date_panels:
    panel.plot(report_dates, df_clean[column], line_format, **line_kwargs)
    panel.set_xlabel('Date')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)

# Same light grid on every panel.
for panel in axes.flat:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Detect Production Anomalies

In [None]:
# Prepare a chronological time series for ONE well.
# df_clean can hold several wells (it is grouped by API_WELLNO above); feeding
# the whole frame into the detector would interleave wells and repeat dates,
# so the statistics and the line plot would not describe any real well.
# The well chosen is the one on the first row of df_clean.
anomaly_well_id = df_clean['API_WELLNO'].iloc[0]
oil_series = (
    df_clean.loc[df_clean['API_WELLNO'] == anomaly_well_id]
    .set_index('ReportDate')['Oil']
    .sort_index()
)

# Detect anomalies; the result is used below as a boolean mask over oil_series.
anomalies = dp.detect_production_anomalies(oil_series, threshold_std=2.5)
n_records = len(oil_series)
n_anomalies = int(anomalies.sum())
# Guard the percentage against an empty series.
anomaly_rate = n_anomalies / n_records * 100 if n_records else 0.0
anomalous_values = oil_series[anomalies]

logger.info("Anomaly Detection Results:")
logger.info(f"  Well ID: {anomaly_well_id}")
logger.info(f"  Total records: {n_records}")
logger.info(f"  Anomalies detected: {n_anomalies}")
logger.info(f"  Anomaly rate: {anomaly_rate:.1f}%")

if n_anomalies > 0:
    logger.info("Anomalous Production Values:")
    logger.info(anomalous_values)

# Visualize anomalies: full series as a line, flagged points in red on top.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(oil_series.index, oil_series.values, 'o-', label='Production', markersize=4)
ax.plot(anomalous_values.index, anomalous_values.values, 'ro',
        label='Anomalies', markersize=8, zorder=5)
ax.set_xlabel('Date')
ax.set_ylabel('Oil Production (bbl/month)')
ax.set_title('Production with Anomaly Detection')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Prepare Data for DCA (Convenience Function)

In [None]:
# Forecast target: the well found on the first row of the cleaned frame.
well_id = df_clean['API_WELLNO'].iloc[0]

# One-call preparation from the RAW frame `df`; the helper is asked to drop
# zero-production records itself (remove_zeros=True).
prep_options = {
    'well_id': well_id,
    'well_column': 'API_WELLNO',
    'date_column': 'ReportDate',
    'production_column': 'Oil',
    'remove_zeros': True,
}
oil_series_clean = dp.prepare_well_data_for_dca(df, **prep_options)

logger.info("Prepared Time Series for DCA:")
logger.info(f"  Well ID: {well_id}")
logger.info(f"  Data points: {len(oil_series_clean)}")
first_date = oil_series_clean.index[0]
last_date = oil_series_clean.index[-1]
logger.info(f"  Date range: {first_date} to {last_date}")
total_oil = oil_series_clean.sum()
logger.info(f"  Total production: {total_oil:,.0f} bbl")
logger.info("First few values:")
logger.info(oil_series_clean.head())

## 7. Get Maximum Initial Production (qi)

In [None]:
# Number of leading months inspected for the peak rate (handles ramp-up).
N_INITIAL_MONTHS = 3

# Work on the history of the well being forecast (well_id, chosen in the
# previous section), ordered by date. df_clean may contain several wells and
# keeps the CSV's row order, so positional access such as .iloc[0] / .head()
# would otherwise not be guaranteed to mean "first month of this well".
well_history = (
    df_clean.loc[df_clean['API_WELLNO'] == well_id]
    .sort_values('ReportDate')
)

# Calculate qi from the first N_INITIAL_MONTHS months
qi = dp.get_max_initial_production(
    well_history,
    n_months=N_INITIAL_MONTHS,
    production_column='Oil',
    date_column='ReportDate'
)

logger.info("Initial Production Rate (qi):")
logger.info(f"  Maximum in first {N_INITIAL_MONTHS} months: {qi:,.0f} bbl/month")
logger.info(f"  First month production: {well_history['Oil'].iloc[0]:,.0f} bbl/month")
logger.info(f"Using qi = {qi:,.0f} bbl/month for decline curve fitting")

# Show the months that qi was derived from
logger.info(f"First {N_INITIAL_MONTHS} months production:")
logger.info(well_history[['ReportDate', 'Oil']].head(N_INITIAL_MONTHS))

## 8. Run DCA with Prepared Data

In [None]:
# Arps hyperbolic forecast on the prepared series, with a horizon of 24.
forecast_options = {'model': 'arps', 'kind': 'hyperbolic', 'horizon': 24}
forecast = dca.forecast(oil_series_clean, **forecast_options)

# Score the forecast against the prepared history.
metrics = dca.evaluate(oil_series_clean, forecast)

# Report each metric as (label, key in metrics, number format, unit suffix).
logger.info("Forecast Results:")
metric_report = (
    ('RMSE', 'rmse', '.0f', ' bbl/month'),
    ('MAE', 'mae', '.0f', ' bbl/month'),
    ('SMAPE', 'smape', '.1f', '%'),
)
for label, key, number_format, unit in metric_report:
    logger.info(f"  {label}: {metrics[key]:{number_format}}{unit}")

# Plot history and forecast; the figure is also written to the given filename.
plot_title = 'Bakken Well Forecast (Processed Data)'
plot_path = 'bakken_forecast_processed.png'
dca.plot(oil_series_clean, forecast, title=plot_title, filename=plot_path)

## Summary

In this notebook, we demonstrated:
1. ✓ Loading and exploring raw production data
2. ✓ Cleaning and filtering data
3. ✓ Calculating derived metrics (water cut, GOR, days online)
4. ✓ Detecting production anomalies
5. ✓ Using convenience functions to prepare data for DCA
6. ✓ Calculating initial production rate (qi)
7. ✓ Running decline curve analysis
8. ✓ Establishing an Arps baseline for comparison with other forecasting models (ARIMA, TimesFM)

## Key Takeaways

- **Data quality matters**: Always clean and validate production data
- **Derived metrics**: Water cut and GOR provide insights into well performance
- **Anomaly detection**: Identify operational issues or data errors
- **Convenience functions**: Use `prepare_well_data_for_dca()` for quick setup
- **Multiple models**: Compare Arps, ARIMA, and TimesFM for best results

## Next Steps

- Apply these techniques to your own production data
- Experiment with different anomaly detection thresholds
- Try ARIMA or TimesFM for complex patterns
- Batch process multiple wells using these utilities