# Python for Drilling Engineers - Module 3
## 1. 🔁 Recap & Today's Agenda
Let's quickly review what we covered in Lesson 2:
- Sorting, filtering, basic dataframe operations


## Today's Objectives
- DataFrames Continued...
  - Grouping & Aggregation
  - KPI Calculations with Groups
  - Slicing with `.loc[]`
- Data QA/QC Techniques
  - Use .isna(), .duplicated() for checks
  - Define your own thresholds (ex. ROP > 5000 fph = likely error)
  - Remove null placeholders from datasets (ex. -999.99)
  - IQR Outlier Removal
- Pandas Profiling Library/Reports
- **Bonus**: ML Concepts Overview

...but first, let's load our dataset.



In [None]:
import pandas as pd
# Raw bit-run records, stored as parallel lists: position i in every list below
# describes the same run. None marks a value that is missing for that run.

# Run identifiers (not consecutive).
run_number = [4, 5, 6, 7, 10, 11, 13, 15, 16, 17, 18, 23, 26, 27, 28, 30, 31, 32, 33]
# Run start/end timestamps, kept as "M/D/YYYY H:MM" strings (not parsed to datetimes here).
start_time = ["10/30/2020 3:20", "11/4/2020 8:49", "11/7/2020 16:38", "11/9/2020 22:41", "11/14/2020 11:43",
              "11/15/2020 2:46", "11/20/2020 0:20", "11/24/2020 2:46", "11/25/2020 23:21", "11/26/2020 22:57",
              "11/28/2020 14:49", "12/3/2020 23:23", "12/7/2020 3:30", "12/8/2020 20:34", "12/9/2020 13:17",
              "12/12/2020 12:35", "12/13/2020 11:44", "12/17/2020 0:21", "12/18/2020 10:01"]
end_time = ["10/31/2020 4:27", "11/7/2020 6:25", "11/8/2020 22:54", "11/10/2020 14:58", "11/14/2020 16:20",
            "11/16/2020 13:28", "11/22/2020 6:35", "11/24/2020 18:30", "11/26/2020 10:40", "11/27/2020 19:57",
            "11/29/2020 9:39", "12/5/2020 6:59", "12/7/2020 23:55", "12/9/2020 0:21", "12/9/2020 15:31",
            "12/13/2020 0:54", "12/14/2020 8:19", "12/17/2020 18:38", "12/18/2020 23:58"]
# Run duration; the first value matches end_time - start_time expressed in hours.
run_duration = [25.11666667, 69.59722222, 30.26666667, 16.28333333, 4.616666667, 34.70277778, 54.25, 15.73333333,
                11.31666667, 21, 18.83333333, 31.6, 20.41666667, 3.783333333, 2.233333333, 12.31666667, 20.58333333,
                18.28333333, 13.95]
# Depth at the start/end of each run (presumably feet -- TODO confirm units).
start_depth = [120.95001, 1629.09, 4552, 4964.7676, 5112, 5112.0776, 5505.0513, 5892.058, 6360.5713, 6527.22,
               6945.0454, 7389, 8024.0015, 8242.251, 8392.4375, 8535.091, 9064.573, 9747.119, 10490.042]
end_depth = [1629.0634, 4556.19, 4964.3687, 5113.364, 5379.8945, 5472.668, 5855.826, 6360.453, 6526.268, 6944.9404,
             7394.7295, 8024.3887, 8241.282, 8391.413, 8540.855, 9064.383, 9747.942, 10490.022, 10960.597]
# Footage drilled per run; the first value equals end_depth - start_depth for that run.
run_length = [1508.11339, 2927.1, 412.3687, 148.5964, 267.8945, 360.5904, 350.7747, 468.395, 165.6967, 417.7204,
              449.6841, 635.3887, 217.2805, 149.162, 148.4175, 529.292, 683.369, 742.903, 470.555]
# Bit description. The 7th entry (run 13) has no bit, motor or RSS information at all.
bit_make = ["NOV", "NOV", "Smith", "Smith", "Smith", "Ulterra", None, "NOV", "NOV", "NOV", "NOV", "NOV",
            "NOV", "NOV", "NOV", "NOV", "NOV", "NOV", "NOV"]
bit_model = ["TKC76", "TKC66", "MDSi616", "Z713S", "XS616", "U616M", None, "TKC63", "SKC613M", "SKC513M",
             "FTKC63-01", "TKC63", "SKC513M", "SKC613M", "SKC613M", "TKC63", "FTKC63-01", "TKC63", "TKC63"]
# Bit outer diameter (presumably inches -- TODO confirm units).
bit_od = [17.5, 12.25, 12.25, 12.25, 12.25, 12.25, None, 8.75, 8.75, 8.75, 8.75, 8.75, 8.75, 8.75, 8.75, 8.75,
          8.75, 8.75, 8.75]
# Whether a motor was run, followed by its make, outer diameter and configuration string.
motor = [False, True, True, True, True, True, None, True, True, True, True, True, True, True, True, True, True, True, True]
motor_make = [None, "Scout", "Scout", "Scout", "Scout", "Scout", None, "Scout", "Scout", "Scout", "Scout", "Scout",
              "Scout", "Scout", "Scout", "Scout", "Scout", "Scout", "Scout"]
motor_od = [None, 9.625, 9.625, 9.625, 9.625, 9.625, None, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5]
motor_config = [None, "7/8-5.9", "7/8-5.9", "7/8-5.9", "7/8-5.9", "7/8-3.0", None, "7/8-5.7", "7/8-5.7", "7/8-5.7",
                "7/8-5.7", "7/8-5.7", "7/8-5.7", "7/8-5.7", "7/8-5.7", "7/8-5.7", "7/8-5.7", "7/8-5.7", "7/8-5.7"]
# RSS flag and make (presumably "rotary steerable system" -- TODO confirm); make is None wherever rss is False or missing.
rss = [True, True, True, True, True, True, None, False, False, False, False, False, False, False, False, False,
       False, False, False]
rss_make = ["Scout Vertical", "Scout Vertical", "Scout Vertical", "Scout Vertical", "Scout Vertical", "Scout Vertical",
            None, None, None, None, None, None, None, None, None, None, None, None, None]

# Map each column name to its list of per-run values.
# Keyword order here becomes the column order of the DataFrame.
bit_run_dict = dict(
    run_number=run_number,
    start_time=start_time,
    end_time=end_time,
    run_duration=run_duration,
    start_depth=start_depth,
    end_depth=end_depth,
    run_length=run_length,
    bit_make=bit_make,
    bit_model=bit_model,
    bit_od=bit_od,
    motor=motor,
    motor_make=motor_make,
    motor_od=motor_od,
    motor_config=motor_config,
    rss=rss,
    rss_make=rss_make,
)

# One row per bit run.
bit_run_df = pd.DataFrame(data=bit_run_dict)

# Average rate of penetration for each run: length drilled divided by run duration.
bit_run_df['avg_rop'] = bit_run_df['run_length'].div(bit_run_df['run_duration'])

## 2 🔹 Grouping with `.groupby()`

Grouping allows us to split our data into segments based on a column value, then perform calculations on each group independently.

### 🔧 Syntax
```python
df.groupby("column_name")
```


In [None]:
# Count how many rows (bit runs) belong to each bit manufacturer.
# groupby drops missing keys by default, so the run without a bit_make is not counted.
bit_run_df.groupby('bit_make').size()

In [None]:
# Same per-manufacturer count, reshaped into a regular two-column DataFrame
# ('bit_make', 'count') instead of a Series indexed by bit_make.
bit_make_counts = (
    bit_run_df
    .groupby(by='bit_make')
    .size()
    .rename('count')
    .reset_index()
)
bit_make_counts

In [None]:
# Count runs for every (bit size, bit manufacturer) combination present in the data.
bit_runs_grouped = (
    bit_run_df
    .groupby(by=['bit_od', 'bit_make'])
    .size()
    .reset_index(name='count')
)
bit_runs_grouped

## 3 📊 Aggregating Multiple Metrics with `.agg()`

Aggregation is how we apply multiple statistical functions to grouped data. This is where `.agg()` shines.

### 🔧 Syntax
```python
df.groupby("bit_model").agg({
    "rop": "mean",
    "torque": "max",
    "wob": "std"
})
```

### 📌 Why It Matters
You can quickly create summary reports that show how different combinations of equipment perform:
- Which bit models produce the highest ROP?
- Which motor configurations generate the most torque?
- Where is the variability the highest?

Hands-on: Use `.agg()` to summarize 2-3 performance metrics for a key grouping (e.g., `run_number`, `bit_make`, or `interval_type`)


In [None]:
# Summarize each (bit size, bit manufacturer) group with three KPIs:
# number of runs, mean run duration and mean run length.
runs_by_size_and_make = bit_run_df.groupby(by=['bit_od', 'bit_make'])
bit_runs_grouped = runs_by_size_and_make.agg(
    count=pd.NamedAgg(column='run_number', aggfunc='size'),
    avg_run_duration=pd.NamedAgg(column='run_duration', aggfunc='mean'),
    avg_run_length=pd.NamedAgg(column='run_length', aggfunc='mean'),
)
# Turn the group keys back into ordinary columns.
bit_runs_grouped = bit_runs_grouped.reset_index()
bit_runs_grouped

**Now you try**

Group by bit_od and bit_make. Then calculate avg_rop for each group.

In [None]:
# Type your code here

## 4 🔍 Slicing Data with `.loc[]`

`.loc[]` allows us to filter rows and select specific columns, all in one go.

### 🔧 Syntax
```python
df.loc[condition, ["column1", "column2"]]
```

### 📌 Example
Select only the runs with large bits (`bit_od` greater than 8.5) and keep four columns:
```python
df.loc[df["bit_od"] > 8.5, ["bit_model", "avg_rop", "run_duration", "run_length"]]
```

You can also combine multiple conditions using `&` (and) or `|` (or):
```python
df.loc[(df["motor"] == True) & (df["formation"] == "Wolfcamp")]
```

Use `.loc[]` when you want precision and control while filtering or subsetting your dataset.


In [None]:
# Boolean mask: True for the runs drilled with an 8.75 bit.
screen = bit_run_df['bit_od'].eq(8.75)
# Rows picked by the mask, restricted to the 'bit_make' column.
bit_run_df.loc[screen, 'bit_make']

In [None]:
# Boolean mask: True for the runs drilled with an 8.75 bit.
screen = bit_run_df['bit_od'].eq(8.75)
# .loc on the left-hand side writes into the DataFrame: every row selected by
# the mask gets this bit_make, whatever value it held before.
bit_run_df.loc[screen, 'bit_make'] = 'National Oilwell Varco'
bit_run_df

## 5 🧪 Data QA/QC: Catching Dirty Data Before It Catches You

Before diving into analysis or modeling, you must understand the quality of your data.

### ⚠️ Common Issues
- Missing values (NaNs)
- Duplicate rows
- Out-of-range or impossible values (e.g., RPM > 1000)

### 🔧 Tools for QA/QC
```python
df.isna().sum()             # Count missing values
df.duplicated().sum()       # Count duplicates
df.describe()               # Quick sanity check on ranges
# IQR outlier removal       # Remove outliers with the inter-quartile-range method (shown later in this lesson)
```

### 🧠 Drilling-Specific QC Ideas
- Is depth ever negative?
- Are there values beyond what’s physically possible?
- Does every run have a start and end time?

Hands-on: Write a few simple checks to flag suspect rows. For example:
```python
df[df["rpm"] > 500]
```

Think of this as debugging your dataset before launching your analysis.


Import on_btm_df locally.

In [None]:
import os

import pandas as pd

# The CSV is expected to sit in the directory the notebook is running from.
file_name = 'on_btm_df.csv'
current_directory = os.getcwd()
file_path = os.path.join(current_directory, file_name)

# Show the resolved path so a wrong working directory is easy to spot.
print(file_path)

# Read the CSV into a DataFrame.
on_btm_df = pd.read_csv(file_path)

Or, if you are running in Google Colab, import on_btm_df from Google Drive.

In [None]:
# Google Colab only: mount Google Drive and read on_btm_df.csv from it.
from google.colab import drive
import pandas as pd
# Make the Drive contents available under /content/drive.
drive.mount('/content/drive')

file_name = 'on_btm_df.csv'  # Replace with your file name once uploaded to Google Drive

# NOTE(review): the folder in this path is module_2 although this notebook is Module 3 -- confirm the file location.
file_path = f'/content/drive/My Drive/python-for-drilling-engineers/module_2/{file_name}'

# Read the CSV into a DataFrame.
on_btm_df = pd.read_csv(file_path)

### Missing Data, Duplicate Data, Quick Checks
```python
df.info()                   # Quick column check and null counts
df.isna().sum()             # Count missing values
df.duplicated().sum()       # Count duplicates
df.describe()               # Quick sanity check on ranges
```

In [None]:
# Column names, dtypes and non-null counts: a quick way to spot columns with missing data.
on_btm_df.info()

In [None]:
# Number of missing values in each column.
on_btm_df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
on_btm_df.duplicated().sum()

In [None]:
# Summary statistics per numeric column; check min/max for values that are out of range.
on_btm_df.describe()

#### Removing Null Values and -999's

In [None]:
# Replace null values with 0 in every drilling-parameter column listed below
columns_to_replace = ['rop', 'wob', 'diff_press', 'td_rpm', 'td_torque']
# Index with the list variable so all five columns are selected at once, and
# assign the result back: fillna(inplace=True) on a column selection may act
# on a temporary copy and leave on_btm_df unchanged.
on_btm_df[columns_to_replace] = on_btm_df[columns_to_replace].fillna(0)
# Verify that the null values have been replaced
on_btm_df.info()  # The non-null counts of the listed columns should now equal the row count

In [None]:
# -999.99 is a "no data" placeholder, not a real measurement.
# Overwrite it with None (stored as a missing value) in 'rop' and 'wob'.
for col in ('rop', 'wob'):
    on_btm_df.loc[on_btm_df[col] == -999.99, col] = None

# An alternative is Series.replace on each column, with the result assigned back to the column.

# The min values of 'rop' and 'wob' should no longer show the placeholder.
on_btm_df.describe()

###  Remove Outliers with IQR Method

First, let's filter the dataframe down to just the runs we want to look at (18, 23, 26) and to rotating-only data.

In [None]:
param_name_list = ['rop', 'wob', 'td_rpm', 'td_torque', 'diff_press']  # Parameters screened for outliers later
rpm_rotating_thresh = 40  # RPM threshold for rotating
torque_rotating_thresh = 5000  # Torque threshold for rotating
run_number_list = [18, 23, 26]  # List of run numbers to analyze

# Keep only rows that belong to one of the selected runs...
run_screen = on_btm_df['run_number'].isin(run_number_list)
# ...and where both top-drive RPM and torque exceed their thresholds (treated as "rotating")
rotating_screen = (on_btm_df['td_rpm'] > rpm_rotating_thresh) & (on_btm_df['td_torque'] > torque_rotating_thresh)
# .copy() makes run_analysis_df independent of on_btm_df, so later edits to it
# neither touch the source data nor trigger chained-assignment warnings
run_analysis_df = on_btm_df[run_screen & rotating_screen].copy()

print("\n\n Filtered DataFrame before Outlier Removal:")
run_analysis_df.describe()  # Display the statistics of the filtered DataFrame before outlier removal

In [None]:
def remove_outliers_per_column(df, column_names, iqr_multiplier=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    Missing values in ``column_names`` are first replaced with 0. The columns
    are then processed in order: for each one, the quartiles are computed on
    the rows that survived the previous columns, and rows outside
    ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]`` are removed.
    Because the filtering is sequential, the order of ``column_names`` can
    affect which rows remain.

    Args:
        df: DataFrame to clean. It is not modified.
        column_names: Labels of the numeric columns to screen.
        iqr_multiplier: Width of the fences in IQR units (default 1.5).

    Returns:
        A new DataFrame holding only the rows that passed every column's
        check; surviving rows keep their original index labels.
    """
    columns = list(column_names)

    # Work on a copy so the fill below never leaks into the caller's DataFrame
    cleaned = df.copy()
    # Fill NaNs once for all selected columns
    cleaned[columns] = cleaned[columns].fillna(0)

    for col in columns:
        q1 = cleaned[col].quantile(0.25)
        q3 = cleaned[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr
        # between() is inclusive on both ends, i.e. lower_bound <= value <= upper_bound
        cleaned = cleaned[cleaned[col].between(lower_bound, upper_bound)]

    return cleaned

# Clean each run separately so the IQR fences reflect that run's own data.
# The groups are iterated and concatenated (instead of using groupby.apply) so
# the 'run_number' column stays in the result no matter how the installed
# pandas version treats grouping columns inside apply.
selected_runs_df = run_analysis_df[run_analysis_df['run_number'].isin(run_number_list)]
cleaned_runs = [
    remove_outliers_per_column(run_df.copy(), param_name_list)
    for _, run_df in selected_runs_df.groupby('run_number')
]
# pd.concat raises on an empty list, so fall back to an empty frame with the same columns
all_run_df = pd.concat(cleaned_runs) if cleaned_runs else selected_runs_df.iloc[0:0]

print("\n\n Filtered DataFrame after Outlier Removal:")
all_run_df.describe()  # Display the statistics of the filtered DataFrame after outlier removal

In [None]:
# Write the cleaned runs to a CSV in the working directory, leaving out the row index.
all_run_df.to_csv(path_or_buf='all_run_df.csv', index=False)

#### Pandas Profiling Library

In [None]:
# ydata-profiling is a third-party package; on Google Colab it must be installed before the import below.
# The leading "!" runs the command in a shell (IPython/Jupyter syntax, not plain Python).
!pip install ydata-profiling

# Generate a profile report
from ydata_profiling import ProfileReport
profile = ProfileReport(on_btm_df, title="Forge 16A Data Analysis", explorative=True)
# Show the report inline in the notebook
profile.to_notebook_iframe()
# Save the profile report to an HTML file
profile.to_file(output_file="forge_16A_on_btm_report.html")