In [13]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [14]:
# Load the wells CSV into `df`, then preview the first five rows.
csv_path = "synthetic_permian_wells_with_realistic_cost.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,api_number,operator,lateral_length_ft,total_depth_ft,porosity,permeability_mD,net_pay_ft,TOC,vshale,drill_cost_usd,...,month_55,month_56,month_57,month_58,month_59,month_60,cum_oil_12mo,drill_decision,drill_cost_usd_new,log_cost
0,0,OperatorC,11340.686945,14349.990327,0.080152,0.551225,54.930159,2.501479,0.336377,6311718.0,...,256.180818,251.5527,247.072775,242.734286,238.530877,234.456558,15218.450558,0,9326363.0,16.048356
1,1,OperatorC,6005.832985,13858.826161,0.066154,5.025962,36.430475,2.391077,0.138618,6584116.0,...,392.357452,388.511591,384.758525,381.094722,377.516833,374.021682,11254.942027,0,6878199.0,15.743867
2,2,OperatorC,8515.333291,13766.635332,0.105921,0.553824,52.517608,2.886481,0.256975,5319743.0,...,838.660309,833.017245,827.446817,821.947648,816.518398,811.157757,14902.974391,0,8572776.0,15.964102
3,3,OperatorB,7101.891433,13821.045026,0.084042,0.43746,60.799513,1.828203,0.131822,6758489.0,...,360.813012,355.084243,349.523852,344.124676,338.879949,333.783274,16693.082216,0,9035522.0,16.016674
4,4,OperatorB,7779.319247,12990.945084,0.111383,0.69614,73.263359,1.546026,0.227605,7489518.0,...,1143.017218,1131.806628,1120.806259,1110.010323,1099.413241,1089.009634,26006.400929,1,7586286.0,15.841853


In [15]:
# Show every column label in df so candidate feature/target names can be checked.
df.columns

Index(['api_number', 'operator', 'lateral_length_ft', 'total_depth_ft',
       'porosity', 'permeability_mD', 'net_pay_ft', 'TOC', 'vshale',
       'drill_cost_usd', 'completion_cost_usd', 'opex_usd', 'q0', 'b', 'd',
       'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
       'month_13', 'month_14', 'month_15', 'month_16', 'month_17', 'month_18',
       'month_19', 'month_20', 'month_21', 'month_22', 'month_23', 'month_24',
       'month_25', 'month_26', 'month_27', 'month_28', 'month_29', 'month_30',
       'month_31', 'month_32', 'month_33', 'month_34', 'month_35', 'month_36',
       'month_37', 'month_38', 'month_39', 'month_40', 'month_41', 'month_42',
       'month_43', 'month_44', 'month_45', 'month_46', 'month_47', 'month_48',
       'month_49', 'month_50', 'month_51', 'month_52', 'month_53', 'month_54',
       'month_55', 'month_56', 'month_57', 'month_58', 'month_59', 'month_60',
  

In [None]:
# Baseline model: random-forest regression of `drill_cost_usd_new`.
# Builds the design matrix, holds out a test split, fits the forest, then
# prints train/test fit quality and the 20 largest feature importances.

target_col = "drill_cost_usd_new"
TEST_SIZE = 0.2   # fraction of rows held out for evaluation
SEED = 42         # shared by the split and the forest so reruns are repeatable
N_TREES = 100     # number of trees in the forest

# Columns that must never be model inputs: the target itself, and `log_cost`,
# which in the df.head() sample rows equals ln(drill_cost_usd_new).
leak_cols = {target_col, "log_cost"}

# validate target
if target_col not in df.columns:
    raise KeyError(f"Target column not in df: {target_col}")

# `features` is not assigned by any earlier cell visible in this notebook, so a
# fresh-kernel "Run All" would stop here with a NameError. Keep a list that
# already exists; otherwise fall back to every column outside the leak set.
if "features" not in globals():
    features = [col for col in df.columns if col not in leak_cols]

# validate features
missing = [f for f in features if f not in df.columns]
if missing:
    raise KeyError(f"Missing features in df: {missing}")

leaked = sorted(leak_cols.intersection(features))
if leaked:
    raise ValueError(f"Features leak the target: {leaked}")

# prepare X: drop the identifier column (if selected) and one-hot encode operator
X = df[features].drop(columns=["api_number"], errors="ignore")
if "operator" in X.columns:
    X = pd.get_dummies(X, columns=["operator"], drop_first=True)

y = df[target_col].copy()

# split, train, evaluate
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

model = RandomForestRegressor(n_estimators=N_TREES, random_state=SEED, n_jobs=-1)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# The saved output of this cell shows train R2 of about 0.91 against test R2 of
# about 0.39, i.e. a large train/test gap.
# NOTE(review): the saved importances include production-history columns
# (month_*, q0, b, d) -- confirm these are known at the time a cost estimate is needed.
print("Train R2:", r2_score(y_train, y_pred_train))
print("Test  R2:", r2_score(y_test, y_pred_test))
print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))

# optional: show top feature importances (indexed by the encoded column names)
feat_imp = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nTop features:\n", feat_imp.head(20))




Train R2: 0.9099681133941322
Test  R2: 0.38758359673897846
Test MAE: 746593.1839747502
Test RMSE: 947635.181438134

Top features:
 lateral_length_ft      0.312848
completion_cost_usd    0.154661
total_depth_ft         0.074414
porosity               0.038794
b                      0.038236
net_pay_ft             0.037645
drill_cost_usd         0.035380
permeability_mD        0.035134
opex_usd               0.034551
vshale                 0.033629
TOC                    0.032244
d                      0.025231
q0                     0.015141
month_1                0.008624
month_2                0.006911
month_3                0.005769
month_4                0.004728
month_5                0.004161
month_8                0.003899
month_6                0.003364
dtype: float64


In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Stand-in linear model: the intercept and weights are typed in by hand rather
# than learned by calling fit(), so the printed value is illustrative only.
# NOTE(review): this rebinds `model`, the same name the training cell uses for
# its RandomForestRegressor -- confirm the overwrite is intended.
model = LinearRegression()
model.intercept_ = 0.8
model.coef_ = np.array([0.4, 0.002, 0.01])

# A single example row for the EUR prediction.
# Column order: [lateral_length (m), force_state (kN), temperature (°C)]
X_new = np.array([[2.5, 150, 23]])

# Score the example row and print its one predicted value
eur_prediction = model.predict(X_new)
print("Predicted EUR value:", eur_prediction[0])


Predicted EUR value: 2.33
