# Best Place for a New Well for OilyGiant

## Introduction

### Prepare the Data

In [10]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [11]:
# Load the data
data_0 = pd.read_csv('https://practicum-content.s3.us-west-1.amazonaws.com/datasets/geo_data_0.csv')
data_1 = pd.read_csv('https://practicum-content.s3.us-west-1.amazonaws.com/datasets/geo_data_1.csv')
data_2 = pd.read_csv('https://practicum-content.s3.us-west-1.amazonaws.com/datasets/geo_data_2.csv')

In [12]:
# Explore the data
data_0.info()
display(data_0.head())
data_1.info()
display(data_1.head())
data_2.info()
display(data_2.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       100000 non-null  object 
 1   f0       100000 non-null  float64
 2   f1       100000 non-null  float64
 3   f2       100000 non-null  float64
 4   product  100000 non-null  float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB


Unnamed: 0,id,f0,f1,f2,product
0,txEyH,0.705745,-0.497823,1.22117,105.280062
1,2acmU,1.334711,-0.340164,4.36508,73.03775
2,409Wp,1.022732,0.15199,1.419926,85.265647
3,iJLyR,-0.032172,0.139033,2.978566,168.620776
4,Xdl7t,1.988431,0.155413,4.751769,154.036647


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       100000 non-null  object 
 1   f0       100000 non-null  float64
 2   f1       100000 non-null  float64
 3   f2       100000 non-null  float64
 4   product  100000 non-null  float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB


Unnamed: 0,id,f0,f1,f2,product
0,kBEdx,-15.001348,-8.276,-0.005876,3.179103
1,62mP7,14.272088,-3.475083,0.999183,26.953261
2,vyE1P,6.263187,-5.948386,5.00116,134.766305
3,KcrkZ,-13.081196,-11.506057,4.999415,137.945408
4,AHL4O,12.702195,-8.147433,5.004363,134.766305


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       100000 non-null  object 
 1   f0       100000 non-null  float64
 2   f1       100000 non-null  float64
 3   f2       100000 non-null  float64
 4   product  100000 non-null  float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB


Unnamed: 0,id,f0,f1,f2,product
0,fwXo0,-1.146987,0.963328,-0.828965,27.758673
1,WJtFt,0.262778,0.269839,-2.530187,56.069697
2,ovLUW,0.194587,0.289035,-5.586433,62.87191
3,q6cA6,2.23606,-0.55376,0.930038,114.572842
4,WPMUX,-0.515993,1.716266,5.899011,149.600746


In [13]:
print(data_0.duplicated().sum())
print(data_1.duplicated().sum())
print(data_2.duplicated().sum())

0
0
0


In [14]:
# Define variables
datasets = [data_0, data_1, data_2]

# Define constants
BUDGET = 100_000_000  # Budget for development of 200 oil wells is 100 USD million
REVENUE_PER_UNIT = 4_500  # One barrel of raw materials brings 4.5 USD of revenue
WELLS = 200  # The number of wells to choose

In [15]:
# Defina a function to calculate profit
# Define the function to calculate profit
def calculate_profit(target, predictions, count):
    probs_sorted = predictions.sort_values(ascending=False)
    selected = target[probs_sorted.index][:count]
    return REVENUE_PER_UNIT * selected.sum() - BUDGET

### Train and Test the Model

#### Split the data into a training set and a validation set

In [16]:
# train_0, valid_0 = train_test_split(data_0, test_size=0.25, random_state=12345)
# train_1, valid_1 = train_test_split(data_1, test_size=0.25, random_state=12345)
# train_2, valid_2 = train_test_split(data_2, test_size=0.25, random_state=12345)

In [23]:
# Process each dataset
for i, data in enumerate(datasets):
    # Drop the 'id' column as it messes with the model
    data = data.drop(['id'], axis=1)

    # Split the data into a training set and validation set
    train, valid = train_test_split(data, test_size=0.25, random_state=12345)

    # Reset the indices of the validation set
    valid = valid.reset_index(drop=True)

    # Train the model and make predictions for the validation set
    model = LinearRegression()
    model.fit(train.drop(['product'], axis=1), train['product'])
    predictions = model.predict(valid.drop(['product'], axis=1))

    # Convert predictions to a pandas Series
    predictions = pd.Series(predictions, index=valid.index)

    # Calculate the RMSE and the average volume of predicted reserves
    rmse = np.sqrt(mean_squared_error(valid['product'], predictions))
    average_volume = predictions.mean()

    print("Region: ", i)
    print("Average volume of predicted reserves: ", average_volume)
    print("RMSE: ", rmse)

    # Calculate the profit for the region
    profit = calculate_profit(valid['product'], pd.Series(predictions), WELLS)

    # Print the profit for the region
    print(f"Profit for region {i}: {profit}")

    # Bootstrapping
    state = np.random.RandomState(12345)
    values = []
    for i in range(1000):
        target_subsample = valid['product'].sample(n=500, replace=True, random_state=state)
        probs_subsample = predictions[target_subsample.index]
        values.append(calculate_profit(target_subsample, probs_subsample, WELLS))

    values = pd.Series(values)
    lower = values.quantile(0.025)
    mean = values.mean()
    upper = values.quantile(0.975)
    risk_of_loss = (values < 0).mean()

    print("Average profit: ", mean)
    print("2.5% quantile: ", lower)
    print("97.5% quantile: ", upper)
    print("Risk of loss: ", risk_of_loss)
    print('-------------------------')

Region:  0
Average volume of predicted reserves:  92.59256778438035
RMSE:  37.5794217150813
Profit for region 0: 33208260.43139851
Average profit:  4259385.269105923
2.5% quantile:  -1020900.9483793724
97.5% quantile:  9479763.533583675
Risk of loss:  0.06
-------------------------
Region:  1
Average volume of predicted reserves:  68.728546895446
RMSE:  0.8930992867756167
Profit for region 1: 24150866.966815114
Average profit:  5152227.734432898
2.5% quantile:  688732.2537050088
97.5% quantile:  9315475.912570495
Risk of loss:  0.01
-------------------------
Region:  2
Average volume of predicted reserves:  94.96504596800489
RMSE:  40.02970873393434
Profit for region 2: 27103499.635998324
Average profit:  4350083.627827557
2.5% quantile:  -1288805.473297878
97.5% quantile:  9697069.541802654
Risk of loss:  0.064
-------------------------
