In [1]:
import pandas as pd

df = pd.read_csv('_train.csv', sep=";")
df.fillna(-999.25, inplace=True)

In [2]:
#Renaming the lithology and confidence to something more understandable
train = df.rename(columns={'FORCE_2020_LITHOFACIES_LITHOLOGY':'FACIES_LITHOLOGY','FORCE_2020_LITHOFACIES_CONFIDENCE':'FACIES_CONFIDENCE'})

#Checking if column name successfully changed
print(train.columns)

#Checking how many unique target value 
print("\nNumber of Lithology: " + str(len(train.FACIES_LITHOLOGY.unique())))

print("\nNumber of Wells: "+ str(len(train.WELL.unique())))

print("\nNumber of Features: {}".format(len(train.columns)))

for column in train.columns:
  nullvalue=train[column].isna().sum()
  print("Nullvalue in {} : {}".format(column, nullvalue),)



Index(['WELL', 'DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'GROUP', 'FORMATION',
       'CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'SGR', 'NPHI', 'PEF',
       'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DCAL', 'DRHO', 'MUDWEIGHT', 'RMIC',
       'ROPA', 'RXO', 'FACIES_LITHOLOGY', 'FACIES_CONFIDENCE'],
      dtype='object')

Number of Lithology: 12

Number of Wells: 98

Number of Features: 29
Nullvalue in WELL : 0
Nullvalue in DEPTH_MD : 0
Nullvalue in X_LOC : 0
Nullvalue in Y_LOC : 0
Nullvalue in Z_LOC : 0
Nullvalue in GROUP : 0
Nullvalue in FORMATION : 0
Nullvalue in CALI : 0
Nullvalue in RSHA : 0
Nullvalue in RMED : 0
Nullvalue in RDEP : 0
Nullvalue in RHOB : 0
Nullvalue in GR : 0
Nullvalue in SGR : 0
Nullvalue in NPHI : 0
Nullvalue in PEF : 0
Nullvalue in DTC : 0
Nullvalue in SP : 0
Nullvalue in BS : 0
Nullvalue in ROP : 0
Nullvalue in DTS : 0
Nullvalue in DCAL : 0
Nullvalue in DRHO : 0
Nullvalue in MUDWEIGHT : 0
Nullvalue in RMIC : 0
Nullvalue in ROPA : 0
Nullvalue in RXO : 0
Nullvalue in FAC

In [None]:
!pip install cegal-welltools

In [None]:
# The plot from cegaltools to show data coverage is definitely useful in terms of thinking how to deal with missing log values. 
# We need to investigate and deal with missing values before proceeding with Facies prediction
from cegal.welltools.plotting import CegalWellPlotter as cwp

In [None]:
cwp.plot_coverage(df=train)

In [None]:
# Creating well plot given input. This was one using cegaltools library
def show_well_curve(well_name):

  #well = int(position)
    well_name=well_name
    df = train.loc[train.WELL == well_name].set_index('DEPTH_MD')
    cwp.plot_logs(df=df, 
              logs=['GROUP','FORMATION', 'RHOB', 'GR', 'NPHI', 'DTC', 'DTS'], 
              log_scale_logs=['RMED', 'RDEP'],
              lithology_logs='FACIES_LITHOLOGY', 
              lithology_proba_logs='FACIES_CONFIDENCE')
    

In [None]:
import ipywidgets as widgets
widgets.interact(show_well_curve, well_name=train.WELL.unique())

In [3]:
lithology_numbers = {30000: 0,
                 65030: 1,
                 65000: 2,
                 80000: 3,
                 74000: 4,
                 70000: 5,
                 70032: 6,
                 88000: 7,
                 86000: 8,
                 99000: 9,
                 90000: 10,
                 93000: 11}

train['FACIES_LITHOLOGY']=train['FACIES_LITHOLOGY'].map(lithology_numbers)

In [4]:
formation = {-999.25:0, 'Utsira Fm.':1, 'Balder Fm.':2, 'Sele Fm.':3, 'Lista Fm.':4, 'Heimdal Fm.':5,
 'Tor Fm.':6, 'Hod Fm.':7, 'Blodoeks Fm.':8, 'Svarte Fm.':9, 'Roedby Fm.':10, 'Sola Fm.':11,
 'Aasgard Fm.':12, 'Draupne Fm.':13, 'Heather Fm.':14, 'Hugin Fm.':15, 'Smith Bank Fm.':16,
 'Frigg Fm.':17, 'Skagerrak Fm.':18, 'Ekofisk Fm.':19, 'Kupferschiefer Fm.':20,
 'Skade Fm.':21, 'Grid Fm.':22, 'Vaale Fm.':23, 'Sleipner Fm.':24, 'Hidra Fm.':25, 'Tuxen Fm.':26,
 'Mandal Fm.':27, 'Ula Fm.':28, 'Bryne Fm.':29, 'Tau Fm.':30, 'Sandnes Fm.':31,
 'Intra Draupne Fm. Sst.':32, 'Statfjord Fm.':33, 'Skade Mb.':34, 'BASEMENT':35,
 'Ran Sst Mb.':36, 'Flekkefjord Fm.':37, 'Sauda Fm.':38, 'Egersund Fm.':39,
 'Intra Balder Fm. Sst.':40, 'Hermod Mb.':41, 'Ty Fm.':42, 'Hardraade Fm.':43, 'Kyrre Fm.':44,
 'Tryggvason Fm.':45, 'Drake Fm.':46, 'Cook Fm.':47, 'Amundsen Fm.':48, 'Grid Mb.':49,
 'Ty Mb.':50, 'Jorsalfare Fm.':51, 'Burton Fm.':52, 'Mime Fm.':53,
 'Intra Heather Fm. Sst.':54, 'Tarbert Fm.':55, 'Ness Fm.':56, 'Etive Fm.':57,
 'Rannoch Fm.':58, 'Broom Fm.':59, 'Lunde Fm.':60, 'Oseberg Fm.':61, 'Sognefjord Fm.':62,
 'Fensfjord Fm.':63, 'Krossfjord Fm.':64, 'Johansen Fm.':65, 'Eiriksson Mb.':66,
 'Raude Mb.':67, 'Agat Fm.':68, 'Farsund Fm.':69}

train['FORMATION']=train['FORMATION'].map(formation)

In [6]:
Group = {'NORDLAND GP.':0, 'HORDALAND GP.':1, 'ROGALAND GP.':2, 'SHETLAND GP.':3,
       'CROMER KNOLL GP.':4, 'VIKING GP.':5, 'VESTLAND GP.':6, 'ZECHSTEIN GP.':7,
       'HEGRE GP.':8, 'ROTLIEGENDES GP.':9, 'TYNE GP.':10, 'BOKNFJORD GP.':11,
       'DUNLIN GP.':12, 'BAAT GP.':13, -999.25:14}

train['GROUP']=train['GROUP'].map(Group)

train=train[train.GROUP != 14]

In [8]:
train.GROUP.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
      dtype=int64)

In [9]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# Load data
#df_train = train

# Data preprocessing
X = train.drop(['DEPTH_MD', 
                           'X_LOC', 
                           'Y_LOC',
                           'Z_LOC',
                           'GROUP',
                           'FORMATION',
                           'SGR',
                           'BS',
                           'ROP',
                           'DTS',
                           'DCAL',
                           'MUDWEIGHT','RMIC','ROPA','RXO'],axis=1)
y = train['GROUP']
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to torch tensors
# X_train = torch.FloatTensor(X_train)
# X_test = torch.FloatTensor(X_test)
# y_train = torch.LongTensor(y_train.values)
# y_test = torch.LongTensor(y_test.values)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


print('Preparing Model...........')
rfcl = RandomForestClassifier(n_estimators = 300, random_state=1,max_features=12, max_depth=7, min_samples_leaf = 15)
rfcl = rfcl.fit(X_train, y_train)

from sklearn import metrics
print('Predicting training dataset..............')
y_predict = rfcl.predict(X_test)


num_folds = 10
seed = 77


print('KFold Cross Validation in Progress........')
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)


results = cross_val_score(rfcl,X, y, cv=kfold)

average_accuracy = np.mean(abs(results))

print(rfcl.score(X_test, y_test))
print(classification_report(y_test, y_predict))

print('Accuracy after {} folds is: {}'.format(num_folds, average_accuracy))

%%timeit

Preparing Model...........
Predicting training dataset..............
KFold Cross Validation in Progress........
0.6675817949342946
              precision    recall  f1-score   support

           0       0.92      0.47      0.62     22071
           1       0.67      0.97      0.79     58596
           2       0.76      0.33      0.46     26567
           3       0.63      0.95      0.76     46861
           4       0.94      0.09      0.16     10490
           5       0.69      0.55      0.61     26418
           6       1.00      0.08      0.16      5270
           7       1.00      0.77      0.87      2412
           8       0.00      0.00      0.00      2777
           9       0.00      0.00      0.00       559
          10       0.00      0.00      0.00       239
          11       0.00      0.00      0.00       591
          12       0.56      0.77      0.65     23822
          13       1.00      0.00      0.00      7174

    accuracy                           0.67    233847
   

  _warn_prf(average, modifier, msg_start, len(result))
UsageError: Line magic function `%%timeit` not found.


In [12]:


import joblib
joblib.dump(rfcl, "random_forestModelGROUP_1.joblib")

['random_forestModelGROUP_1.joblib']

In [None]:
print (pd.DataFrame(dTree.feature_importances_, columns = ["Imp"], index = X_train.columns))
print(X_train.shape)
print(y_train.shape)

In [None]:
cm

In [None]:
# Convert to torch tensors
X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)
y_train = torch.LongTensor(y_train.values)
y_test = torch.LongTensor(y_test.values)

# Create dataset
class ChurnDataset(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
    def __len__(self):
        return len(self.X_data)

# Create dataloader
train_data = ChurnDataset(X_train, y_train)
test_data = ChurnDataset(X_test, y_test)
train_loader = DataLoader(dataset=train_data, batch_size=10, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=10, shuffle=False)


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
sequence_length = 28
input_size = 23
hidden_size = 128
num_layers = 2
num_classes = 10
batch_size = 1000
num_epochs = 2
learning_rate = 0.01
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        # Set initial hidden and cell states 
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)

In [None]:
print(model)

In [None]:



# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

# Test the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) 

# Save the model checkpoint
torch.save(model.state_dict(), 'model.ckpt')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
dTree = DecisionTreeClassifier(criterion = 'entropy', max_depth=12)
dTree.fit(X_train, y_train)
dTree.fit(X_test, y_test)

pred_train = dTree.predict(X_train)
pred_test = dTree.predict(X_test)
print(dTree.score(X_train, y_train))
print(dTree.score(X_test, y_test))
#print(pd.DataFrame(dTree.feature_importances_, columns = ["Imp"], 
#                   index = X_train.columns))#Print the feature importance of the decision model.Putting this in comment form 

print(classification_report(y_train, pred_train))
print(classification_report(y_test, pred_test))

In [None]:

from sklearn.pipeline import Pipeline 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso 

models =[ KNeighborsRegressor(),
          AdaBoostRegressor(),
          GradientBoostingRegressor(),
          RandomForestRegressor(),
          LinearRegression(),
          Lasso(),
          DecisionTreeRegressor(),
          Ridge()]
        
table = pd.DataFrame(columns = ['Model Name', 'Accuracy Score'])
for classifier in models:
    pipe = Pipeline(steps =[('models',classifier)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    score = pipe.score(X_test, y_test)
    result = ({'Model Name':classifier,
               'Accuracy Score': score})
    table = table.append(result, ignore_index=True)
    


In [None]:
knr = KNeighborsRegressor()
knr_param = {"n_neighbors":[3,5,7],
             "weights":['uniform','distance'],
             "algorithm":['auto','kd_tree','brute'],
            }

gdb = GradientBoostingRegressor()
gdb_param = {'learning_rate':[0.1,0.5,1],
             'n_estimators':[100,200,400],
             'min_samples_leaf': [1, 2, 4],
             'min_samples_split': [2, 5, 10],
             'max_features': ['auto','sqrt']}

rfr = RandomForestRegressor()
rfr_param = {'bootstrap': [True, False],
             'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
             'max_features': ['auto', 'sqrt'],
             'min_samples_leaf': [1, 2, 4],
             'min_samples_split': [2, 5, 10],
             'n_estimators': [200, 400, 600]}

ls  = Lasso()
ls_param = {'alpha':[0.1,0.2,0.5,0.8,1.0]}

drt = DecisionTreeRegressor()
drt_param = {'criterion':['mse','friedman_mse','mae'],
             'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
             'max_features': ['auto', 'sqrt'],
             'min_samples_leaf': [1, 2, 4],
             'min_samples_split': [2, 5, 10]}

ridge = Ridge()
ridge_param = {'alpha':[0.1,0.2,0.5,0.8,1.0],
               'solver':['auto','svd','lsqr']}

bgr = BaggingRegressor()
bgr_param = {'base_estimator': [LinearRegression(),Lasso(),RandomForestRegressor(),Ridge(),DecisionTreeRegressor()],
             #'max_features':[1,2,5,7,10],
             'n_estimators':[10,20,50,100]}

In [None]:
paramtable = pd.DataFrame(columns = ['model', 'parameters', 'acc score'])
def gridsearch(cv_model, parameters):
    rsmodel = GridSearchCV (estimator = cv_model, 
                            param_grid = parameters, 
                            cv = 5, 
                            verbose=2, 
                            n_jobs = -1)
    
    rsmodel.fit(X_train, y_train)
    
    result = ({'model': cv_model,
               'parameters': rsmodel.best_params_,
               'acc score': rsmodel.best_score_})
    
    global paramtable
    paramtable = paramtable.append(result, ignore_index=True)
    return paramtable
    

In [None]:
gridsearch(rfr,rfr_param)