In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv(os.path.join('data.csv'))
data.head()

Unnamed: 0,State,Well Name,Depth (m),Porosity (%),Permeability_h (md),Permeability_v (md),Lithology (major),Well Classification,Longitude,Latitude,Onshore/Offshore,Decision
0,New South Wales,BMR Wollongong 1,18.7,4.7,0.1,0.1,shale,Stratigraphic,150.6,34.9667,Onshore,Non-Acceptable
1,New South Wales,BMR Wollongong 1,23.1,7.4,0.1,0.1,sandstone,Stratigraphic,150.6,34.9667,Onshore,Non-Acceptable
2,New South Wales,BMR Wollongong 1,27.0,9.9,0.2,0.2,sandstone,Stratigraphic,150.6,34.9667,Onshore,Non-Acceptable
3,New South Wales,BMR Wollongong 1,45.8,4.0,0.1,0.1,siltstone,Stratigraphic,150.6,34.9667,Onshore,Non-Acceptable
4,New South Wales,BMR Wollongong 1,70.8,3.1,0.1,0.1,siltstone,Stratigraphic,150.6,34.9667,Onshore,Non-Acceptable


In [4]:
data.describe(include='all')

Unnamed: 0,State,Well Name,Depth (m),Porosity (%),Permeability_h (md),Permeability_v (md),Lithology (major),Well Classification,Longitude,Latitude,Onshore/Offshore,Decision
count,714,714,714.0,714.0,714.0,714.0,714,714,714.0,714.0,714,714
unique,8,24,,,,,11,5,,,2,2
top,Northern Territory,East Mereenie 1,,,,,sandstone,Exploration,,,Onshore,Non-Acceptable
freq,144,100,,,,,543,411,,,520,634
mean,,,1753.248739,13.464426,158.193137,93.267927,,,140.601683,28.336971,,
std,,,926.32151,7.680738,575.452352,441.657868,,,9.88136,8.065947,,
min,,,18.7,0.0,0.0,0.0,,,115.3939,7.3811,,
25%,,,976.05,7.5,0.0,0.0,,,133.9667,24.0086,,
50%,,,1713.05,11.65,0.85,0.1,,,142.9883,27.4267,,
75%,,,2666.45,19.1,23.5,3.0,,,148.0969,38.2008,,


In [5]:
target = data['Decision']
target_names = ['Acceptable', 'Non-Acceptable']

In [6]:
data = data.drop(columns=['State', 'Well Name', 'Depth (m)', 'Permeability_v (md)', 'Lithology (major)',
                          'Well Classification', 'Longitude', 'Latitude', 'Onshore/Offshore'])
data.head()

Unnamed: 0,Porosity (%),Permeability_h (md),Decision
0,4.7,0.1,Non-Acceptable
1,7.4,0.1,Non-Acceptable
2,9.9,0.2,Non-Acceptable
3,4.0,0.1,Non-Acceptable
4,3.1,0.1,Non-Acceptable


In [7]:
data = data.drop('Decision', axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,Porosity (%),Permeability_h (md)
0,4.7,0.1
1,7.4,0.1
2,9.9,0.2
3,4.0,0.1
4,3.1,0.1


In [8]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [9]:
train_scores = []
test_scores = []
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
train_score = knn.score(X_train, y_train)
test_score = knn.score(X_test, y_test)
train_scores.append(train_score)
test_scores.append(test_score)
print(f'k: {knn}, Train/Test Score: {train_score:.3f}/{test_score:.3f}')

k: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=10, p=2,
           weights='uniform'), Train/Test Score: 0.951/0.939


In [10]:
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [12]:
new_data = np.array([[0, 0], [10, 100], [50, 500]])

In [13]:
print(f'Training Data Score: {classifier.score(X_train, y_train)}')
print(f'Testing Data Score: {classifier.score(X_test, y_test)}')

Training Data Score: 0.930841121495327
Testing Data Score: 0.8994413407821229


In [14]:
predictions = classifier.predict(new_data)
print('Classes are either 0 (Acceptable) or 1 (Non-Acceptable)')
print(f'The new points were classified as: {predictions}')

Classes are either 0 (Acceptable) or 1 (Non-Acceptable)
The new points were classified as: ['Non-Acceptable' 'Non-Acceptable' 'Acceptable']


In [18]:
predictions = classifier.predict(X_test)
predictions = pd.DataFrame({'Prediction': predictions, 'Actual': y_test})
predictions.head()

Unnamed: 0,Prediction,Actual
120,Non-Acceptable,Non-Acceptable
329,Non-Acceptable,Non-Acceptable
39,Non-Acceptable,Acceptable
294,Non-Acceptable,Non-Acceptable
654,Non-Acceptable,Acceptable
