In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import pickle

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from the libraries imported above.
# Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [5]:
# Load the training CSV from the Kaggle competition input directory.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/eestech-challenge-2022/Train-dataset.csv',
)

In [6]:
# DataFrame.info() prints its summary and returns None, so it is called directly;
# wrapping it in display() would render an extra "None" below the summary.
df.info()
# First and last five rows of the training frame.
display(df.head(5))
display(df.tail(5))

In [7]:
# Show the training frame as the cell output.
df

In [8]:
# Distinct values of the WELL column; indexed by position in the plotting cell.
wells = df.loc[:, 'WELL'].unique()
print(wells)

In [9]:
# See how the well-logging curves look against the depth parameter (MD) for one well.
well_data = df[df['WELL'] == wells[8]]

# Columns that are not plotted as curves (identifier, depth axis, labels, coordinates).
non_curve_cols = {'WELL', 'MD', 'DEPOSITIONAL_ENVIRONMENT', 'LITH_NAME', 'LITH_CODE', 'X', 'Y'}
# Keep the frame's own column order: iterating over a set difference yields an
# arbitrary order, so the panel layout could change from run to run.
curve_cols = [col for col in well_data.columns if col not in non_curve_cols]

# One panel per curve. The count comes from the curve list itself rather than a
# hard-coded "-7", and squeeze=False keeps `axs` indexable with a single curve.
fig, axs = plt.subplots(1, len(curve_cols), figsize=(10, 8), sharey=True, squeeze=False)
axs = axs[0]
for ax, col in zip(axs, curve_cols):
    ax.plot(well_data[col], well_data['MD'])
    ax.set_xlabel(col)

# Invert the shared y-axis so larger MD values are drawn lower. min/max are used
# so the limits do not depend on the rows being sorted by MD.
axs[0].set_ylim(well_data['MD'].max(), well_data['MD'].min())

In [10]:
# Number of rows per LITH_NAME value, most frequent first.
df['LITH_NAME'].value_counts()

In [11]:
# Plot colour (hex string) for every lithology code.
lithology_color = dict((
    (100, '#997950'),
    (200, '#cccccc'),
    (300, '#ffff00'),
    (400, '#ffccff'),
    (500, '#006600'),
    (600, '#666600'),
    (700, '#F8DE7E'),
    (800, '#999DA0'),
    (900, '#9933ff'),
    (1000, '#ff6600'),
    (1100, '#000000'),
    (1200, '#98FB98'),
    (1300, '#ff9999'),
    (1400, '#CEB180'),
    (1500, '#3BB143'),
))

# Human-readable name for every lithology code (same keys as lithology_color).
lithology_key = dict((
    (100, 'Clay'),
    (200, 'Siltstone/Loess'),
    (300, 'Marl'),
    (400, 'Clay marl'),
    (500, 'Clay sandstone'),
    (600, 'Sandstone'),
    (700, 'Limestone'),
    (800, 'Tight'),
    (900, 'Dolomite'),
    (1000, 'Coal'),
    (1100, 'Coal clay'),
    (1200, 'Marly sandstone'),
    (1300, 'Sandy marl'),
    (1400, 'Marl clay'),
    (1500, 'Siltstone clay'),
))

In [12]:
def autolabel(ax, bars, labels):
    """Write each label centred just above the top of its bar.

    Bars and labels are paired positionally; pairing stops at the shorter of
    the two sequences. Each label is drawn via ``ax.annotate`` with a 3-point
    vertical offset from the bar top.
    """
    text_style = dict(
        xytext=(0, 3),  # offset of 3 points straight up from the anchor
        textcoords="offset points",
        ha='center',
        va='bottom',
    )
    for rect, caption in zip(bars, labels):
        top_centre = (rect.get_x() + rect.get_width() / 2, rect.get_height())
        ax.annotate(f'{caption}', xy=top_centre, **text_style)

In [13]:
# Bar chart of how often each lithology code occurs, as a percentage of all rows,
# with the absolute row count written above every bar.
counts = df['LITH_CODE'].value_counts()
N = df['LITH_CODE'].shape[0]

names = []
percentage = []
numbers = []
colors = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for code, count in counts.items():
    names.append(lithology_key[code])
    percentage.append(float(count) / N * 100)
    numbers.append(int(count))
    colors.append(lithology_color[code])

fig, ax = plt.subplots(1, 1, figsize=(12, 8))
bar = ax.bar(x=np.arange(len(names)), height=percentage, color=colors)
# Fix the tick positions first, then label them, so every label is attached to
# a known tick rather than to whatever ticks the default locator picked.
ax.set_xticks(np.arange(len(names)))
ax.set_xticklabels(names, rotation=45)
ax.set_ylabel('Lithology presence (%)')

autolabel(ax, bar, numbers)

In [14]:
# Integer code for each depositional-environment label.
MAPPING = dict(Continental=1, Transitional=2, Marine=3)

# Encode the text column as a numeric feature. Dictionary indexing is used on
# purpose: a label missing from MAPPING raises KeyError instead of becoming NaN.
df['D_Env'] = df['DEPOSITIONAL_ENVIRONMENT'].apply(MAPPING.__getitem__)
df.info()

In [15]:
# Model inputs: four log columns, the depth column and the encoded environment.
Feature = df.loc[:, ['MD', 'GR', 'RT', 'DEN', 'CN', 'D_Env']]

In [16]:
# Standardise every feature column (StandardScaler) and keep the result as a
# plain array.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so hold-out rows contribute to the scaling statistics.
X = preprocessing.StandardScaler().fit_transform(Feature)

In [17]:
# Classification target: the lithology code of every row.
y = df.loc[:, 'LITH_CODE']


In [18]:
# Hold out 20 % of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

# Report the shapes of both partitions.
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

In [19]:
def show_conf_matrix(y_test, y_pred, classes, labels=None):
    """Plot a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    y_test : array-like
        True class labels; they index the rows ("True value" axis).
    y_pred : array-like
        Predicted class labels; they index the columns ("Predicted value" axis).
    classes : sequence of str
        Tick labels for both axes, in the same order as the matrix classes.
    labels : array-like, optional
        Class values passed to ``confusion_matrix(labels=...)`` to fix which
        classes appear and in what order. Pass the codes that ``classes``
        describes so tick labels and matrix rows cannot get out of step, for
        example when a class is predicted but absent from ``y_test``. With the
        default ``None`` scikit-learn uses the sorted classes found in either
        ``y_test`` or ``y_pred``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Rows are true labels, columns are predictions (scikit-learn convention).
    conf = confusion_matrix(y_test, y_pred, labels=labels)

    plt.figure(figsize=(12, 12))
    # NOTE: sns.set changes seaborn's global theme, not just this figure.
    sns.set(font_scale=1)
    sns.heatmap(conf, annot=True, annot_kws={"size": 16}, fmt="d", linewidths=.5,
                cmap="YlGnBu", xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted value')
    plt.ylabel('True value')

    plt.show()

In [20]:
# Sorted lithology codes present in the hold-out target, and their display names.
labels = np.sort(y_test.unique())
target_lithologys = [lithology_key[l_code] for l_code in labels]

print(labels)
print(target_lithologys)

In [21]:
# Fit one KNN classifier per candidate K (1 .. Ks-1) and record its hold-out
# accuracy together with the standard error of that accuracy.

Ks = 10
mean_acc = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)

for slot, n in enumerate(range(1, Ks)):
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train, y_train)
    yhat = neigh.predict(X_test)

    mean_acc[slot] = metrics.accuracy_score(y_test, yhat)
    # Std of the per-sample hit/miss indicator divided by sqrt(sample count).
    std_acc[slot] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])

In [22]:
# Accuracy versus K, with shaded bands at one and three standard errors.
k_axis = range(1, Ks)
plt.plot(k_axis, mean_acc, 'r')
plt.fill_between(k_axis, mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10)
plt.fill_between(k_axis, mean_acc - 3 * std_acc, mean_acc + 3 * std_acc, alpha=0.10, color='r')
plt.legend(('Accuracy ', '+/- 1xstd', '+/- 3xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()


# argmax is zero-based while K starts at 1, hence the +1.
print("The highest accuracy", round(mean_acc.max(), 2), "was obtained with k=", mean_acc.argmax() + 1, ".")

In [23]:
# Final classifier with a fixed K of 3, trained on the training partition.
model = KNeighborsClassifier(n_neighbors = 3).fit(X_train,y_train)

y_predict = model.predict(X_test)

# Micro-averaged F1 is used as the reported score for both partitions.
print("Train set accuracy: ", round(metrics.f1_score(y_train, model.predict(X_train), average = 'micro'), 2))
print("Test set accuracy: ", round(metrics.f1_score(y_test, y_predict, average = 'micro'), 2))

# True labels go first and predictions second, matching the
# show_conf_matrix(y_test, y_pred, classes) signature. The previous call had
# them swapped, which transposed the matrix relative to its axis titles.
show_conf_matrix(y_test, y_predict, target_lithologys)

print('==================================================================================')
print('Test set classification report')
print('----------------------------------------------------------------------------------')
print(classification_report(y_test, y_predict, labels=labels, target_names=target_lithologys, zero_division=0))
print('==================================================================================')

In [24]:
# Save the model to file in the current working directory

# Pkl_Filename = "KNN_Model.pkl"  
# # Pkl_Filename = '/kaggle/output/working/KNN_Model.pkl' 


# with open(Pkl_Filename, 'wb') as file:  
#     pickle.dump(model, file)

# Validate your model

In [25]:
# Load the competition test CSV that the submission is built from.
test_dataset = pd.read_csv('/kaggle/input/eestech-challenge-2022/Test-dataset.csv')

# DataFrame.info() prints its summary and returns None, so it is called directly;
# wrapping it in display() would render an extra "None" below the summary.
test_dataset.info()
display(test_dataset.head(5))
display(test_dataset.tail(5))

In [26]:
# Encode the environment label with the same MAPPING used for the training frame;
# a label missing from MAPPING raises KeyError.
test_dataset['D_Env'] = test_dataset['DEPOSITIONAL_ENVIRONMENT'].apply(MAPPING.__getitem__)

In [27]:
# Show the test frame, now including the D_Env column, as the cell output.
test_dataset

In [28]:
# Same feature columns, in the same order, as the training `Feature` frame.
test_feature = test_dataset.loc[:, ['MD', 'GR', 'RT', 'DEN', 'CN', 'D_Env']]

In [29]:
# Testing feature
# NOTE: this rebinds X_test, which until now held the hold-out split of the
# training data; y_test is left unchanged.
X_test = test_feature

# Normalizing Test Data
# Fit the scaler on the training features (`Feature`, the same frame that was
# used to scale X) and only *transform* the test rows. Fitting a fresh scaler on
# the test set would standardise it with different means and variances than the
# data the KNN model was trained on, distorting every neighbour distance.
X_test = preprocessing.StandardScaler().fit(Feature).transform(X_test)

In [30]:
# y_test is still the hold-out target from the train/test split, so this
# rebuilds the same sorted code list and display names as before model fitting.
labels = np.sort(y_test.unique())
target_lithologys = list(map(lithology_key.__getitem__, labels))

print(labels)
print(target_lithologys)

In [31]:
# Pkl_Filename = "KNN_Model.pkl"

# with open(Pkl_Filename, 'rb') as file:  
#     model = pickle.load(file)

In [32]:
# Predict a lithology code for every row of the scaled competition test features.
test_yhat = model.predict(X_test)

In [33]:
# Show the predicted codes as the cell output.
test_yhat

In [34]:
# Attach the predictions to the test frame (row order matches test_feature).
test_dataset['LITH_CODE'] = test_yhat

In [35]:
# Show the test frame with the new LITH_CODE column as the cell output.
test_dataset

In [36]:
# Keep only the row identifier and the predicted lithology code.
submission = test_dataset.loc[:, ['Id', 'LITH_CODE']]

In [37]:
# Write the submission to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)