In [None]:
pip install Wand

In [None]:
pip install mlxtend

In [None]:
# Loading helpful packages
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from mlxtend.plotting import plot_learning_curves
from sklearn import preprocessing
from sklearn import tree
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, roc_curve, auc
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as dt
from tabulate import tabulate
from xgboost import XGBClassifier as xgb
from xgboost import plot_tree
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting/ML
# libraries; consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")

In [None]:
# Read both source spreadsheets, using the 'Case no.' column as the row index.
extraction_path = "/kaggle/input/extraction-non-extraction/final_data_extraction.xlsx"
non_extraction_path = "/kaggle/input/extraction-non-extraction/final_data_non_extraction.xlsx"

# df_1: extraction cases, df_2: non-extraction cases
df_1 = pd.read_excel(extraction_path, index_col='Case no.')
df_2 = pd.read_excel(non_extraction_path, index_col='Case no.')

In [None]:
# Column dtypes and non-null counts of the extraction frame as loaded.
caption = "Information on dataframe containing extraction data : "
print(caption)
df_1.info()

In [None]:
def _parse_measurement(value):
    """Convert one 'E-line to lower lip' cell to a float when it is text.

    String cells are parsed after removing every space character, so values
    typed with stray spaces inside the number still convert. Non-string cells
    are returned unchanged. A string that is still not numeric after removing
    spaces raises ValueError instead of being swallowed by a bare ``except``.
    """
    if isinstance(value, str):
        return float(value.replace(" ", ""))
    return value


# Rebuild the column from the cleaned values (assigned positionally, as a list).
elll = [_parse_measurement(cell) for cell in df_1['E-line to lower lip'].tolist()]
df_1['E-line to lower lip'] = elll

In [None]:
# Re-inspect the extraction frame to confirm the cleaned column's dtype.
caption = "Information on dataframe containing extraction data after cleaning \'E-line to lower lip\' column : "
print(caption)
df_1.info()

In [None]:
# Column dtypes and non-null counts of the non-extraction frame.
caption = "Information on dataframe containing non-extraction data : "
print(caption)
df_2.info()

In [None]:
# Give both frames identical ASCII column labels so they can be stacked.
# The non-extraction sheet additionally carries a misspelt 'Nasiomental angle'.
common_renames = {
    'β angle': 'beta angle',
    'UI-NA in °': 'UI-NA in deg',
    'LI-NB in °': 'LI-NB in deg',
    'E-line to lower lip': 'E-Line to lower lip',
}
df_1 = df_1.rename(columns=common_renames)
df_2 = df_2.rename(columns={**common_renames, 'Nasiomental angle': 'Nasomental angle'})

In [None]:
# Attach the target label to each frame: 1 = extraction, 0 = non-extraction.
for frame, label in ((df_1, 1), (df_2, 0)):
    frame['Category'] = label

In [None]:
# Stack the extraction and non-extraction frames, then shuffle the rows.
# The previous version called sample(...).reset_index(..., inplace=True) on a
# temporary copy and threw the result away, so df_merged was never shuffled.
# The result is now assigned back, with a fixed seed so a re-run reproduces the
# same order. reset_index(drop=True) discards the 'Case no.' index, as the
# original statement intended.
df_merged = (
    pd.concat([df_1, df_2], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Structure and summary statistics of the merged frame before null rows are removed.
info_caption = "Information on the merged dataframe : "
describe_caption = "Description of the merged dataframe : "

print(info_caption)
df_merged.info()
print(describe_caption)
df_merged.describe()

In [None]:
# Keep only the rows that have a value for 'E-Line to lower lip'.
df_merged = df_merged.dropna(subset=['E-Line to lower lip'])

In [None]:
# Same overview as before, now reflecting the frame after the null-row filter.
info_caption = "Information on the merged dataframe : "
describe_caption = "Description of the merged dataframe : "

print(info_caption)
df_merged.info()
print(describe_caption)
df_merged.describe()

In [None]:
# Preview the first 50 rows of the merged frame.
print("head(50) of df_merged dataframe : ")
df_merged.iloc[:50]

In [None]:
# Separate the predictors (X) from the target label (Y).
X = df_merged.drop(columns='Category')
Y = df_merged['Category']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [None]:
# Put the training target back next to the training features so the
# correlation matrix is computed on training rows only.
train_data_corr = pd.concat((X_train, Y_train), axis="columns")

In [None]:
# Font specifications shared by the figures: font1 for titles, font2 for axis labels.
font1 = dict(family='serif', color='darkred', weight='bold', size=18)

font2 = dict(family='serif', color='darkolivegreen', weight='bold', size=12)

In [None]:
# Create the absolute correlation matrix from the numeric training columns.
# Non-numeric columns (e.g. the patient 'Name') are excluded explicitly:
# pandas >= 2.0 no longer drops them silently and raises inside .corr().
corr_matrix = train_data_corr.select_dtypes(include='number').corr().abs()
print("Correlation matrix")

# Plotting the correlation heatmap
fig, ax = plt.subplots(figsize=(12,12))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', linewidths=1.0, xticklabels=corr_matrix.columns, yticklabels=corr_matrix.columns, ax=ax)

# Style the axes only after drawing: seaborn's heatmap sets the axis labels
# itself, so labels assigned before the call were overwritten.
ax.tick_params(colors='darkblue')
ax.xaxis.set_ticks_position('top')
ax.xaxis.set_label_position('top')
ax.set_title("Correlation Matrix", fontdict = font1, y = -0.1)
ax.set_xlabel('Features', fontdict = font2)
ax.set_ylabel('Features', fontdict = font2)
fig.savefig("Correlation_matrix", facecolor = "white", transparent=False)

In [None]:
# Select the upper triangle of the correlation matrix (diagonal excluded) so
# each feature pair is examined once. The builtin bool is used because the
# np.bool alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Find features with correlation greater than 0.5
drop_columns = [column for column in upper.columns if (upper[column] > 0.5).any()]

# 'Name' and "Bolton's" are always removed; the membership check avoids
# listing a column twice, which would produce duplicate positions when the
# list is later turned into input-field indices.
for always_dropped in ('Name', "Bolton's"):
    if always_dropped not in drop_columns:
        drop_columns.append(always_dropped)

# Removing 'Category' from drop_columns since it's already removed from X_train and X_test
if 'Category' in drop_columns:
    drop_columns.remove('Category')

print("Columns to be dropped : ")
for col in drop_columns:
     print(col)

# Drop features. Rebinding the names (instead of inplace=True) avoids mutating
# the frames returned by train_test_split in place.
X_train = X_train.drop(columns=drop_columns)
X_test = X_test.drop(columns=drop_columns)

In [None]:
# Remaining training features after the correlated columns were dropped.
caption = "Information on X_train dataframe : "
print(caption)
X_train.info()

In [None]:
# Remaining test features after the correlated columns were dropped.
caption = "Information on X_test dataframe : "
print(caption)
X_test.info()

In [None]:
# Train the three classifiers, then for each one: plot the confusion matrix and
# ROC curve, print the metrics, collect a row for the summary table, and keep
# track of the model with the highest F1 score on the test set.
print("Classification models used in this experiment : ")
model_names = ["DecisionTreeClassifier", "RandomForestClassifier", "XGBoostClassifier"]

for m in model_names:
    print(m)

# np.asarray yields the same array as DataFrame.values and leaves an existing
# array untouched, so re-running this cell no longer fails.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

# All three estimators are seeded so a fresh run reproduces the same models.
#Decision Tree
dt_model = dt(random_state=42).fit(X_train, Y_train)

#Random Forest
rf_model = rf(random_state=42).fit(X_train, Y_train)

#XGBoost
xgb_model = xgb(random_state=42).fit(X_train, Y_train)

models = [dt_model, rf_model, xgb_model]

max_f1 = -float('inf')
best_model = ""
tabulation_data = []
Y_names = [""]  # not used in this cell; kept in case another cell reads it

for model in models:
    y_pred = model.predict(X_test)
    y_true = Y_test
    # Probability of the positive class (Category == 1), used for the ROC curve.
    y_score = model.predict_proba(X_test)[:, 1]
    model_name = type(model).__name__
    file_name_cm = "CM_" + model_name
    file_name_roc = "ROC_" + model_name
    print("\nPrinting the metrics for the",model_name,"model :")
    conf_matrix = confusion_matrix(y_true, y_pred)

    # Confusion matrix
    fig, ax = plt.subplots(figsize=(8,8))
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['Non-extraction', 'Extraction'])
    disp.plot(ax=ax, colorbar=False)
    ax.tick_params(colors='darkblue')
    ax.xaxis.set_ticks_position('top')
    ax.xaxis.set_label_position('top')
    # Rotate the class labels after plotting, once the display has created them.
    ax.tick_params(axis='y', labelrotation=75)
    title_str = 'Confusion Matrix for ' + model_name
    plt.title(title_str, fontdict = font1, y = -0.1)
    plt.xlabel('Predicted Labels', fontdict = font2)
    plt.ylabel('Actual Labels', fontdict = font2)
    # 'sa' is not a savefig option; use 'transparent' as the other figures in this notebook do.
    plt.savefig(file_name_cm, facecolor = "white", transparent=False)
    plt.show()
    plt.close(fig)

    # ROC curve: built from predicted probabilities so the curve has more than
    # the single operating point that hard 0/1 predictions provide.
    fig, ax = plt.subplots(figsize=(8,8))
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    # Named roc_display so IPython's built-in display() is not shadowed.
    roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=model_name)
    roc_display.plot(ax = ax, name = "ROC curve for " + model_name)
    plt.title("ROC curve for " + model_name, fontdict = font1, y = -0.15)
    plt.savefig(file_name_roc, facecolor = "white", transparent=False)
    plt.show()
    plt.close(fig)

    # Threshold-based metrics use the hard predictions.
    accuracy_sc = round(accuracy_score(y_true, y_pred), ndigits = 3)
    f1_sc = round(f1_score(y_true, y_pred), ndigits = 3)
    precision_sc = round(precision_score(y_true, y_pred), ndigits = 3)
    recall_sc = round(recall_score(y_true, y_pred), ndigits = 3)
    print("\n\nClassification report : \n")
    print(classification_report(y_true, y_pred, target_names=['Non-extraction', 'Extraction']))
    print('Accuracy :', accuracy_sc)
    # Strict '>' keeps the earliest model when F1 scores tie.
    if f1_sc > max_f1:
        max_f1 = f1_sc
        best_model = {"model":model, "model_name":model_name}
    print("F1 score :", f1_sc)
    print("Precision score :", precision_sc)
    print("Recall score :", recall_sc)
    print("ROC Area under Curve for " + model_name + " :", round(roc_auc, ndigits = 3))
    tabulation_data.append([model_name, accuracy_sc, precision_sc, recall_sc, f1_sc])
print("\nBest model for the given data is :", best_model["model_name"])

In [None]:
# Learning curve for each classifier, saved as "Learning_curve_<name>".
# plot_learning_curves refits the estimator it is given on growing subsets of
# the training data, so an unfitted clone is passed: the models that were
# evaluated above (and best_model) stay exactly as they were trained.
for curve_name, fitted_model in zip(model_names, (dt_model, rf_model, xgb_model)):
    plot_learning_curves(X_train, Y_train, X_test, Y_test, clone(fitted_model))
    plt.title("Learning curve of "+curve_name, fontdict = font1, y = -0.3)
    # 'sa' is not a savefig option; use 'transparent' as the other figures in this notebook do.
    plt.savefig("Learning_curve_"+curve_name, transparent=False, bbox_inches='tight')
    plt.show()

In [None]:
# The models were trained on X with the correlated columns removed, so the
# labels must come from that reduced column list, in the same order. Using all
# of X.columns would attach the wrong feature name to each split.
feature_names = [column for column in X.columns if column not in drop_columns]
if len(feature_names) != dt_model.n_features_in_:
    raise ValueError(
        "feature_names has %d entries but the models were trained on %d features"
        % (len(feature_names), dt_model.n_features_in_)
    )
class_names = ['Non-extraction', 'Extraction']

# NOTE(review): 42x42 inches at dpi=700 is a very large raster; lower the dpi
# if saving is slow or runs out of memory.

# Titles are set after drawing because sklearn's plot_tree clears the axes it
# draws on, which removed a title set beforehand.
fig, ax = plt.subplots(figsize=(42,42))
tree.plot_tree(dt_model, feature_names = feature_names, class_names = class_names, filled=True, ax=ax)
ax.set_title("Visual represenation of decision tree")
plt.savefig("DecisionTreeClassifier_Visualization", dpi=700, bbox_inches='tight')
plt.show()

fig, ax = plt.subplots(figsize=(42,42))
tree.plot_tree(rf_model.estimators_[0], feature_names = feature_names, class_names = class_names, filled = True, ax=ax)
ax.set_title("Visual represenation of the first decision tree(out of 100 trees) in the random forest")
plt.savefig('RandomForestClassifier_Visualization', dpi=700, bbox_inches='tight')
plt.show()

# Pass ax explicitly: without it xgboost draws on a new default-sized figure,
# leaving the large titled figure empty.
fig, ax = plt.subplots(figsize=(42,42))
plot_tree(xgb_model, num_trees = 1, ax=ax)
ax.set_title("Visual represenation of the XGBClassifier")
plt.savefig('XGBClassifier_Visualization', dpi=700, bbox_inches='tight')
plt.show()

In [None]:
# Imported in this cell rather than with the other imports so that a missing
# ImageMagick/Wand installation only affects this final export step.
from wand.image import Image

print("Decision-making success rates of each classifier(%)")

col_names = ["Classifier Name", "Accuracy", "Precision score", "Recall score", "F1 Score"]

#Displaying the table
print(tabulate(tabulation_data, headers=col_names, tablefmt="outline"))

#Creating a dataframe
op = pd.DataFrame(tabulation_data, columns=col_names, index=model_names)

#Plotting the dataframe
ax = op.plot(figsize=(18, 8), title='Bar plot of performance metrics of the three models', kind='bar')
for container in ax.containers:
    ax.bar_label(container)
ax.tick_params(colors='darkblue')
plt.savefig("tabulations_graph", facecolor = "white", transparent=False, bbox_inches='tight')

#Rendering the dataframe as a table figure and writing it to a pdf
fig, ax = plt.subplots(figsize=(12,4))
ax.axis('tight')
ax.axis('off')
plt.title("Performance metrics of the three models" ,y=1.0)
the_table = ax.table(cellText=op.values, colLabels=op.columns, loc='center')

# Header cells: the whole label row (row 0) plus the first column of every data
# row. Derived from the table size instead of a hard-coded list of coordinates.
row_col_headers = [(0, col) for col in range(len(col_names))]
row_col_headers += [(row, 0) for row in range(1, len(op) + 1)]
cells = the_table.get_celld()
for cell in (cells[position] for position in row_col_headers):
    cell.set(facecolor = 'dimgray')
    cell.get_text().set_color('white')
    cell.get_text().set_fontweight('heavy')

pdf_filename = "tabulation.pdf"
# The context manager closes the PDF even if saving the figure fails.
with PdfPages(pdf_filename) as pdf:
    pdf.savefig(fig, bbox_inches='tight')

#Converting pdf to png
with Image(filename=pdf_filename, resolution=120) as source:
    for page in source.sequence:
        # Each page image is closed once it has been written out.
        with Image(page) as page_image:
            page_image.save(filename="tabulation.png")

In [None]:
# Read one tab-separated patient record and reduce it to the fields the models
# were trained on.
df_merged_columns = list(df_merged.columns)

# Positions to discard from the pasted record. The +1 shifts each dropped
# column's position by one because field 0 of the record is the case number,
# which is not one of df_merged's columns.
# NOTE(review): position 19 is hard-coded in the original; presumably a
# trailing field that is not a feature -- confirm against the input layout.
# Building a set first removes duplicate positions, which the previous
# delete-while-shifting loop could not handle.
remove_indices = sorted({df_merged_columns.index(i) + 1 for i in drop_columns} | {0, 19})

print("\n\n\n\n")
ip = input("Enter the data :")
splitter = chr(9)  # tab character
ip_list = ip.split(splitter)
if len(ip_list) < 2:
    raise ValueError("Expected a tab-separated record starting with case number and patient name")
patient_case_num = ip_list[0]
patient_name = ip_list[1]

# Keep every field whose position is not marked for removal. Positions beyond
# the end of the record are simply ignored.
discard = set(remove_indices)
ip_list = [field for position, field in enumerate(ip_list) if position not in discard]

In [None]:
# Convert the remaining fields to floats and classify the patient with the
# best-scoring model. np.array raises ValueError if any field is not numeric.
ip_ndarray = np.array(ip_list, dtype='float')
# predict expects a 2-D array: a single row holding all of the features.
prediction = best_model["model"].predict(ip_ndarray.reshape(1, -1))
result = "extract" if prediction[0]==1 else "not extract"
print("\n\n\n\n\n\n************************************")
print("Patient "+patient_name+" bearing case no. "+patient_case_num+":")
print("Decision is to", result)
print("************************************")