In [1]:
# Clear the interactive namespace so the cells below do not depend on leftover kernel state.
%reset -f

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the land-mine measurements; "M" is the target column, every other
# column is used as a feature.
df = pd.read_csv("./land_mines.csv")

# Optional rescaling of the categorical codes, kept disabled.
# NOTE(review): presumably needed only when S/M should be mapped to [0, 1];
# the class-splitting cell has matching commented-out value lists.
# min_max_scaler = MinMaxScaler()
# df[["S", "M"]] = min_max_scaler.fit_transform(df[["S", "M"]])

# X is taken *before* the embedding columns are appended to df, so it only
# holds the original feature columns.
X = df.drop(columns=["M"])
y = df["M"]

# t-SNE is stochastic: without a fixed seed the embedding (and every scatter
# plot built from it) changes on each run. 17 matches the seed used for the
# decision trees later in the notebook.
tsne_features = TSNE(random_state=17).fit_transform(X)

# Store the 2-D embedding next to the raw data for plotting.
df["x"] = tsne_features[:, 0]
df["y"] = tsne_features[:, 1]

In [None]:
def _binary_entropy(p):
    """Shannon entropy in bits of a two-class distribution with P(+) = p.

    Uses the convention 0 * log2(0) = 0, so the curve is exactly 0 at p = 0
    and p = 1 instead of NaN (which is what ``0 * np.log2(0)`` evaluates to).
    """
    p = np.asarray(p, dtype=float)
    result = np.zeros_like(p)
    for q in (p, 1 - p):
        positive = q > 0  # skip zero-probability terms: they contribute nothing
        result[positive] -= q[positive] * np.log2(q[positive])
    return result


# Compare the impurity criteria for binary classification as a function of p+.
fig, ax = plt.subplots(figsize=(6, 4))
xx = np.linspace(0, 1, 50)  # includes both endpoints 0 and 1

gini = 2 * xx * (1 - xx)
misclassification = 1 - np.maximum(xx, 1 - xx)

ax.plot(xx, gini, label='gini')
ax.plot(xx, 2 * gini, label='2*gini')
ax.plot(xx, _binary_entropy(xx), label='entropy')
ax.plot(xx, misclassification, label='missclass')
ax.plot(xx, 2 * misclassification, label='2*missclass')
ax.set_xlabel('p+')
ax.set_ylabel('criterion')
ax.set_title('criteria of quality as a function of p+ (binary classification)')
ax.legend();

In [5]:
# Human-readable names, index-aligned with soil_vals / mine_vals below.
soil_labels = ["Dry and Sandy",
               "Dry and Humus",
               "Dry and Limy",
               "Humid and Sandy",
               "Humid and Humus",
               "Humid and Limy"]
mine_labels = ["Null",
               "Anti-tank",
               "Anti-personnel",
               "Booby Trapped Anti-personnel",
               "M14 Anti-personnel"]
# One matplotlib colour per class index. The last entry used to be "p",
# which is not a valid matplotlib colour name.
train_colors = ["w",
                "r",
                "g",
                "b",
                "y",
                "purple"]

# Alternative class codes, kept disabled.
# NOTE(review): presumably for the case where S and M were min-max scaled to [0, 1].
#mine_vals = [0.0, 0.25, 0.5, 0.75, 1.0]
#soil_vals = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
mine_vals = [1, 2, 3, 4, 5]
soil_vals = [1, 2, 3, 4, 5, 6]


def _rows_matching(dataframe, column, value):
    """Return the rows of ``dataframe`` whose ``column`` equals ``value``.

    np.isclose is used for every split so the selection also works when the
    codes are stored as floats.
    """
    return dataframe[np.isclose(dataframe[column], value)]


def x_y_values_transpose(dataframe):
    """Return the embedding of ``dataframe`` as a (2, n_rows) array: row 0 is x, row 1 is y."""
    return dataframe[["x", "y"]].values.transpose()


# One sub-frame per mine type / per soil type.
mine_classes = [_rows_matching(df, "M", value) for value in mine_vals]
soil_classes = [_rows_matching(df, "S", value) for value in soil_vals]

# mine_classified_train_data[i] -> (x, y) coordinates of mine class i.
mine_classified_train_data = [x_y_values_transpose(mine_df) for mine_df in mine_classes]

# soil_classified_train_data[j][i] -> (x, y) coordinates of mine class i within soil class j.
soil_classified_train_data = [
    [x_y_values_transpose(_rows_matching(soil_df, "M", value)) for value in mine_vals]
    for soil_df in soil_classes
]

In [None]:
# t-SNE embedding of all samples, one colour per mine type.
# Labels are matched to classes by position (mine_labels[i] <-> mine_vals[i]);
# NOTE(review): confirm this ordering against the dataset description.
fig, ax = plt.subplots(figsize=(10, 8))
for xy, color, label in zip(mine_classified_train_data, train_colors, mine_labels):
    ax.scatter(xy[0], xy[1],
               c=color, s=50, edgecolors='black', linewidth=1.5,
               label=label)
ax.set_xlabel('t-SNE component 1')
ax.set_ylabel('t-SNE component 2')
ax.set_title('t-SNE embedding by mine type')
ax.legend(title='Mine type');

In [None]:
# One figure per soil type: the t-SNE embedding restricted to that soil,
# coloured by mine type. Labels are matched to classes by list position.
for soil_label, per_mine_xy in zip(soil_labels, soil_classified_train_data):
    fig, ax = plt.subplots(figsize=(10, 8))
    for xy, color, label in zip(per_mine_xy, train_colors, mine_labels):
        ax.scatter(xy[0], xy[1],
                   c=color, s=50, edgecolors='black', linewidth=1.5,
                   label=label)
    ax.set_title(soil_label)
    ax.set_xlabel('t-SNE component 1')
    ax.set_ylabel('t-SNE component 2')
    ax.legend(title='Mine type')

In [15]:
from sklearn.tree import export_graphviz
from IPython.display import display, SVG
from graphviz import Source
from sklearn import tree

def tree_graph(clf_tree, feature_names):
    """Draw a fitted decision tree on a new 10x10 inch matplotlib figure.

    Parameters
    ----------
    clf_tree
        Fitted tree estimator, passed straight to ``sklearn.tree.plot_tree``.
    feature_names
        Names of the training columns, in training order, used to label the
        split nodes. ``None`` falls back to plot_tree's generic names.
    """
    names = list(feature_names) if feature_names is not None else None
    fig, ax = plt.subplots(figsize=(10, 10))
    # Forward the names (previously ignored) and draw on the axes created above
    # rather than on whatever axes happens to be current.
    tree.plot_tree(clf_tree, feature_names=names, fontsize=10, ax=ax)
    plt.show()

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def tree_prediction(X_train, y_train, colnames, depth=3):
    """Fit an entropy-based decision tree, report its accuracy and draw it.

    The accuracy is measured on the same data the tree was fitted on, so it
    is a training accuracy, not an estimate of generalisation performance.

    Parameters
    ----------
    X_train
        Feature matrix used for fitting and for the reported predictions.
    y_train
        Target labels aligned with ``X_train``.
    colnames
        Feature names forwarded to ``tree_graph``.
    depth : int, default 3
        Maximum depth of the tree.
    """
    clf_tree = DecisionTreeClassifier(criterion="entropy", max_depth=depth, random_state=17)
    clf_tree.fit(X_train, y_train)
    predicted = clf_tree.predict(X_train)
    # accuracy_score is documented as (y_true, y_pred); keep that order.
    accuracy = accuracy_score(y_train, predicted)
    print(f'Accuracy: {accuracy}')
    tree_graph(clf_tree, colnames)

In [None]:
# Depth-4 tree on all features. The names are taken from X itself so they
# always match the column order the tree is trained on.
tree_prediction(X, y, list(X.columns), 4)

In [None]:
# Fit and draw one depth-5 tree per soil type, using only V and H as features.
# Soil codes in column "S" run from 1 upwards, in the same order as soil_labels.
for soil_code, soil_name in enumerate(soil_labels, start=1):
    soil_rows = df.loc[df["S"] == soil_code]
    print(soil_name)
    tree_prediction(soil_rows[["V", "H"]], soil_rows["M"], ["V", "H"], 5)