In [52]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from dataprep.eda import plot, plot_correlation
from pycaret.classification import *

In [13]:
# Load the land-mine measurements from a CSV file in the working directory.
df = pd.read_csv("./land_mines.csv")
# Optional restriction to rows with S == 1 (and removal of the S column);
# left disabled, so the full frame is used by every cell below.
# df = df[df.S == 1]
# df = df.drop("S", axis=1)
# Column dtypes / non-null counts, then a preview of the first rows.
df.info()
df.head()

In [14]:
# Display names for the numeric codes compared against columns S and M below;
# list position i holds the label used for code i + 1.
# NOTE(review): assumes S is coded 1..6 and M is coded 1..5 in the CSV — confirm.
soil_labels = ["Dry and Sandy",
               "Dry and Humus",
               "Dry and Limy",
               "Humid and Sandy",
               "Humid and Humus",
               "Humid and Limy"]
mine_labels = ["Null",
               "Anti-tank",
               "Anti-personnel",
               "Booby Trapped Anti-personnel",
               "M14 Anti-personnel"]

# One histogram of V per (soil type, mine type) pair:
# grid rows follow soil_labels, grid columns follow mine_labels.
fig, axes = plt.subplots(len(soil_labels), len(mine_labels), figsize=(20, 24))
for soil_type, soil_label in enumerate(soil_labels, start=1):
    for mine_type, mine_label in enumerate(mine_labels, start=1):
        ax = axes[soil_type - 1, mine_type - 1]
        # Build a single combined mask. Chaining df[mask_a][mask_b] applies a
        # full-length mask to an already filtered frame, which forces pandas to
        # reindex the mask and emit a warning.
        selected_v = df.loc[(df.S == soil_type) & (df.M == mine_type), "V"]
        ax.hist(selected_v)
        ax.set_title(mine_label)
        ax.set_xlabel('V')
        ax.set_ylabel(soil_label)

In [15]:
# Pairwise scatter/distribution grid of every column, coloured by column M.
sns.pairplot(data=df, hue="M")

In [16]:
# Pairwise correlations between the columns of df.
corr = df.corr()

# Flag the upper triangle (diagonal included) so each pair is annotated once.
mask = np.zeros_like(corr)
upper_rows, upper_cols = np.triu_indices_from(mask)
mask[upper_rows, upper_cols] = True

heatmap_options = dict(
    mask=mask,
    square=True,
    cbar=False,
    annot=True,
    linewidths=.5,
)
with sns.axes_style("white"):
    ax = sns.heatmap(corr, **heatmap_options)

In [17]:
from scipy import stats

# Ordinary least-squares fit with x = V and y = H.
# The two columns are passed as separate arguments: handing linregress one
# two-column array and letting it split the pair itself is deprecated in
# recent SciPy releases. The fitted values are the same either way.
fit_output = stats.linregress(df['V'], df['H'])
# The result unpacks to five values; the last is the standard error of the slope.
slope, intercept, r_value, p_value, slope_std_error = fit_output
print(slope, intercept, r_value, p_value, slope_std_error)

In [18]:
# Hold out 30% of the rows for testing; every column except M is a feature,
# M is the label. random_state=0 keeps the split reproducible.
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(
    df.drop(columns=["M"]),
    df['M'],
    test_size=0.3,
    random_state=0)

# The split must partition df exactly, with features and labels aligned.
assert len(train_data) + len(test_data) == len(df)
assert len(train_data) == len(train_labels) and len(test_data) == len(test_labels)

# Summarise the split instead of dumping four whole frames/series to the output.
print("train:", train_data.shape, train_labels.shape)
print("test: ", test_data.shape, test_labels.shape)

In [21]:
# Baseline: linear classifier trained with SGD on the training split,
# scored on the held-out test split.
model = linear_model.SGDClassifier(
    alpha=0.001,
    max_iter=100,
    random_state=0,
).fit(train_data, train_labels)
model_predictions = model.predict(test_data)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(
    metrics.accuracy_score(test_labels, model_predictions),
    metrics.classification_report(test_labels, model_predictions),
    sep="\n",
)

In [22]:
# 6-fold cross-validation of the baseline model on the training split;
# report the mean fold score.
scores = model_selection.cross_val_score(
    estimator=model,
    X=train_data,
    y=train_labels,
    cv=6,
)
print(np.mean(scores))

In [41]:
# Hyper-parameter search space: 10 iteration caps x 10 regularisation strengths.
parameters_grid = dict(
    max_iter=range(90, 100),
    alpha=np.linspace(0.0001, 0.001, num=10),
)

classifier = linear_model.SGDClassifier(random_state=0)

# Five stratified random 70/30 re-splits of the training data for validation.
cv = model_selection.StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.3,
    random_state=0,
)

grid_cv = model_selection.GridSearchCV(
    estimator=classifier,
    param_grid=parameters_grid,
    scoring='accuracy',
    cv=cv,
)
grid_cv.fit(train_data, train_labels)
print(grid_cv.best_estimator_)

In [43]:
# Evaluate the tuned classifier.
# The hyper-parameters come straight from the grid search instead of being
# re-typed by hand, so they cannot drift outside the searched range.
classifier = linear_model.SGDClassifier(random_state=0, **grid_cv.best_params_)

# Fit and score `classifier` itself: fitting the earlier baseline `model`
# here would only reproduce the baseline numbers.
classifier.fit(train_data, train_labels)
model_predictions = classifier.predict(test_data)
print(metrics.accuracy_score(test_labels, model_predictions))
print(metrics.classification_report(test_labels, model_predictions))

# 10-fold cross-validation of the tuned configuration on the training split.
scores = model_selection.cross_val_score(classifier, train_data, train_labels, cv=10)
print(scores.mean())

In [45]:
# Number of rows per value of the label column M.
sns.countplot(data=df, x="M")

In [46]:
# dataprep correlation report for the column pair H and V.
plot_correlation(df, 'H', 'V')

In [47]:
# dataprep correlation report for the column pair V and S.
plot_correlation(df, 'V', 'S')

In [48]:
# dataprep EDA plot for the column pair M and V.
plot(df, 'M', 'V')

In [49]:
# dataprep EDA plot for the column pair M and H.
plot(df, 'M', 'H')

In [50]:
# dataprep EDA plot for the column pair M and S.
plot(df, 'M','S')

In [51]:
# One histogram per column of df, drawn on an 8x7 inch figure.
df.hist(figsize=(8,7))

In [53]:
# Initialise the PyCaret classification experiment on the full frame, with M as the target.
# NOTE(review): session_id=123 presumably pins PyCaret's random seed — confirm against the PyCaret docs.
s = setup(df, target='M', session_id=123)

In [54]:
# Run PyCaret's model comparison; the returned model is kept as `best`
# and reused by every cell below.
best = compare_models()

In [55]:
# Alternative left disabled:
# evaluate_model(best)
# Feature plot for the selected model.
plot_model(best, plot='feature')

In [56]:
# AUC plot for the selected model.
plot_model(best, plot = 'auc')

In [57]:
# Confusion matrix for the selected model.
plot_model(best, plot = 'confusion_matrix')

In [58]:
# Per-class classification report plot for the selected model.
plot_model(best, plot='class_report')

In [59]:
# Score `best` without passing data explicitly.
# NOTE(review): presumably this uses the hold-out split created by setup() — confirm.
predict_model(best)

In [60]:
# Predict on the full frame, requesting raw scores alongside the predicted label.
# NOTE(review): df is the same frame that was passed to setup(), so these rows
# presumably include data the model was trained on — do not read this as a
# generalisation estimate.
predictions = predict_model(best, data=df, raw_score=True)
predictions.head()

In [61]:
# Rows where the true label M equals the predicted label.
predictions.query('M == prediction_label')

In [63]:
# Persist the selected model under the name 'best_pipeline'.
save_model(best, 'best_pipeline')

In [64]:
# Reload the artifact saved as 'best_pipeline' and print it to confirm the round trip.
loaded_model = load_model('best_pipeline')
print(loaded_model)