In [1]:
! pip install lineapy




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
! python -m pip install pandas==1.3.2



In [None]:
# Load the lineapy IPython extension for this notebook session.
%load_ext lineapy

In [None]:
# This cell runs before the notebook's main import cell, so bind the package name
# here; otherwise `lineapy` may be undefined on a fresh kernel.
import lineapy

# Display the current LineaPy configuration.
lineapy.options

In [None]:
import itertools
import json
from pathlib import Path

import joblib
import lineapy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split


# Load dataset

In [None]:
# Get data 

import pandas as pd
from sklearn.datasets import load_iris

# Fetch iris with pandas containers and work on the combined frame
# (it includes the 'target' column used later in the notebook).
data = load_iris(as_frame=True)
dataset = data['frame']
dataset.head(5)

In [None]:
# Print the label for each target value.
# target_names[i] is the label of class i, so enumerate gives the pairing directly
# instead of relying on the order in which values first appear in the data.
for target, label in enumerate(data.target_names):
    print(f'{target}: {label}')

In [None]:
# feature names

# Drop the trailing unit and use underscores, e.g. 'sepal length (cm)' -> 'sepal_length'.
# An explicit suffix check is used because str.strip(' (cm)') removes any of the
# characters ' ', '(', 'c', 'm', ')' from BOTH ends, which would mangle a column
# name that starts or ends with one of them.
unit_suffix = ' (cm)'
dataset.columns = [
    (colname[:-len(unit_suffix)] if colname.endswith(unit_suffix) else colname).replace(' ', '_')
    for colname in dataset.columns.tolist()
]

# The first four columns are the measured features.
feature_names = dataset.columns.tolist()[:4]
feature_names

In [None]:
# Save raw data as a CSV artifact
dataset_csv = './data/raw/iris.csv'
# Create the target folder if it is missing so the export also works on a fresh checkout.
Path(dataset_csv).parent.mkdir(parents=True, exist_ok=True)
dataset.to_csv(dataset_csv, index=False)


In [None]:
# Display the frame as it stands after the column rename and CSV export.
dataset

In [None]:
# Show which pandas version the kernel actually loaded (an earlier cell installs pandas==1.3.2).
print(pd.__version__)

In [None]:
# Register the current `dataset` frame with LineaPy under the artifact name "iris-raw".
lineapy.save(dataset, "iris-raw")

# Feature engineering

In [None]:
# Ratio features: length divided by width, for sepals and for petals.
dataset['sepal_length_to_sepal_width'] = dataset['sepal_length'].div(dataset['sepal_width'])
dataset['petal_length_to_petal_width'] = dataset['petal_length'].div(dataset['petal_width'])

# Columns kept for modelling: raw measurements, the two ratios, then the label.
# (Squared-measurement features are not part of the current selection.)
selected_columns = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'sepal_length_to_sepal_width',
    'petal_length_to_petal_width',
    'target',
]
dataset = dataset[selected_columns]

In [None]:
# Preview the first rows of the frame, including the engineered ratio columns.
dataset.head()

In [None]:
# Save features
features_path = './data/processed/featured_iris.csv'
# Create the target folder if it is missing so the export also works on a fresh checkout.
Path(features_path).parent.mkdir(parents=True, exist_ok=True)
dataset.to_csv(features_path, index=False)

In [None]:
# Register the feature-engineered `dataset` frame with LineaPy as "iris-preprocessed".
lineapy.save(dataset, "iris-preprocessed")

# Split dataset

In [None]:
# Fraction of the rows held out for the test split.
test_size = 0.2

## Split train/test

In [None]:
# Keep the seed in a named variable assigned from a literal: the pipeline cell lists
# "random_state" in `input_parameters`, so a variable with that name has to exist
# and be the value actually used by the split.
random_state = 42

train_dataset, test_dataset = train_test_split(dataset, test_size=test_size, random_state=random_state)
train_dataset.shape, test_dataset.shape

In [None]:
# Save train and test sets
trainset_path = './data/processed/train_iris.csv'
testset_path = './data/processed/test_iris.csv'

# Create the target folder(s) if missing so the export also works on a fresh checkout.
Path(trainset_path).parent.mkdir(parents=True, exist_ok=True)
Path(testset_path).parent.mkdir(parents=True, exist_ok=True)

# index=False matches the other CSV exports in this notebook; without it the shuffled
# row index is written as an extra unnamed column that would be read back as a feature.
train_dataset.to_csv(trainset_path, index=False)
test_dataset.to_csv(testset_path, index=False)

In [None]:
# Register both splits with LineaPy as the artifacts "train-dataset" and "test-dataset".
lineapy.save(train_dataset, "train-dataset")
lineapy.save(test_dataset, "test-dataset")

# Train

In [None]:
# Split the training frame into the label vector (int32) and the feature matrix (float32).

y_train = train_dataset['target'].values.astype('int32')
X_train = train_dataset.drop(columns='target').values.astype('float32')

In [None]:
# Build a multinomial logistic-regression classifier and fit it on the training split.

logreg = LogisticRegression(
    C=0.001,
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=100,
)
logreg.fit(X_train, y_train)

In [None]:
# Persist the fitted classifier with joblib.
model_path = './models/model.joblib'
# Create the target folder if it is missing so the dump also works on a fresh checkout.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(logreg, model_path)

In [None]:
# Save the fitted model to LineaPy.
# Pass the estimator itself: saving `model_path` would store only the path string,
# which does not depend on the training code.
lineapy.save(logreg, "logreg-model")

# Evaluate

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map and return the figure.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (2-D array-like; rows are labelled 'True label' and
                  columns 'Predicted label' on the plot)

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low'];
                  pass None to keep the default numeric ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  (defaults to plt.get_cmap('Blues') when None)

    normalize:    If False, plot the raw numbers
                  If True, plot the per-row proportions
                  (a row whose counts sum to zero is shown as zeros)

    Returns
    -------
    The matplotlib figure the matrix was drawn on.

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    cm = np.asarray(cm)

    # Accuracy always comes from the raw counts, before any normalisation.
    total = float(np.sum(cm))
    accuracy = np.trace(cm) / total if total else float('nan')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Normalise BEFORE drawing so the cell colours, the printed numbers and the
    # text-colour threshold all refer to the same values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any samples stay at 0 instead of producing NaN / warnings.
        values = np.divide(cm, row_totals,
                           out=np.zeros(cm.shape, dtype=float),
                           where=row_totals != 0)
        cell_format = "{:0.4f}"
        thresh = values.max() / 1.5
    else:
        values = cm
        cell_format = "{:,}"
        thresh = values.max() / 2

    fig = plt.figure(figsize=(8, 6))
    plt.imshow(values, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Annotate every cell; use white text on dark cells and black on light ones.
    for i, j in itertools.product(range(values.shape[0]), range(values.shape[1])):
        plt.text(j, i, cell_format.format(values[i, j]),
                 horizontalalignment="center",
                 color="white" if values[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the axis labels exist so they are taken into account.
    plt.tight_layout()

    return fig

In [None]:
# Split the test frame into the label vector (int32) and the feature matrix (float32).

y_test = test_dataset['target'].values.astype('int32')
X_test = test_dataset.drop(columns='target').values.astype('float32')

In [None]:
prediction = logreg.predict(X_test)
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are then the true
# classes and columns the predicted ones, matching the axis labels of the plot.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
f1 = f1_score(y_true=y_test, y_pred=prediction, average='macro')

In [None]:
# Display the macro-averaged F1 score computed on the test split.
f1

In [None]:
# Save metrics
metrics_file = './reports/metrics.json'

metrics = {
    'f1': f1
}

# Create the target folder if it is missing so the report can be written on a fresh checkout.
Path(metrics_file).parent.mkdir(parents=True, exist_ok=True)

with open(metrics_file, 'w') as mf:
    json.dump(
        obj=metrics,
        fp=mf,
        indent=4
    )


In [None]:
# Draw the confusion matrix with raw counts and keep the returned figure.
cm_plot = plot_confusion_matrix(
    cm=cm,
    target_names=data.target_names,
    normalize=False,
)

In [None]:
# Save confusion matrix image
confusion_matrix_image = './reports/confusion_matrix.png'
# Create the target folder if it is missing so the image can be written on a fresh checkout.
Path(confusion_matrix_image).parent.mkdir(parents=True, exist_ok=True)
cm_plot.savefig(confusion_matrix_image)

In [None]:
# Save the confusion-matrix figure to LineaPy.
# Pass the rendered figure rather than the `plot_confusion_matrix` function: the
# function object has no dependency on the model or on the test predictions.
lineapy.save(cm_plot, "plot-confusion-matrix")

In [None]:
# Build a DVC pipeline out of the artifacts saved above.

# Artifacts to export.
pipeline_artifacts = [
    "iris-raw",
    "iris-preprocessed",
    "train-dataset",
    "test-dataset",
    "logreg-model",
    "plot-confusion-matrix",
]

# Each key is listed together with the set of artifacts declared as its prerequisites.
artifact_dependencies = {
    "plot-confusion-matrix": {"logreg-model"},
    "logreg-model": {"train-dataset", "test-dataset"},
    "train-dataset": {"iris-preprocessed"},
    "test-dataset": {"iris-preprocessed"},
    "iris-preprocessed": {"iris-raw"},
}

lineapy.to_pipeline(
    artifacts=pipeline_artifacts,
    dependencies=artifact_dependencies,
    pipeline_name="course-ds-base-pipeline",
    framework="DVC",
    input_parameters=["test_size", "random_state"],
    output_dir="./dvc_pipeline_example/",
)
