In [3]:
%load_ext autoreload
%autoreload 2

import itertools
import joblib
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import yaml

# import plot_confusion_matrix()
from src.report.visualize import plot_confusion_matrix

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Go to project root folder.
# A bare `%cd ..` climbs one more directory level every time this cell is
# re-run, which silently breaks every relative path used below. Only move up
# when params.yaml (opened relative to the working directory by the config
# cell) is not already present in the current directory.
import os

if not os.path.exists('params.yaml'):
    os.chdir('..')

# Show where we ended up so a wrong working directory is visible immediately.
os.getcwd()

c:\Users\Admin\Documents\Git\course-ds-base-1


# Config

In [4]:
# Load the pipeline parameters from params.yaml into `config` and display them
import pprint

with open('params.yaml') as params_stream:
    config = yaml.safe_load(params_stream)

pprint.pprint(config)

{'base': {'random_state': 42},
 'data': {'dataset_csv': 'data/raw/iris.csv',
          'features_path': 'data/processed/featured_iris.csv',
          'test_size': 0.2,
          'testset_path': 'data/processed/test_iris.csv',
          'trainset_path': 'data/processed/train_iris.csv'},
 'reports': {'confusion_matrix_image': 'reports/confusion_matrix.png',
             'metrics_file': 'reports/metrics.json'},
 'train': {'clf_params': {'C': 0.001,
                          'max_iter': 100,
                          'multi_class': 'multinomial',
                          'solver': 'lbfgs'},
           'model_path': 'models/model.joblib'}}


# Load dataset

In [5]:
# Get data 

# import pandas as pd
# from sklearn.datasets import load_iris

# data = load_iris(as_frame=True)
# dataset = data.frame
# dataset.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
# print labels for target values 

# [print(f'{target}: {label}') for target, label in zip(data.target.unique(), data.target_names)]

In [None]:
# feature names

# dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()]

# feature_names = dataset.columns.tolist()[:4]
# feature_names

In [None]:
# Save raw data
# dataset.to_csv(config['data']['dataset_csv'], index=False)

In [None]:
# Run the data-load stage as a standalone script through the shell,
# pointing it at the same params.yaml that this notebook reads.
!python src/stages/data_load.py --config=params.yaml 

In [10]:
# Run the data-load stage in-process by importing its entry point from
# src/stages/data_load.py (the same module the previous cell runs via the shell).
from src.stages.data_load import data_load

# The stage receives the path to the parameters file, not the parsed config.
data_load(config_path = 'params.yaml')

Data load complete


# Feature engineering

In [None]:
# Build the feature table from the raw CSV.
# `dataset` was previously created only by a cell that is now commented out,
# so on a fresh kernel this cell failed with NameError. Reading the file named
# in the config makes the cell self-contained.
# NOTE(review): assumes the data-load stage writes config['data']['dataset_csv']
# with the columns sepal_length, sepal_width, petal_length, petal_width and
# target — confirm against src/stages/data_load.py.
raw_dataset = pd.read_csv(config['data']['dataset_csv'])

# assign() returns a new frame instead of mutating its input, so the cell can
# be re-run safely and raw_dataset stays available for inspection.
dataset = raw_dataset.assign(
    sepal_length_to_sepal_width=raw_dataset['sepal_length'] / raw_dataset['sepal_width'],
    petal_length_to_petal_width=raw_dataset['petal_length'] / raw_dataset['petal_width'],
)[[
    'sepal_length', 'sepal_width', 'petal_length', 'petal_width',
#     'sepal_length_in_square', 'sepal_width_in_square', 'petal_length_in_square', 'petal_width_in_square',
    'sepal_length_to_sepal_width', 'petal_length_to_petal_width',
    'target'
]]

In [None]:
# Preview the first rows of the feature table (original columns, ratio features, target)
dataset.head()

In [None]:
# Persist the feature table to the configured CSV path, without the row index
dataset.to_csv(path_or_buf=config['data']['features_path'], index=False)

# Split dataset

In [None]:
# Split the feature table into train and test parts; the test fraction and the
# seed both come from params.yaml so the split is reproducible.
train_dataset, test_dataset = train_test_split(
    dataset,
    random_state=config['base']['random_state'],
    test_size=config['data']['test_size']
)

# Show the shapes of both parts as the cell output
(train_dataset.shape, test_dataset.shape)

In [None]:
# Save train and test sets.
# index=False matches how the raw and feature CSVs are written in this notebook.
# Without it pandas writes the row index as an extra unnamed first column, which
# comes back as "Unnamed: 0" on read and would be picked up as a feature by any
# consumer that drops only `target`.
train_dataset.to_csv(config['data']['trainset_path'], index=False)
test_dataset.to_csv(config['data']['testset_path'], index=False)

# Train

In [None]:
# Separate the training split into the label vector (int32) and the
# feature matrix (float32, every column except `target`)

y_train = train_dataset['target'].to_numpy().astype('int32')
X_train = train_dataset.drop(columns='target').to_numpy().astype('float32')

In [None]:
# Instantiate a LogisticRegression classifier (the plain estimator, not the
# cross-validated LogisticRegressionCV) with the hyper-parameters and seed
# from params.yaml, then fit it on the training split

logreg = LogisticRegression(
    random_state=config['base']['random_state'],
    **config['train']['clf_params']
)
logreg.fit(X_train, y_train)

In [None]:
# Serialize the fitted classifier to the model path configured in params.yaml
joblib.dump(value=logreg, filename=config['train']['model_path'])

# Evaluate

In [None]:
# Separate the test split into the label vector (int32) and the
# feature matrix (float32, every column except `target`)

y_test = test_dataset['target'].to_numpy().astype('int32')
X_test = test_dataset.drop(columns='target').to_numpy().astype('float32')

In [None]:
# Predict on the held-out test split and score the predictions.
prediction = logreg.predict(X_test)

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones. The arguments used to be passed
# in the opposite order, which transposed the matrix; keyword arguments keep
# the roles explicit.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)

# Macro-averaged F1: unweighted mean of the per-class F1 scores.
f1 = f1_score(y_true=y_test, y_pred=prediction, average='macro')

In [None]:
# Display the macro-averaged F1 score computed on the test split
f1

In [None]:
# Write the evaluation metrics as indented JSON to the configured report file
metrics = {'f1': f1}

with open(config['reports']['metrics_file'], 'w') as metrics_stream:
    json.dump(metrics, metrics_stream, indent=4)

In [None]:
# Plot the confusion matrix with human-readable class labels.
# `data` was only ever defined in a commented-out cell, so `data.target_names`
# raised NameError on a fresh kernel. The class names are loaded here from the
# same scikit-learn loader that commented-out cell used.
from sklearn.datasets import load_iris

# NOTE(review): assumes the targets in the raw CSV keep the 0/1/2 encoding of
# scikit-learn's bundled iris dataset — confirm against src/stages/data_load.py
target_names = load_iris().target_names

cm_plot = plot_confusion_matrix(cm, target_names, normalize=False)

In [None]:
# Save confusion matrix image to the path configured under reports.confusion_matrix_image
# NOTE(review): relies on plot_confusion_matrix returning an object that exposes
# savefig() — confirm in src/report/visualize.py
cm_plot.savefig(config['reports']['confusion_matrix_image'])