# 1. Settings

In [None]:
# Import packages
import pandas as pd
import numpy as np

import os
import sys

# Import viz tools
from matplotlib import pyplot as plt
import seaborn as sns

# Modelling packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logger
import logging

In [None]:
# Import the project's logging and config helpers (ConfigReader is used
# further down, when config.ini is read)
from eztools.operations import Logger, ConfigReader
# Build the notebook-wide logger named 'L&L'
# NOTE(review): '/mnt/logs/' is presumably the log output directory - confirm
logger = Logger('/mnt/logs/', logger_name = 'L&L').get_logger()

In [None]:
# Root of the project repository; the `src.*` imports in the next cell rely
# on this directory being importable
CWD_PATH = '/repos/my-awesome-project'

# Change working directory
os.chdir(CWD_PATH)

# Make the project root importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if CWD_PATH not in sys.path:
    sys.path.append(CWD_PATH)

In [None]:
# Import packages
from src.etl.get_data import read_csv_data
from src.etl.get_missing_values import get_df_na, get_na_columns, impute_nan, plot_kdensity
from src.etl.get_train_test_set import get_train_test_set
from src.ml.get_lasso_model_predictions import get_lasso_model_predictions
from src.ml.get_model_accuracy import get_model_accuracy

In [None]:
# Read config.ini
# The path is built from CWD_PATH so the repository root is defined in exactly
# one place; on POSIX this resolves to the same
# '/repos/my-awesome-project/src/config/config.ini' as the previous literal.
CONFIG_PATH = os.path.join(CWD_PATH, 'src', 'config', 'config.ini')
config = ConfigReader(CONFIG_PATH, config_tuple = False).read_config()

# Unpack config
# Path of the input data, later handed to read_csv_data()
DATA_PATH = config['data']['data_path']

# 2. ML Pipeline

## 2.1 Read data

We want to create a function named **read_csv_data()** with the following properties:
1. **Takes a full path for a file with a csv extension** <br>
E.g. folder/subfolder/my_csv_data.csv
2. **Output a dataframe with the correct rows and columns** <br>
E.g. Indexes should not be presented as columns

In [None]:
# Read data
# Load the csv file located at DATA_PATH into a dataframe
df = read_csv_data(DATA_PATH)

In [None]:
# Print a summary of the data (assuming df is a pandas DataFrame, info()
# lists the columns with their non-null counts and dtypes; nothing is plotted)
df.info()

## 2.2 Missing values

### 2.2.1 Calculate missing values

We want to create a function named **get_df_na()** with the following properties: <br>
1. **Contains two columns:** <br>
number_of_nan & number_of_nan_prc
2. **Captures the _absolute_ number of na values for a column name in the "number_of_nan" column** <br>
E.g. "pH" column contains 4 na values in total
3. **Captures the _percentage_ of na values for a column name in the "number_of_nan_prc" column** <br>
E.g. "pH" column contains 78% na values in total
4. **Sorts rows in descending order of na values** <br>
I.e. Columns with the highest number of na values will appear on the top rows in our newly created dataframe

In [None]:
# Get df with na values
# One row per column of df, holding its na count and na percentage
df_na = get_df_na(df)
# Bare expression as the last line so the notebook displays the table
df_na

We want to create a function named **get_na_columns()** with the following properties: <br>
1. **Gets a dataframe as an input where each row corresponds to a feature (i.e. column from a raw dataframe)** <br>
E.g. the output of the get_df_na() function created above
2. **Returns a list that holds all the row names that contain na values** <br>
E.g. ['pH', 'density'] etc.
3. **Makes the selection of which row name to keep according to a function argument named nan_column** <br>
E.g. get_na_columns(df_na, nan_column = 'number_of_nan') means that the 'number_of_nan' column should be used to select the row names

In [None]:
# Columns with nan values
# nan_column is not passed, so get_na_columns() falls back to its default
# (the default is not visible in this notebook)
COLS_TO_IMPUTE = get_na_columns(df_na)
# Display the resulting list of column names
COLS_TO_IMPUTE

### 2.2.2 Impute nan values

In [None]:
# Plot the distribution of the 'pH' column before it is imputed
# (only this one column is plotted, not every column with missing values)
plot_kdensity(df, 'pH')

We want to create a function named **impute_nan()** with the following properties:
1. **Gets a column or a list of columns whose na values should be imputed, together with a replacement method (accepts 'mean' & 'median')** <br>
E.g. cols = ['pH', 'density'] with replacement = 'median' means that a median imputation is performed for the 'pH' and 'density' columns.

In [None]:
# Impute nan values
# Only 'pH' is imputed here, using the 'mean' replacement method.
# NOTE(review): COLS_TO_IMPUTE is computed earlier but never used - confirm
# that no other column needs imputing before modelling
df = impute_nan(df, cols = 'pH', replacement = 'mean')

## 2.3 Modelling

### 2.3.1 Lasso Logistic regression

We want to create a function named **get_train_test_set()** with the following properties:
1. **Takes a dataframe as an input with a specified response column to perform a 75%-25% split into train and test sets** <br>
I.e. 75% for training and 25% for testing as per the default values of sklearn
2. **Performs binary encoding for the response variable if specified** <br>
E.g. if encode = True, then check which is the positive class specified by the <i>pos_class</i> function argument

In [None]:
# Split data into train and test set
# 'wine_colour' is the response column and 'white' its positive class.
# NOTE(review): `encode` is not passed, so whether the response gets binary
# encoded depends on get_train_test_set()'s default - confirm
X_train, X_test, y_train, y_test = get_train_test_set(df, response = 'wine_colour', pos_class = 'white')

We want to create a function named **get_lasso_model_predictions()** with the following properties:
1. **Accepts a train dataset (X and y) to train a lasso model and performs predictions on unseen data (X test)** <br>
E.g. X_train, X_test, y_train

In [None]:
# Train lasso model and get predictions
# Train on (X_train, y_train), then predict for the held-out X_test
y_pred = get_lasso_model_predictions(X_train, X_test, y_train)

# 3. Model evaluation

We want to create a function named **get_model_accuracy()** with the following properties:
1. **Accepts two pd.Series (1st: actual values, 2nd: expected/predicted values) and calculates the prediction accuracy** <br>
E.g. 0.5 means 50% accuracy, 0.33 means 33% accuracy, 1 means 100% accuracy

In [None]:
# Model evaluation
# Argument order: actual values first, predicted values second
accuracy = get_model_accuracy(y_test, y_pred)
# Display the accuracy as a fraction (e.g. 0.5 means 50%)
accuracy