This notebook contains functions relevant to the machine learning case study

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Define-all-functions-within-notebook" data-toc-modified-id="Define-all-functions-within-notebook-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Define all functions within notebook</a></span></li><li><span><a href="#The-functions" data-toc-modified-id="The-functions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>The functions</a></span><ul class="toc-item"><li><span><a href="#FUNCTION---linear_reg_model_creation" data-toc-modified-id="FUNCTION---linear_reg_model_creation-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>FUNCTION - linear_reg_model_creation</a></span></li><li><span><a href="#FUNCTION---prediction_using_model" data-toc-modified-id="FUNCTION---prediction_using_model-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>FUNCTION - prediction_using_model</a></span></li><li><span><a href="#FUNCTION---single_step_create_predict" data-toc-modified-id="FUNCTION---single_step_create_predict-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>FUNCTION - single_step_create_predict</a></span></li></ul></li></ul></div>

# Imports

In [14]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from importnb import Notebook
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

with __import__('importnb').Notebook():
    from df_to_fig import df_to_hist

# Define all functions within notebook

In [15]:
def in_out_def():
    '''Describe the inputs and outputs of every function in this notebook.

    Returns:
        dict: keyed by function name. Each entry has an 'inputs' and an
        'outputs' mapping from argument/result name to a type tag string
        ('variable', 'text_input', 'file_browse' or 'graph'). The consumer of
        these tags is not visible in this notebook.
    '''
    # One description per function, built separately and assembled at the end.
    # Key order is kept identical to the argument order of each function.
    model_creation = {
        'inputs': {
            "df": "variable",
            "output_column": "text_input",
            "columns_not_required": "text_input",
        },
        'outputs': {
            "reg": "variable",
            'scaler_used': 'variable',
            "mean_squared_error": "variable",
            "regression_cols": "variable",
        },
    }
    prediction = {
        'inputs': {
            "model": "variable",
            'scaler_used': 'variable',
            'df': "variable",
            "regression_cols": 'variable',
        },
        'outputs': {'df': 'variable'},
    }
    single_step = {
        'inputs': {
            'training_csv': 'file_browse',
            'predict_csv': 'file_browse',
            'output_column': "text_input",
            'columns_not_required': "text_input",
        },
        'outputs': {'df': 'variable', "fig": "graph"},
    }
    return {
        'linear_reg_model_creation': model_creation,
        'prediction_using_model': prediction,
        'single_step_create_predict': single_step,
    }

# The functions

## FUNCTION - linear_reg_model_creation

In [16]:
def linear_reg_model_creation(df=None, output_column=None, columns_not_required=None):
    '''Train a linear regression model relating the input columns of df to a target column.

    The caller's DataFrame is never modified: renaming and dropping happen on
    a shallow copy.

    Args:
        df (pd.DataFrame): training data. Spaces in column headers are
            replaced by underscores before any column lookup.
        output_column (str): name of the target column, in its underscore
            (space-free) form.
        columns_not_required (str): comma-separated column names to discard.
            All whitespace in the string is removed before splitting, and
            empty entries (empty string, trailing comma) are ignored.

    Returns:
        dict: {"reg": fitted LinearRegression, "scaler_used": fitted
        MinMaxScaler, "mean_squared_error": MSE on the 30% hold-out split,
        "regression_cols": list of feature column names}. Every value is None
        when an argument has the wrong type, a named column does not exist,
        or training fails (a warning is emitted in the last case).
    '''
    no_model = {"reg": None, "scaler_used": None,
                "mean_squared_error": None, "regression_cols": None}
    # ensure the input arguments are the correct type
    if not (isinstance(df, pd.DataFrame) and isinstance(output_column, str)
            and isinstance(columns_not_required, str)):
        return no_model
    try:
        # Shallow copy: gives this function its own column index so the rename
        # below does not leak back into the caller's DataFrame.
        df = df.copy(deep=False)
        # make sure there are no spaces in the column header names
        df.columns = [name.replace(" ", "_") for name in df.columns]
        # convert text list to actual list after removing whitespace; drop
        # empty tokens so "" or a trailing comma means "nothing to discard"
        discard = [token for token in "".join(columns_not_required.split()).split(',')
                   if token]
        # ensure that column names provided exist in DataFrame
        if output_column not in df.columns or any(col not in df.columns for col in discard):
            return no_model
        df = df.drop(discard, axis=1)
        # Series holding the target column
        Y = df[output_column]
        # DataFrame with only the predictor columns
        X = df.drop([output_column], axis=1)
        # normalise every predictor to the [0, 1] range
        # NOTE(review): the scaler is fitted on all rows, before the
        # train/test split, so the test rows influence the scaling.
        min_max_scaler = preprocessing.MinMaxScaler()
        X_scaled = pd.DataFrame(
            min_max_scaler.fit_transform(X), columns=X.columns)
        # randomly split the rows: 70% for training, 30% for testing;
        # the fixed random_state makes the split reproducible
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_scaled, Y, test_size=0.3, random_state=42)
        reg = LinearRegression()
        reg.fit(X_train, Y_train)
        # assess the performance of the model on the held-out rows
        error = mean_squared_error(Y_test, reg.predict(X_test))
        return {"reg": reg, "scaler_used": min_max_scaler,
                "mean_squared_error": error, "regression_cols": list(X_test.columns)}
    except Exception as exc:
        # Keep the "return None values on failure" contract, but surface the cause.
        warnings.warn("linear_reg_model_creation failed: %r" % (exc,), stacklevel=2)
    return no_model

## FUNCTION - prediction_using_model

In [17]:
def prediction_using_model(model=None, scaler_used=None, df=None, regression_cols=None):
    '''Use a fitted sklearn linear regression model to predict outputs for a data set.

    Args:
        model (LinearRegression): fitted model.
        scaler_used (preprocessing.MinMaxScaler): scaler fitted on the
            training features; applied to df before predicting.
        df (pd.DataFrame): data to predict on. It is not modified.
        regression_cols (list | str): feature column names, in the order the
            model expects. A comma-separated string is also accepted; all
            whitespace in it is removed before splitting.

    Returns:
        dict: {"df": DataFrame} holding a copy of the feature columns plus a
        "Predicted_%_Silica_Concentrate" column, or {"df": None} when an
        argument has the wrong type or prediction fails (a warning is emitted
        in the latter case).
    '''
    if isinstance(regression_cols, str):  # if cols in string list form convert to python list
        regression_cols = "".join(regression_cols.split()).split(',')
    # ensure input args are the correct types
    args_ok = (isinstance(model, LinearRegression)
               and isinstance(scaler_used, preprocessing.MinMaxScaler)
               and isinstance(df, pd.DataFrame)
               and isinstance(regression_cols, list))
    if not args_ok:
        return {"df": None}
    try:
        # Copy so the prediction column is added to our own frame, not to a
        # slice of the caller's DataFrame.
        X = df[regression_cols].copy()
        X_scaled = pd.DataFrame(scaler_used.transform(X),
                                columns=X.columns, index=X.index)  # scale the DataFrame
        predictions = model.predict(X_scaled)  # make the predictions
        # Assign positionally. Wrapping the predictions in a default-indexed
        # Series would align on labels and give NaN or shifted values whenever
        # df's index is not 0..n-1.
        X["Predicted_%_Silica_Concentrate"] = predictions
        return {"df": X}
    except Exception as exc:
        # Keep the "return None on failure" contract, but surface the cause.
        warnings.warn("prediction_using_model failed: %r" % (exc,), stacklevel=2)
    return {"df": None}

## FUNCTION - single_step_create_predict

In [2]:
def single_step_create_predict(training_csv=None, predict_csv=None, output_column=None, columns_not_required=None):
    '''Single step model creation and prediction.

    Reads a training CSV and a prediction CSV, rescales a fixed set of columns
    in the prediction data, trains a model with linear_reg_model_creation,
    predicts with prediction_using_model and plots the predictions with
    df_to_hist.

    Args:
        training_csv: path (or buffer) of the training CSV, passed to pd.read_csv.
        predict_csv: path (or buffer) of the CSV to predict on.
        output_column (str): target column name, in underscore (space-free) form.
        columns_not_required (str): comma-separated names of training columns
            to discard; whitespace is removed and empty entries are ignored.

    Returns:
        dict: {'df': DataFrame with the predictions, 'fig': histogram figure},
        or {'df': None, 'fig': None} when the files cannot be read, an
        argument has the wrong type, a named column is missing, or the
        model/prediction step fails.
    '''
    df = predict_df = None
    try:
        df = pd.read_csv(training_csv)
        predict_df = pd.read_csv(predict_csv)
    except Exception as exc:
        warnings.warn("single_step_create_predict could not read the CSV inputs: %r" % (exc,),
                      stacklevel=2)

    if isinstance(predict_df, pd.DataFrame):
        # list of tuples: (columns, factor each of those columns is multiplied by)
        # NOTE(review): these look like unit-conversion factors aligning the
        # prediction data with the training data -- TODO confirm the units.
        conversions = [
            (['Starch_Flow', 'Amina_Flow', 'Ore_Pulp_Flow'], 101.941),
            (['Ore_Pulp_Density'], 0.0000160185),
            (['Flotation_Column_01_Level', 'Flotation_Column_02_Level'], 25.4),
        ]
        # Convert each column independently, so one absent column cannot
        # silently leave the remaining columns unconverted.
        for columns, factor in conversions:
            for col in columns:
                if col not in predict_df.columns:
                    continue
                try:
                    predict_df[col] = pd.to_numeric(predict_df[col], errors='coerce') * factor
                except Exception as exc:
                    warnings.warn("single_step_create_predict could not convert column %r: %r"
                                  % (col, exc), stacklevel=2)

    # check the input arguments types are correct
    if not (isinstance(df, pd.DataFrame) and isinstance(predict_df, pd.DataFrame)
            and isinstance(output_column, str) and isinstance(columns_not_required, str)):
        return {'df': None, "fig": None}

    # convert text list to actual list after removing whitespace (used for this check only)
    cols_list = [token for token in "".join(columns_not_required.split()).split(',') if token]
    # linear_reg_model_creation replaces spaces in headers with underscores
    # before its own lookup, so validate against the same normalised names.
    known_cols = {str(col).replace(" ", "_") for col in df.columns}
    if output_column not in known_cols or any(cc not in known_cols for cc in cols_list):
        return {'df': None, "fig": None}

    # create model, predict impurities, and plot results using other functions
    dict_a = linear_reg_model_creation(
        df=df, output_column=output_column, columns_not_required=columns_not_required)
    dict_b = prediction_using_model(
        model=dict_a["reg"], scaler_used=dict_a["scaler_used"], df=predict_df,
        regression_cols=dict_a["regression_cols"])
    if dict_b.get('df') is None:
        # model creation or prediction failed: there is nothing to plot
        return {'df': None, "fig": None}
    dict_c = df_to_hist(
        df=dict_b['df'], data='Predicted_%_Silica_Concentrate', group_by='None', bins=5)
    dict_b['fig'] = dict_c['fig']
    return dict_b

In [19]:
#single_step_create_predict(training_csv="mining_training.csv",predict_csv="Day_Input.csv", output_column='%_Silica_Concentrate', columns_not_required='date,Ore_Pulp_pH,%_Iron_Concentrate')