Simple model flow

In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
maintanence_df = pd.read_csv('../data/Milling-machine-ai4i2020.csv')

Configs.

In [4]:
# Column groups used by every processing / training step in this notebook.
NUMERICAL = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]
ORDINAL = ['Type']
FEATURES = NUMERICAL + ORDINAL
TARGET = [
    'Machine failure',
    'TWF',
    'HDF',
    'PWF',
    'OSF',
    'RNF',
]

# Where the fitted model, scaler and label encoder are persisted with joblib.
MODEL_PATH = "../models/rfmodel.joblib"
SCALER_PATH = "../models/scaler.joblib"
LENCODER_PATH = "../models/lencoder.joblib"

Processing

In [5]:
def selected_split(
    data: pd.DataFrame,
    test_size: float = 0.2,
    random_state: int = 42,
) -> tuple:
    """Split ``data`` into train and test feature/target frames.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing at least the FEATURES and TARGET columns.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``; the default is the value that was
        previously hard-coded.
    random_state : int, default 42
        Seed forwarded to ``train_test_split``; the default is the value that
        was previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = data[FEATURES]
    y = data[TARGET]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

In [6]:
def scale(data: pd.DataFrame, to_train: bool) -> np.ndarray:
    """Standardise the NUMERICAL columns of ``data``.

    With ``to_train`` true, a new ``StandardScaler`` is fitted on ``data``,
    written to SCALER_PATH and then applied. Otherwise the scaler stored at
    SCALER_PATH is loaded and applied without refitting.

    Returns the transformed values produced by the scaler's ``transform``.
    """
    if not to_train:
        stored_scaler = joblib.load(SCALER_PATH)
        return stored_scaler.transform(data[NUMERICAL])

    fresh_scaler = StandardScaler().fit(data[NUMERICAL])
    # Persist before transforming so later inference calls can reload it.
    joblib.dump(fresh_scaler, SCALER_PATH)
    return fresh_scaler.transform(data[NUMERICAL])

In [7]:
# Feature / target views of the full dataset.
X = maintanence_df[FEATURES]
y = maintanence_df[TARGET]

# Same seeded 80/20 split as before, obtained through the selected_split helper
# so the split settings live in one place.
X_train, X_test, y_train, y_test = selected_split(maintanence_df)

In [8]:
# Notebook-level transformer instances; both are fitted in later cells.
scaler = StandardScaler()
l_encoder = LabelEncoder()

In [9]:
# Fit the scaler on the training split's numeric columns only, persist it to
# SCALER_PATH, then standardise those same columns.
fitted_scaler = scaler.fit(X_train[NUMERICAL])
joblib.dump(fitted_scaler, SCALER_PATH)
scaled_set = fitted_scaler.transform(X_train[NUMERICAL])

In [10]:
def to_oned(data: pd.DataFrame) -> np.ndarray:
    """Return the values of ``data`` flattened into a one-dimensional array.

    The input is converted with ``np.array`` and flattened with ``ravel``, so
    a frame with several columns is flattened row by row.
    """
    return np.array(data).ravel()

In [11]:
# Flatten the single-column 'Type' frame into a 1-D array of labels.
var = np.array(X_train[ORDINAL])
newvar = var.ravel()
newvar

array(['L', 'L', 'L', ..., 'H', 'H', 'L'], dtype=object)

In [12]:
# Same flattening, this time through the to_oned helper.
to_oned(X_train[ORDINAL])

array(['L', 'L', 'L', ..., 'H', 'H', 'L'], dtype=object)

In [13]:
def lencode(data: pd.DataFrame, to_train: bool) -> np.ndarray:
    """Label-encode the ORDINAL column of ``data``.

    With ``to_train`` true, a new ``LabelEncoder`` is fitted on ``data``,
    written to LENCODER_PATH and then applied. Otherwise the encoder stored at
    LENCODER_PATH is loaded and applied without refitting.

    Returns a 1-D array of integer codes, one per row of ``data``.

    NOTE(review): LabelEncoder assigns codes by sorted class order, so the
    codes do not necessarily follow any intended ordinal ranking of 'Type' —
    confirm this is acceptable for the model.
    """
    # LabelEncoder expects a 1-D array. Passing the single-column frame
    # directly makes sklearn emit its column-vector conversion warning.
    # Assumes ORDINAL names exactly one column; several columns would be
    # interleaved by ravel().
    labels = data[ORDINAL].to_numpy().ravel()
    if to_train:
        fitted_lencoder = LabelEncoder().fit(labels)
        joblib.dump(fitted_lencoder, LENCODER_PATH)
    else:
        fitted_lencoder = joblib.load(LENCODER_PATH)
    return fitted_lencoder.transform(labels)

In [14]:
# Fit the label encoder on the flattened training labels, persist it to
# LENCODER_PATH, then encode those same labels.
fitted_lcoder = l_encoder.fit(newvar)
joblib.dump(fitted_lcoder,LENCODER_PATH)
labelencoded_set = fitted_lcoder.transform(newvar)

In [15]:
# Wrap the scaled array in a frame carrying the numeric column names.
# No index is passed, so the frame gets a fresh default index.
scaled_df = pd.DataFrame(scaled_set, columns=NUMERICAL)
scaled_df.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,-0.854066,-0.609589,0.427634,-0.892696,1.375035
1,-0.904014,-1.080528,-0.834945,1.382187,0.45762
2,-0.904014,-1.48419,-0.059677,-0.892696,1.359218
3,0.444571,0.534121,0.333495,-0.702288,-1.598655
4,0.694309,0.33229,0.178441,-0.612094,1.580663


In [18]:
# Wrap the encoded labels in a one-column frame named after ORDINAL.
# No index is passed, so the frame gets a fresh default index.
labeled_df = pd.DataFrame(labelencoded_set, columns=ORDINAL)
labeled_df.head()

Unnamed: 0,Type
0,1
1,1
2,1
3,2
4,1


In [19]:
def preprocessor(data: pd.DataFrame, to_train: bool) -> pd.DataFrame:
    """Build the model input frame from raw feature columns.

    The NUMERICAL columns go through ``scale`` and the ORDINAL column through
    ``lencode``; ``to_train`` is forwarded to both. The two results are wrapped
    in frames (no index is passed, so each gets a default index) and joined
    column-wise, numeric columns first.
    """
    scaled_values = scale(data, to_train)
    encoded_values = lencode(data, to_train)
    return pd.concat(
        [
            pd.DataFrame(scaled_values, columns=NUMERICAL),
            pd.DataFrame(encoded_values, columns=ORDINAL),
        ],
        axis=1,
    )

In [20]:
# Column-wise join of the scaled numeric frame and the encoded 'Type' frame.
processed_df = pd.concat([scaled_df, labeled_df], axis=1)
processed_df.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type
0,-0.854066,-0.609589,0.427634,-0.892696,1.375035,1
1,-0.904014,-1.080528,-0.834945,1.382187,0.45762,1
2,-0.904014,-1.48419,-0.059677,-0.892696,1.359218,1
3,0.444571,0.534121,0.333495,-0.702288,-1.598655,2
4,0.694309,0.33229,0.178441,-0.612094,1.580663,1


In [21]:
# The single-output forest fitted on y_train_v needs exactly one label per
# training row. TARGET lists several columns, so flattening all of them would
# produce len(TARGET) labels per row and the fit would fail on inconsistent
# sample counts. Only the first target ('Machine failure') is used here.
# NOTE(review): confirm this is the intended label for the single-output model.
y_valuses = y_train[TARGET[0]].values
y_train_v = y_valuses.ravel()
y_train_v

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
# Check the type returned by the to_oned helper when applied to the target frame.
print(type(to_oned(y_train[TARGET])))


<class 'numpy.ndarray'>


In [23]:
def predict(X: pd.DataFrame) -> tuple:
    """Load the model stored at MODEL_PATH and score ``X``.

    Parameters
    ----------
    X : pd.DataFrame
        Already-preprocessed feature frame.

    Returns
    -------
    tuple
        ``(predictions, proba)``: the results of the model's ``predict`` and
        ``predict_proba`` calls on ``X``, in that order.
    """
    model = joblib.load(MODEL_PATH)
    # Class labels are returned untouched. Rounding them to 3 decimals changes
    # nothing for integer labels and raises for non-numeric ones.
    predictions = model.predict(X)
    proba = model.predict_proba(X)
    return predictions, proba

In [24]:
def compute_accuracy(y_test: pd.DataFrame, y_pred:pd.DataFrame)-> dict:
    """Compute accuracy, precision, recall and ROC AUC for predictions.

    Parameters
    ----------
    y_test : array-like
        True labels, either 1-D (single binary target) or 2-D with one column
        per target.
    y_pred : array-like
        Predicted labels with the same layout as ``y_test``.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and ``'Auc'``.

    Notes
    -----
    For a single target the metrics are computed exactly as before
    (``average='binary'``). For several target columns, precision and recall
    are macro-averaged, because sklearn rejects ``average='binary'`` for
    multilabel input. Accuracy is then sklearn's subset accuracy (every target
    of a row must match).
    """
    true_arr = np.asarray(y_test)
    multi_target = true_arr.ndim == 2 and true_arr.shape[1] > 1
    average = 'macro' if multi_target else 'binary'

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=average)
    recall = recall_score(y_test, y_pred, average=average)
    auc = roc_auc_score(y_test, y_pred)
    scores = {'Accuracy':accuracy, 'Precision':precision, 'Recall':recall, 'Auc':auc}
    return scores

Train

In [25]:
def model_fit(X_train: pd.DataFrame, y_train: np.ndarray) -> None:
    """Fit a multi-output random forest and persist it to MODEL_PATH.

    A ``RandomForestClassifier`` seeded with ``random_state=42`` is wrapped in
    a ``MultiOutputClassifier``, fitted on ``X_train`` / ``y_train`` and dumped
    with joblib. Nothing is returned.
    """
    base_forest = RandomForestClassifier(random_state=42)
    multi_output = MultiOutputClassifier(base_forest)
    joblib.dump(multi_output.fit(X_train, y_train), MODEL_PATH)

In [23]:
# Fit a single-output random forest on the processed training frame and
# persist it.
# NOTE(review): this writes to MODEL_PATH, the same file model_fit() writes its
# multi-output model to, so whichever runs last decides what later loads see.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(processed_df, y_train_v)
joblib.dump(rf_model,MODEL_PATH)

['../models/rfmodel.joblib']

In [26]:
def model_train(data: pd.DataFrame) -> tuple:
    """Split ``data``, fit the preprocessing and the model on the train part.

    The training features are run through ``preprocessor`` in training mode
    and passed to ``model_fit``. The training targets are passed as their
    underlying array (``.values``), not flattened.

    Returns
    -------
    tuple
        ``(X_test, y_test)``: the untouched held-out features and targets.
    """
    train_features, held_out_features, train_targets, held_out_targets = selected_split(data)
    model_fit(preprocessor(train_features, True), train_targets.values)
    return held_out_features, held_out_targets

Evaluation

In [27]:
def model_eval(X_test: pd.DataFrame, y_test: pd.DataFrame) -> dict:
    """Score the persisted model on held-out data.

    Parameters
    ----------
    X_test : pd.DataFrame
        Raw (unprocessed) held-out features; they are run through
        ``preprocessor`` in inference mode.
    y_test : pd.DataFrame
        Held-out targets; their underlying array (``.values``) is compared
        against the predictions.

    Returns
    -------
    dict
        The metrics dictionary produced by ``compute_accuracy``.
    """
    X_test_processed = preprocessor(X_test, False)
    # predict() returns (labels, probabilities); only the labels are scored.
    predictions_test, _ = predict(X_test_processed)
    scores = compute_accuracy(y_test.values, predictions_test)
    return scores

In [28]:
def build_model(data: pd.DataFrame) -> dict:
    """Train on ``data`` and return the held-out evaluation metrics.

    Runs ``model_train`` and forwards the held-out features and targets it
    returns to ``model_eval``.
    """
    held_out = model_train(data)
    return model_eval(*held_out)

In [29]:
# Peek at the held-out features before any scaling or encoding.
X_test.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type
6252,300.8,310.3,1538,36.1,198,L
4684,303.6,311.8,1421,44.8,101,M
1731,298.3,307.9,1485,42.0,117,M
4742,303.3,311.3,1592,33.7,14,L
4521,302.4,310.4,1865,23.9,129,L


In [30]:
# Reload the persisted model, scaler and label encoder from disk.
loaded_model = joblib.load(MODEL_PATH)
loaded_scaler = joblib.load(SCALER_PATH)
loaded_lencoder = joblib.load(LENCODER_PATH)

In [31]:
# Apply the reloaded scaler to the held-out numeric columns (transform only,
# no refit) and wrap the result in a frame with a default index.
num_scaled = loaded_scaler.transform(X_test[NUMERICAL])
X_scaled_df = pd.DataFrame(num_scaled,columns=NUMERICAL)

In [32]:
# Flatten the held-out 'Type' column into a 1-D array of labels.
X_ordvalues = np.array(X_test[ORDINAL])
X_test_v = X_ordvalues.ravel()
X_test_v

array(['L', 'M', 'M', ..., 'L', 'L', 'M'], dtype=object)

In [30]:
# Same flattening, this time through the to_oned helper.
to_oned(X_test[ORDINAL])

array(['L', 'M', 'M', ..., 'L', 'L', 'M'], dtype=object)

In [33]:
# Encode the already-flattened held-out labels. The encoder expects a 1-D
# array, and passing the single-column frame directly makes sklearn emit its
# column-vector conversion warning.
ord_labelled = loaded_lencoder.transform(X_test_v)
X_ord_df = pd.DataFrame(ord_labelled,columns=ORDINAL)

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [34]:
# Column-wise join of the scaled numeric frame and the encoded 'Type' frame
# for the held-out split.
X_processed = pd.concat([X_scaled_df, X_ord_df], axis=1)
X_processed.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type
0,0.394623,0.197736,-0.004301,-0.391621,1.422488,1
1,1.793155,1.206891,-0.652203,0.480251,-0.11181,2
2,-0.854066,-1.416913,-0.297795,0.199649,0.14127,2
3,1.643313,0.870506,0.294731,-0.632137,-1.487933,1
4,1.193784,0.265013,1.806503,-1.614245,0.33108,1


In [35]:
# Predict labels for the processed held-out features with the reloaded model.
y_pred = loaded_model.predict(X_processed)

In [36]:
# Check the type returned by the model's predict call.
print(type(y_pred))

<class 'numpy.ndarray'>


In [37]:
# Use the first target only ('Machine failure') so there is one true label per
# held-out row. Flattening every TARGET column would give len(TARGET) labels
# per row, which cannot be compared against one prediction per row.
# NOTE(review): confirm this is the intended label for the single-output model.
y_test_val = y_test[TARGET[0]].values
y_test_v = y_test_val.ravel()
y_test_v

array([0, 0, 0, ..., 0, 0, 0])

In [36]:
# Score the held-out predictions against the flattened true labels.
evaluation_scores = compute_accuracy(y_test_v,y_pred)
evaluation_scores

{'Accuracy': 0.985,
 'Precision': 0.8604651162790697,
 'Recall': 0.6065573770491803,
 'Auc': 0.8017314992517691}