# Machine Learning Pipeline - Feature Engineering

# Paso 1: Reproducibility: Setting the seed

In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# for the yeo-johnson transformation
import scipy.stats as stats

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# to save the trained scaler class
import joblib

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

from pycaret.classification import *

In [2]:
# load dataset
data = pd.read_csv('./Drug.csv')

# rows and columns of the data
print(data.shape)

# visualise the dataset
data.head()

(200, 7)


Unnamed: 0,Age,Sex,BP,Cholesterol,Na,K,Drug
0,23,F,HIGH,HIGH,0.792535,0.031258,drugY
1,47,M,LOW,HIGH,0.739309,0.056468,drugC
2,47,M,LOW,HIGH,0.697269,0.068944,drugC
3,28,F,NORMAL,HIGH,0.563682,0.072289,drugX
4,61,F,LOW,HIGH,0.559294,0.030998,drugY


# Paso 2: Separamos el Dataset en entrenamiento y prueba

In [49]:
# Let's separate into train and test set
# Remember to set the seed (random_state for this sklearn function)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Drug', axis=1), # predictive variables
    data['Drug'], # target
    test_size=0.1, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
shuffle=False)

X_train.shape, X_test.shape
print(X_train)
y_train= pd.DataFrame(y_train)
y_train.columns =['Drug']
y_test= pd.DataFrame(y_test)
y_test.columns =['Drug']
print(y_test)

     Age Sex      BP Cholesterol        Na         K
0     23   F    HIGH        HIGH  0.792535  0.031258
1     47   M     LOW        HIGH  0.739309  0.056468
2     47   M     LOW        HIGH  0.697269  0.068944
3     28   F  NORMAL        HIGH  0.563682  0.072289
4     61   F     LOW        HIGH  0.559294  0.030998
..   ...  ..     ...         ...       ...       ...
175   73   F    HIGH        HIGH  0.808019  0.044038
176   48   M    HIGH      NORMAL  0.769197  0.073633
177   25   M  NORMAL        HIGH  0.775702  0.040803
178   39   M  NORMAL        HIGH  0.609566  0.038171
179   67   F  NORMAL        HIGH  0.785251  0.049416

[180 rows x 6 columns]
      Drug
180  drugY
181  drugX
182  drugX
183  drugY
184  drugY
185  drugY
186  drugB
187  drugA
188  drugY
189  drugY
190  drugY
191  drugA
192  drugY
193  drugC
194  drugY
195  drugC
196  drugC
197  drugX
198  drugX
199  drugX


# Paso 3: Feature Engineering (Ingeniería de Características)

En las siguientes celdas, procesaremos las variables del dataset de Precios de Casas para abordar:

1. Missing values (NaN - Valores Faltantes)
2. Temporal variables (Variables Temporales)
3. Non-Gaussian distributed variables (Variables con distribución no gaussiana)
4. Categorical variables: remove rare labels (Variables categóricas: eliminar etiquetas raras)
5. Categorical variables: convert strings to numbers (Variables categóricas: convertir cadenas a números)
5. Put the variables in a similar scale (Poner las variables en una escala similar)

## Numerical variable transformation - no aplica

### Logarithmic transformation


## Variables categóricas

### Aplicamos mapeos

In [50]:
# re-map strings to numbers

disc_mappings1 = {'LOW': 0, 'NORMAL': 1, 'HIGH': 2}

disc_vars1 = ['BP']

for var in disc_vars1:
    X_train[var] = X_train[var].map(disc_mappings1)
    X_test[var] = X_test[var].map(disc_mappings1)

disc_mappings2 = {'NORMAL': 0, 'HIGH': 1}

disc_vars2 = ['Cholesterol']

for var in disc_vars2:
    X_train[var] = X_train[var].map(disc_mappings2)
    X_test[var] = X_test[var].map(disc_mappings2)

disc_mappings3 = {'M':0, 'F':1}

disc_vars3 = ['Sex']

for var in disc_vars3:
    X_train[var] = X_train[var].map(disc_mappings3)
    X_test[var] = X_test[var].map(disc_mappings3)

    #Encoding de las variables categoricas a predecir.

disc_mappings4 = {'drugX':0, 'drugY':1, 'drugA':2, 'drugB':3,'drugC':4}
disc_vars4 = ['Drug']

for var in disc_vars4:
    y_train[var] = y_train[var].map(disc_mappings4)
    y_test[var] = y_test[var].map(disc_mappings4)


In [74]:
# check absence of na in the train set
[var for var in X_train.columns if X_train[var].isnull().sum() > 0]

[]

#### Verificamos ¿Hay valores nulos (na) en el conjunto de Entrenamiento? - NO

In [75]:
[var for var in X_train.columns if X_train[var].isnull().sum() > 0]

[]

#### Verificamos ¿Hay valores nulos (na) en el conjunto de Prueba? - NO

In [76]:
[var for var in X_test.columns if X_test[var].isnull().sum() > 0]

[]

## Feature Scaling

Para su uso en modelos lineales, las características deben ser escaladas. Escalaremos las características a los valores mínimos y máximos:

In [6]:
if isinstance(y_train, pd.DataFrame):
    print("X_train es un DataFrame de pandas")
else:
    print("X_train no es un DataFrame de pandas")

X_train es un DataFrame de pandas


In [52]:
print(len(X_train))
print(X_train.shape)

print(len(y_train))
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

180
(180, 6)
180
(180, 1)
(20, 6)
(20, 1)


In [6]:
# Creamos el Escalador (Scaler)
scaler = MinMaxScaler()

# Ajustamos el Scaler para el Conjunto de Entrenamiento
scaler.fit(X_train)

# Transformamos el conjunto de entrenamiento y el conjunto de prueba

# sklearn devuelve arrays de numpy, así que envolvemos el
# array con un dataframe de pandas

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_train.columns
)

In [54]:
# X_train.head()
#print(X_train.shape)

# X_train
# y_train
print([X_train, y_train])

dataTrain= pd.concat([X_train, y_train], axis=1)
dataTrain
print(y_train.shape)
dataTest= pd.concat([X_test, y_test], axis=1)
print(y_test)
print("dataTest")
print(dataTest)

[     Age  Sex  BP  Cholesterol        Na         K
0     23    1   2            1  0.792535  0.031258
1     47    0   0            1  0.739309  0.056468
2     47    0   0            1  0.697269  0.068944
3     28    1   1            1  0.563682  0.072289
4     61    1   0            1  0.559294  0.030998
..   ...  ...  ..          ...       ...       ...
175   73    1   2            1  0.808019  0.044038
176   48    0   2            0  0.769197  0.073633
177   25    0   1            1  0.775702  0.040803
178   39    0   1            1  0.609566  0.038171
179   67    1   1            1  0.785251  0.049416

[180 rows x 6 columns],      Drug
0       1
1       4
2       4
3       0
4       1
..    ...
175     1
176     2
177     1
178     1
179     1

[180 rows x 1 columns]]
(180, 1)
     Drug
180     1
181     0
182     0
183     1
184     1
185     1
186     3
187     2
188     1
189     1
190     1
191     2
192     1
193     4
194     1
195     4
196     4
197     0
198     0
199     

In [55]:
# Guardemos ahora los conjuntos de entrenamiento y prueba para el próximo Notebook!

X_train.to_csv('./OutputFeaturEngDrugs/xtrain.csv', index=False)
X_test.to_csv('./OutputFeaturEngDrugs/xtest.csv', index=False)

y_train.to_csv('./OutputFeaturEngDrugs/ytrain.csv', index=False)
y_test.to_csv('./OutputFeaturEngDrugs/ytest.csv', index=False)

In [90]:
# leemos ahora los conjuntos de entrenamiento y prueba para el próximo Notebook!

X_train= pd.read_csv('./OutputFeaturEngDrugs/xtrain.csv')
X_test= pd.read_csv('./OutputFeaturEngDrugs/xtest.csv')

y_train = pd.read_csv('./OutputFeaturEngDrugs/ytrain.csv')
y_test = pd.read_csv('./OutputFeaturEngDrugs/ytest.csv')

In [56]:
# Ahora guardamos el Scaler

joblib.dump(scaler, './OutputFeaturEngDrugs/minmax_scaler.joblib')

['./OutputFeaturEngDrugs/minmax_scaler.joblib']

In [73]:
from pycaret.classification import setup

# Setting up the PyCaret environment with the training data and generating a profile report
clf1 = setup(data = dataTrain, target = 'Drug')

#Comparing all models 
best = compare_models()

Unnamed: 0,Description,Value
0,Session id,2798
1,Target,Drug
2,Target type,Multiclass
3,Original data shape,"(180, 7)"
4,Transformed data shape,"(180, 7)"
5,Transformed train set shape,"(125, 7)"
6,Transformed test set shape,"(55, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9455,0.8937,0.9455,0.9547,0.9434,0.9213,0.9261,0.771
rf,Random Forest Classifier,0.9359,0.8915,0.9359,0.9096,0.918,0.9045,0.9117,0.024
gbc,Gradient Boosting Classifier,0.9135,0.0,0.9135,0.9159,0.9068,0.8765,0.885,0.045
dt,Decision Tree Classifier,0.8974,0.8313,0.8974,0.8885,0.8864,0.8504,0.8579,0.004
et,Extra Trees Classifier,0.8885,0.8856,0.8885,0.8593,0.8649,0.8344,0.8449,0.019
lda,Linear Discriminant Analysis,0.8558,0.0,0.8558,0.8439,0.8313,0.7878,0.8037,0.004
ada,Ada Boost Classifier,0.6827,0.0,0.6827,0.755,0.6774,0.5581,0.5842,0.011
qda,Quadratic Discriminant Analysis,0.6801,0.0,0.6801,0.7521,0.6784,0.5727,0.6025,0.004
nb,Naive Bayes,0.6788,0.7923,0.6788,0.7995,0.6819,0.5878,0.6267,0.004
ridge,Ridge Classifier,0.6333,0.0,0.6333,0.5617,0.568,0.4039,0.4537,0.007


In [74]:
evaluate_model(best)
#Picking the winner 
#best_model = automl(optimize = 'Accuracy')







interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8462,0.9499,0.8462,0.7846,0.812,0.7658,0.7732
1,0.9231,1.0,0.9231,0.8718,0.8923,0.8889,0.8967
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.8462,0.9716,0.8462,0.8462,0.8462,0.7797,0.7797
4,0.8462,0.9923,0.8462,0.7802,0.8107,0.7699,0.7779
5,0.9167,0.0,0.9167,0.9286,0.9115,0.8681,0.8786
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,0.9167,1.0,0.9167,0.875,0.8889,0.88,0.8889
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [77]:
#Fine-tuning the best model
#tuned_best_model = tune_model(best,n_iter=10, choose_better=True)

tuned_best_model


In [75]:

predictions = predict_model(tuned_best_model, data = dataTest)

predictions



Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.95,0.9955,0.95,0.9667,0.9535,0.9303,0.9338


Unnamed: 0,Age,Sex,BP,Cholesterol,Na,K,Drug,prediction_label,prediction_score
180,22,1,2,0,0.817625,0.035832,1,1,0.9998
181,59,1,1,1,0.882486,0.063563,0,0,0.9649
182,20,1,0,0,0.811023,0.069402,0,0,0.9942
183,36,1,2,0,0.575058,0.037124,1,2,0.8154
184,18,1,2,1,0.88515,0.023802,1,1,1.0
185,57,1,1,0,0.551967,0.021317,1,1,0.9942
186,70,0,2,1,0.589493,0.059854,3,3,0.9626
187,47,0,2,1,0.56332,0.054152,2,2,0.9945
188,65,0,2,0,0.8645,0.024702,1,1,0.9988
189,64,0,2,0,0.739914,0.035349,1,1,0.9653


In [76]:
from sklearn.metrics import accuracy_score
accuracy_score(predictions['Drug'], predictions['prediction_label'])

0.95

In [78]:
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

print("Loading data...")
# load or create your dataset
#regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
#df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
#df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")

#y_train = df_train[0]
#y_test = df_test[0]
#X_train = df_train.drop(0, axis=1)
#X_test = df_test.drop(0, axis=1)

print("Starting training...")
# train
gbm = lgb.LGBMRegressor(num_leaves=31, learning_rate=0.1, n_estimators=100)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric="l1", callbacks=[lgb.early_stopping(5)])


Loading data...
Starting training...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6
[LightGBM] [Info] Start training from score 1.227778
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[37]	valid_0's l1: 0.482751	valid_0's l2: 0.458553


In [79]:

print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)


Starting predicting...


In [80]:


# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f"The RMSE of prediction is: {rmse_test}")

# feature importances
print(f"Feature importances: {list(gbm.feature_importances_)}")



The RMSE of prediction is: 0.6771655395821039
Feature importances: [34, 6, 49, 34, 35, 51]


In [81]:

# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return "RMSLE", np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False


print("Starting training with custom eval function...")
# train
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=rmsle, callbacks=[lgb.early_stopping(5)])



Starting training with custom eval function...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6
[LightGBM] [Info] Start training from score 1.227778
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[37]	valid_0's l2: 0.458553	valid_0's RMSLE: 0.24734


In [82]:

# another self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Relative Absolute Error (RAE)
def rae(y_true, y_pred):
    return "RAE", np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False


print("Starting training with multiple custom eval functions...")
# train
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=[rmsle, rae], callbacks=[lgb.early_stopping(5)])


Starting training with multiple custom eval functions...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170
[LightGBM] [Info] Number of data points in the train set: 180, number of used features: 6
[LightGBM] [Info] Start training from score 1.227778
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[37]	valid_0's l2: 0.458553	valid_0's RMSLE: 0.24734	valid_0's RAE: 0.455426


In [103]:

print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
y_test2= y_test['Drug'].values
print(y_test2)
print(y_pred)


Starting predicting...
[1 0 0 1 1 1 3 2 1 1 1 2 1 4 1 4 4 0 0 0]
[ 1.09361201  0.39564262  0.54508636  1.17395992  0.94341987  0.65485806
  2.84116545  1.56238218  1.18138262  1.29126757  1.34544712  2.4190049
  1.94090921  2.5453325   1.05073327  2.16404419  2.91337996  0.24453127
 -0.04147607  0.55655582]


In [105]:

# eval
rmsle_test = rmsle(y_test2, y_pred)[1]
#rmsle_test = rmsle(1, 1.1)[1]
#rae_test = rae(y_test, y_pred)[1]
print(f"The RMSLE of prediction is: {rmsle_test}")
#print(f"The RAE of prediction is: {rae_test}")

The RMSLE of prediction is: 0.24733956365219203


In [None]:
# to build the model
from sklearn.linear_model import Lasso

# to evaluate the model
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

# set up the model
# remember to set the random_state / seed

#lin_model = Lasso(alpha=0.001, random_state=0)

# train the model

#lin_model.fit(X_train, y_train)




# make predictions for train set
pred = lin_model.predict(X_train)

# determine mse, rmse and r2
print('train mse: {}'.format(int(
    mean_squared_error(np.exp(y_train), np.exp(pred)))))
print('train rmse: {}'.format(int(
    mean_squared_error(np.exp(y_train), np.exp(pred), squared=False))))
print('train r2: {}'.format(
    r2_score(np.exp(y_train), np.exp(pred))))
print()

# make predictions for test set
pred = lin_model.predict(X_test)

# determine mse, rmse and r2
print('test mse: {}'.format(int(
    mean_squared_error(np.exp(y_test), np.exp(pred)))))
print('test rmse: {}'.format(int(
    mean_squared_error(np.exp(y_test), np.exp(pred), squared=False))))
print('test r2: {}'.format(
    r2_score(np.exp(y_test), np.exp(pred))))
print()

print('Average house price: ', int(np.exp(y_train).median()))

train mse: 217
train rmse: 14
train r2: -0.053932519736090256

test mse: 260
test rmse: 16
test r2: -0.028381590332139606

Average house price:  2




In [None]:
y_test.reset_index(drop=True)

0     4
1     0
2     1
3     1
4     1
5     0
6     0
7     0
8     1
9     0
10    2
11    1
12    1
13    1
14    3
15    4
16    1
17    1
18    0
19    1
Name: Drug, dtype: int64