# Machine Learning Pipeline - Feature Engineering

# Paso 1: Reproducibility: Setting the seed

In [9]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# for the yeo-johnson transformation
import scipy.stats as stats

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# to save the trained scaler class
import joblib

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

from pycaret.classification import *

ImportError: cannot import name 'interp' from 'scipy' (/opt/anaconda3/envs/drugs_env/lib/python3.9/site-packages/scipy/__init__.py)

In [32]:
# load dataset
data = pd.read_csv('./Drug.csv')

# rows and columns of the data
print(data.shape)

# visualise the dataset
data.head()

(200, 7)


Unnamed: 0,Age,Sex,BP,Cholesterol,Na,K,Drug
0,23,F,HIGH,HIGH,0.792535,0.031258,drugY
1,47,M,LOW,HIGH,0.739309,0.056468,drugC
2,47,M,LOW,HIGH,0.697269,0.068944,drugC
3,28,F,NORMAL,HIGH,0.563682,0.072289,drugX
4,61,F,LOW,HIGH,0.559294,0.030998,drugY


# Paso 2: Separamos el Dataset en entrenamiento y prueba

In [33]:
# Let's separate into train and test set
# Remember to set the seed (random_state for this sklearn function)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Drug', axis=1), # predictive variables
    data['Drug'], # target
    test_size=0.1, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

X_train.shape, X_test.shape

((180, 6), (20, 6))

# Paso 3: Feature Engineering (Ingeniería de Características)

En las siguientes celdas, procesaremos las variables del dataset de Precios de Casas para abordar:

1. Missing values (NaN - Valores Faltantes)
2. Temporal variables (Variables Temporales)
3. Non-Gaussian distributed variables (Variables con distribución no gaussiana)
4. Categorical variables: remove rare labels (Variables categóricas: eliminar etiquetas raras)
5. Categorical variables: convert strings to numbers (Variables categóricas: convertir cadenas a números)
5. Put the variables in a similar scale (Poner las variables en una escala similar)

## Numerical variable transformation - no aplica

### Logarithmic transformation


## Variables categóricas

### Aplicamos mapeos

In [34]:
# re-map strings to numbers

disc_mappings1 = {'LOW': 0, 'NORMAL': 1, 'HIGH': 2}

disc_vars1 = ['BP']

for var in disc_vars1:
    X_train[var] = X_train[var].map(disc_mappings1)
    X_test[var] = X_test[var].map(disc_mappings1)

disc_mappings2 = {'NORMAL': 0, 'HIGH': 1}

disc_vars2 = ['Cholesterol']

for var in disc_vars2:
    X_train[var] = X_train[var].map(disc_mappings2)
    X_test[var] = X_test[var].map(disc_mappings2)

disc_mappings3 = {'M':0, 'F':1}

disc_vars3 = ['Sex']

for var in disc_vars3:
    X_train[var] = X_train[var].map(disc_mappings3)
    X_test[var] = X_test[var].map(disc_mappings3)

In [35]:
#Encoding de las variables categoricas a predecir.

disc_mappings4 = {'drugX':0, 'drugY':1, 'drugA':2, 'drugB':3,'drugC':4}
#disc_vars4 = ['Drug']

y_train = y_train.map(disc_mappings4)
y_test = y_test.map(disc_mappings4)

In [21]:
# check absence of na in the train set
[var for var in X_train.columns if X_train[var].isnull().sum() > 0]

[]

#### Verificamos ¿Hay valores nulos (na) en el conjunto de Entrenamiento? - NO

In [22]:
[var for var in X_train.columns if X_train[var].isnull().sum() > 0]

[]

#### Verificamos ¿Hay valores nulos (na) en el conjunto de Prueba? - NO

In [23]:
[var for var in X_test.columns if X_test[var].isnull().sum() > 0]

[]

## Feature Scaling

Para su uso en modelos lineales, las características deben ser escaladas. Escalaremos las características a los valores mínimos y máximos:

In [24]:
if isinstance(X_train, pd.DataFrame):
    print("X_train es un DataFrame de pandas")
else:
    print("X_train no es un DataFrame de pandas")

X_train es un DataFrame de pandas


In [36]:
print(len(X_train))
print(X_train.shape)

180
(180, 6)


In [37]:
# Creamos el Escalador (Scaler)
scaler = MinMaxScaler()

# Ajustamos el Scaler para el Conjunto de Entrenamiento
scaler.fit(X_train)

# Transformamos el conjunto de entrenamiento y el conjunto de prueba

# sklearn devuelve arrays de numpy, así que envolvemos el
# array con un dataframe de pandas

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_train.columns
)

In [27]:
X_train.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na,K
0,0.355932,1.0,1.0,0.0,0.189168,0.286149
1,0.779661,0.0,0.5,1.0,0.618747,0.985242
2,0.864407,1.0,0.5,0.0,0.280797,0.926781
3,0.322034,1.0,0.0,0.0,0.821884,0.733845
4,0.389831,1.0,0.0,0.0,0.249021,0.000335


In [28]:
# Guardemos ahora los conjuntos de entrenamiento y prueba para el próximo Notebook!

X_train.to_csv('./OutputFeaturEngDrugs/xtrain.csv', index=False)
X_test.to_csv('./OutputFeaturEngDrugs/xtest.csv', index=False)

y_train.to_csv('./OutputFeaturEngDrugs/ytrain.csv', index=False)
y_test.to_csv('./OutputFeaturEngDrugs/ytest.csv', index=False)

In [29]:
# Ahora guardamos el Scaler

joblib.dump(scaler, './OutputFeaturEngDrugs/minmax_scaler.joblib')

['./OutputFeaturEngDrugs/minmax_scaler.joblib']

In [38]:
# to build the model
from sklearn.linear_model import Lasso

# to evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

# set up the model
# remember to set the random_state / seed

lin_model = Lasso(alpha=0.001, random_state=0)

# train the model

lin_model.fit(X_train, y_train)

# make predictions for train set
pred = lin_model.predict(X_train)

# determine mse, rmse and r2
print('train mse: {}'.format(int(
    mean_squared_error(np.exp(y_train), np.exp(pred)))))
print('train rmse: {}'.format(int(
    mean_squared_error(np.exp(y_train), np.exp(pred), squared=False))))
print('train r2: {}'.format(
    r2_score(np.exp(y_train), np.exp(pred))))
print()

# make predictions for test set
pred = lin_model.predict(X_test)

# determine mse, rmse and r2
print('test mse: {}'.format(int(
    mean_squared_error(np.exp(y_test), np.exp(pred)))))
print('test rmse: {}'.format(int(
    mean_squared_error(np.exp(y_test), np.exp(pred), squared=False))))
print('test r2: {}'.format(
    r2_score(np.exp(y_test), np.exp(pred))))
print()

print('Average house price: ', int(np.exp(y_train).median()))

train mse: 217
train rmse: 14
train r2: -0.053932519736090256

test mse: 260
test rmse: 16
test r2: -0.028381590332139606

Average house price:  2




In [40]:
y_test.reset_index(drop=True)

0     4
1     0
2     1
3     1
4     1
5     0
6     0
7     0
8     1
9     0
10    2
11    1
12    1
13    1
14    3
15    4
16    1
17    1
18    0
19    1
Name: Drug, dtype: int64