In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (OneHotEncoder, MinMaxScaler)
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score

from feature_engine.encoding import (OrdinalEncoder)
from sklearn.svm import SVC, LinearSVC

from sklearn.pipeline import Pipeline

from feature_engine.imputation import (AddMissingIndicator, MeanMedianImputer, CategoricalImputer)
from feature_engine.encoding import (RareLabelEncoder, OrdinalEncoder)
from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

# 1) Cargamos los datos de train y test

In [2]:
# Training set: read the raw CSV, then separate the 'Survived' target
# column from the remaining feature columns.
data = pd.read_csv('train.csv')
dataY = data['Survived']
dataX = data.drop(columns='Survived')

# Test set, read as-is from its CSV file.
dataTest = pd.read_csv('test.csv')


# 2) Configuración del machine learning pipeline

In [3]:
# Column groups consumed by the pipeline steps.

# Columns earmarked for removal. Not used by any active pipeline step in the
# visible cells; the frames are restricted to FEATURES instead.
DROP_FEATURES = ["Cabin", "Name", "Ticket"]

# Columns handed to the mean imputer.
MEAN_IMPUTATION = ["Age", "Fare"]

# Columns handed to the categorical ("missing" label) imputer.
MISSING_IMPUTATION = ["Embarked"]

# Columns handed to the binary categorical encoder step.
CATEGORICAL_BINARY = ["Sex"]

# Declared for ordinal categoricals; not referenced in the visible cells.
CATEGORICAL_ORDINAL = ["Pclass"]

# Columns handed to the nominal categorical encoder step.
CATEGORICAL_NOMINAL = ["Embarked"]

# Columns handed to the Yeo-Johnson transformer.
NUMERICALS_YEO_JOHNSON = ["Age", "Fare"]

# Columns kept from the train and test frames before fitting/predicting.
FEATURES = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

# 3) Creamos el machine learning pipeline

In [8]:
# End-to-end pipeline: imputation -> numeric transformation -> categorical
# encoding -> scaling -> classifier.
# There is no DropFeatures step: unwanted columns are excluded by selecting
# FEATURES from the frame (see the last statement of this cell).
titanicPipeline = Pipeline(steps=[
    # ---- Imputation ----
    # Categorical columns: fill missing values using the 'missing' method.
    (
        'missing_imputation',
        CategoricalImputer(
            imputation_method='missing',
            variables=MISSING_IMPUTATION,
        ),
    ),
    # Numeric columns: fill missing values with the column mean.
    (
        'mean_imputation',
        MeanMedianImputer(
            imputation_method='mean',
            variables=MEAN_IMPUTATION,
        ),
    ),

    # ---- Numeric transformation ----
    # Yeo-Johnson transformation of the selected numeric columns.
    (
        'Yeo Johnson',
        YeoJohnsonTransformer(variables=NUMERICALS_YEO_JOHNSON),
    ),

    # ---- Categorical encoding ----
    # Binary categoricals, encoded with the 'ordered' method.
    (
        'categorical_binary_encoder',
        OrdinalEncoder(
            encoding_method='ordered',
            variables=CATEGORICAL_BINARY,
        ),
    ),
    # Nominal categoricals, also encoded with the 'ordered' method.
    (
        'categorical_encoder',
        OrdinalEncoder(
            encoding_method='ordered',
            variables=CATEGORICAL_NOMINAL,
        ),
    ),

    # ---- Scaling ----
    ('scaler', MinMaxScaler()),

    # ---- Model ----
    # Support vector classifier with default hyperparameters.
    ('SVC', SVC()),
])

# Keep only the model input columns in the training frame.
dataX = dataX.loc[:, FEATURES]

In [10]:
# Fit every pipeline step (preprocessing and classifier) on the training data.
titanicPipeline.fit(dataX, dataY)

# Restrict the test frame to the same input columns used for training.
dataTest = dataTest.loc[:, FEATURES]

# Predict on the test set with the fitted pipeline.
preds = titanicPipeline.predict(dataTest)

# Persist the fitted pipeline to disk; the call stays last so the cell still
# displays the list of written file names.
joblib.dump(value=titanicPipeline, filename='titanicPipeline.pkl')

['titanicPipeline.pkl']