#### Importación de librerías

In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

###### ======  Librerias para ingenieria de caracteristicas  ====== ######
from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.transformation import LogTransformer
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import myPreprocessors as mypp #nuestras librerías de transformaciones.

import joblib

In [2]:
# Load the training data from the local CSV file and preview its first rows.
dataTrain = pd.read_csv("train.csv")
dataTrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Show each column's dtype to tell numeric columns apart from object (text) ones.
dataTrain.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [4]:
# Fraction of missing values in every column (mean of the boolean NA mask).
dataTrain.isna().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

#### 2. Train Test Split para Entrenamiento y Prueba

In [5]:
# Separate predictors from the target: the identifier and the label itself
# are excluded from the model inputs.
predictors = dataTrain.drop(columns=['PassengerId', 'Survived'])
target = dataTrain['Survived']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    random_state=2022,
)

#### 3. Configuración del Pipeline (Proceso de ingeniería de características)

In [6]:
# ---------------------------------------------------------------------------
# Column groups consumed by the pipeline steps
# ---------------------------------------------------------------------------

# Categorical columns with missing values, imputed with the most frequent category.
CATEGORICAL_VARS_WITH_NA_FREQUENT = [
    'Embarked',
]

# Numerical columns with missing values (flagged with an indicator, then imputed).
NUMERICAL_VARS_WITH_NA = [
    'Age',
]

# Columns removed before modelling.
DROP_FEATURES = [
    'Name',
    'Ticket',
    'Cabin',
]

# Columns that receive a logarithmic transformation.
NUMERICAL_LOG_VARS = [
    'Age',
]

# Categorical columns converted to integers through the explicit mappings below.
QUAL_VARS = [
    'Sex',
]
EXPOSURE_VARS = [
    'Embarked',
]

# ---------------------------------------------------------------------------
# Category -> integer code mappings
# ---------------------------------------------------------------------------

# Codes applied to the QUAL_VARS columns.
QUAL_MAPPINGS = {
    'male': 0,
    'female': 1,
}

# Codes applied to the EXPOSURE_VARS columns.
EXPOSURE_MAPPINGS = {
    'S': 1,
    'C': 2,
    'Q': 3,
}

# ---------------------------------------------------------------------------
# Raw input columns fed to the pipeline. The DROP_FEATURES columns are listed
# too because the pipeline itself is the one that removes them.
# ---------------------------------------------------------------------------
FEATURES = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
    'Name',
    'Ticket',
    'Cabin',
]



In [7]:
# Keep only the columns listed in FEATURES, in that order, before fitting.
X_train = X_train[FEATURES]

#### 4. Construcción del Pipeline

In [8]:
##### ====== Built with Feature Engine transformers plus our own myPreprocessors.Mapper ====== #####

# Ordered (name, step) pairs: every entry except the last transforms the data,
# and the last entry is the classifier.
# NOTE(review): step names are kept exactly as they were, because code that
# loads the pickled pipeline may address steps by name. 'drop_time_features'
# drops the DROP_FEATURES columns; nothing time-related is visible here.
pipeline_steps = [
    # ---------------- Imputation ----------------
    # Fill missing categorical values with the most frequent category.
    (
        'frequent_imputation',
        CategoricalImputer(
            imputation_method='frequent',
            variables=CATEGORICAL_VARS_WITH_NA_FREQUENT,
        ),
    ),
    # Add a missing-value indicator for the numeric columns before imputing them.
    (
        'missing_indicator_numeric',
        AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA),
    ),
    # Fill missing numeric values with the mean.
    (
        'mean_imputation',
        MeanMedianImputer(
            imputation_method='mean',
            variables=NUMERICAL_VARS_WITH_NA,
        ),
    ),

    # ---------------- Column removal ----------------
    (
        'drop_time_features',
        DropFeatures(features_to_drop=DROP_FEATURES),
    ),

    # ---------------- Categorical encoding ----------------
    # mypp.Mapper is project code; presumably it replaces each category with
    # the integer given in `mappings` — verify against myPreprocessors.
    (
        'quality_mapper',
        mypp.Mapper(variables=QUAL_VARS, mappings=QUAL_MAPPINGS),
    ),
    (
        'exposure_mapper',
        mypp.Mapper(variables=EXPOSURE_VARS, mappings=EXPOSURE_MAPPINGS),
    ),

    # ---------------- Continuous variable transformation ----------------
    (
        'log_transformer',
        LogTransformer(variables=NUMERICAL_LOG_VARS),
    ),

    # ---------------- Scaling ----------------
    (
        'scaler',
        MinMaxScaler(),
    ),

    # ---------------- Model ----------------
    (
        'modelo_SVC',
        SVC(kernel='linear', C=1),
    ),
]

# NOTE(review): the variable name says "housePrice" although the target used in
# this notebook is 'Survived'; renaming it would require touching every cell
# that references it, so the name is left unchanged here.
housePrice_pipeline_v112022 = Pipeline(steps=pipeline_steps)

In [9]:
##### ====== Fit every feature-engineering step and the final SVC on the training split ====== #####
# NOTE(review): the saved output of this cell lists the last step as
# 'modelo_lasso', while the pipeline cell names it 'modelo_SVC' — the output is
# stale; re-run the notebook from a fresh kernel to refresh it.
housePrice_pipeline_v112022.fit(X_train, y_train)

Pipeline(steps=[('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['Embarked'])),
                ('missing_indicator_numeric',
                 AddMissingIndicator(variables=['Age'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['Age'])),
                ('drop_time_features',
                 DropFeatures(features_to_drop=['Name', 'Ticket', 'Cabin'])),
                ('quality_mapper',
                 Mapper(mappings={'female': 1, 'male': 0}, variables=['Sex'])),
                ('exposure_mapper',
                 Mapper(mappings={'C': 2, 'Q': 3, 'S': 1},
                        variables=['Embarked'])),
                ('log_transformer', LogTransformer(variables=['Age'])),
                ('scaler', MinMaxScaler()),
                ('modelo_lasso', SVC(C=1, kernel='linear'))])

In [10]:
# Give the held-out set the same FEATURES columns, in the same order, as the training set.
X_test = X_test[FEATURES]

#### <b> Predicciones </b>

In [11]:
# Run the fitted pipeline (preprocessing steps + SVC) on the held-out rows.
preds = housePrice_pipeline_v112022.predict(X_test)

In [12]:
# Share of held-out rows whose label was predicted correctly.
test_accuracy = accuracy_score(y_test, preds)
print('Accuracy: ', test_accuracy)

Accuracy:  0.7985074626865671


In [13]:
# Persist the fitted pipeline to disk for production use.
# NOTE(review): the file name says "v12022" while the variable says "v112022" —
# confirm which version tag is intended before anything depends on the file name.
joblib.dump(housePrice_pipeline_v112022, 'housePrice_pipeline_v12022.pkl')

['housePrice_pipeline_v12022.pkl']

In [14]:
# Save the input column list next to the pipeline — presumably so scoring code
# can select the same columns in the same order; verify against the consumer.
joblib.dump(FEATURES, 'FEATURES.pkl')

['FEATURES.pkl']

## <b> Video </b>

https://youtu.be/aAm1jWsG_gI