In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder
)

from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import joblib

In [7]:
data = pd.read_csv("titanic_train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
                        data.drop(["PassengerId","Age"], axis=1 ),
                        data["Age"],
                        test_size=0.15, 
                        random_state=2021)

In [11]:
X_train.shape, X_test.shape

((757, 10), (134, 10))

In [12]:
y_train = np.log(y_train)
y_test = np.log(y_test)

In [13]:
categorica = [var for var in data.columns if data[var].dtype =="O"]
categorica = categorica + ["Pclass"]

In [14]:
X_train[categorica] = X_train[categorica].astype("O")
X_test[categorica]= X_test[categorica].astype("O")

In [15]:
cat_with_na = [var for var in categorica
              if X_train[var].isnull().sum()>0]

In [16]:
cat_with_na

['Cabin', 'Embarked']

In [18]:
X_train[cat_with_na].isnull().mean().sort_values(ascending = False)

Cabin       0.764861
Embarked    0.001321
dtype: float64

In [19]:
vars_with_missing_string = [var for var in cat_with_na
                           if X_train[var].isnull().mean()>0.2]

In [20]:
vars_freq_category = [var for var in cat_with_na
                           if X_train[var].isnull().mean()<=0.2]

In [21]:
X_train[vars_with_missing_string] = X_train[vars_with_missing_string].fillna("Missing")
X_test[vars_with_missing_string] = X_test[vars_with_missing_string].fillna("Missing")

In [24]:
for var in vars_freq_category:
    mode=X_train[var].mode()[0]
    
    X_train[var].fillna(mode, inplace=True)
    X_test[var].fillna(mode, inplace=True)
    
    print(var, "____", mode)

Embarked ____ S


In [27]:
X_train[cat_with_na].isnull().mean().sort_values(ascending = False)

Embarked    0.0
Cabin       0.0
dtype: float64

In [28]:
cat_with_na = [var for var in categorica
              if X_train[var].isnull().sum()>0]

cat_with_na

[]

In [29]:
num_vars = [var for var in X_train.columns
              if var not in categorica and var !="Age"]

In [30]:
len(num_vars)

4

In [31]:
nums_with_na = [var for var in num_vars
              if X_train[var].isnull().sum()>0]

In [32]:
nums_with_na

[]

In [33]:
X_train.to_csv("preprocess_data/prep_Xtrain.csv", index=False)

In [34]:
X_test.to_csv("preprocess_data/prep_Xtest.csv", index=False)

In [35]:
y_train.to_csv("preprocess_data/prep_ytrain", index=False)

In [36]:
y_test.to_csv("preprocess_data/prep_ytest", index=False)

In [37]:
X_train = pd.read_csv("preprocess_data/prep_Xtrain.csv")
X_test = pd.read_csv("preprocess_data/prep_Xtest.csv")

In [38]:
X_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,1,1,36928,164.8667,Missing,S
1,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,1,2,SC/Paris 2123,41.5792,Missing,C
2,0,3,"Shorney, Mr. Charles Joseph",male,0,0,374910,8.05,Missing,S
3,0,3,"Van Impe, Miss. Catharina",female,0,2,345773,24.15,Missing,S
4,1,3,"Carr, Miss. Helen ""Ellen""",female,0,0,367231,7.75,Missing,Q


In [39]:
sel_ = SelectFromModel(Lasso(alpha=0.001, random_state=0))

In [42]:

#Varibles para transformación logaritmia
NUMERICALS_LOG_VARS = ["Age"]

#Variables para hacer mapeo categorico por codificación ordinal
QUAL_VARS = ["Sex", "Embarked"]

#Variables categoricas a codificar sin ordinalidad
CATEGORICAL_VARS = ["Name", "Ticket","Cabin","Pclass"]

#Mapeos de variables categoricas
quality_mapping = {"female":1, "male":2, "NaN":0, "S":3, "C":4}

#Variables seleccionadas según análisis de Lasso
FEATURES = ["Name", "Ticket","Cabin","Pclass", "Sex", "Embarked"
    
]

In [43]:
X_train = X_train[FEATURES]

In [44]:
Age_pipeline = Pipeline([
    
    # Tratamiento de variables temporales
    ('eslapsed_time', mypp.TremporalVariableTransformer(
        variables=TEMPORAL_VARS, reference_variable=REF_VAR)
    ),
    
    #============= TRANSFORMACIÓN DE VARIABLES NUMÉRICAS =============
    
    # Transformación logaritmica
    ('log', LogTransformer(variables=NUMERICALS_LOG_VARS)),
    
    
    #=============== CODIFICACION DE VARIABLES CATEGORICAS ORDINALES ==============
    ('mapper_quality', mypp.Mapper(
        variables=QUAL_VARS, mappings=QUAL_MAPPINGS)),
    
    #============ CODIFICACION DE VARIABLES CATEGORICAS NOMINALES ============
    
    ('rare_label_encoder', RareLabelEncoder(
        tol=0.01, n_categories=1, variables=CATEGORICAL_VARS)),
    
    ('categorical_encoder', OrdinalEncoder(
        encoding_method='ordered', variables=CATEGORICAL_VARS)),
    
    #=========== SCALER ==============
    ('scaler', MinMaxScaler()),
    
    #=========== ENTRENAMIENTO DEL MODELO ============
    ('Lasso', Lasso(alpha=0.01, random_state=2022)),
]) 

NameError: name 'mypp' is not defined