In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [36]:
train_data = pd.read_csv("./datasets/titanic/train.csv")
test_data = pd.read_csv("./datasets//titanic/test.csv")
# Examine the training data
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [37]:
categorical_vars = ['Pclass','Name','Sex','Ticket','Cabin','Embarked']
numerical_vars = ['Age','SibSp','Parch','Fare']
feature = ['PassengerId', 'Name', 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Embarked']
X_ = train_data[feature]
y = train_data["Survived"]
X_test_ = test_data[feature]

In [38]:
# Extract title from the 'Name' column
X_.loc[:,'Title'] = X_.loc[:,'Name'].str.split(',').str[1].str.split('.').str[0].copy()
X_test_.loc[:,'Title'] = X_test_.loc[:,'Name'].str.split(',').str[1].str.split('.').str[0].copy()
X_['Title'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_.loc[:,'Title'] = X_.loc[:,'Name'].str.split(',').str[1].str.split('.').str[0].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_.loc[:,'Title'] = X_test_.loc[:,'Name'].str.split(',').str[1].str.split('.').str[0].copy()


Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Col               2
Mlle              2
Major             2
Ms                1
Mme               1
Don               1
Lady              1
Sir               1
Capt              1
the Countess      1
Jonkheer          1
Name: count, dtype: int64

In [39]:
X_ = X_.copy()
X_test_ = X_test_.copy()
# Agrupar títulos raros
for df in [X_, X_test_]:
    df.loc[~df['Title'].isin([' Mr', ' Mrs', ' Miss', ' Master']), 'Title'] = 'Other'

# Eliminar la columna 'Name'
X_ = X_.drop(columns=['Name'])
X_test_ = X_test_.drop(columns=['Name'])


In [40]:
print(X_test_.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    object 
 3   Age          332 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         417 non-null    float64
 7   Embarked     418 non-null    object 
 8   Title        418 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 29.5+ KB
None


In [41]:
# Specify the categorical and numerical columns
categorical_cols = [cname for cname in X_ if
        X_[cname].dtype == "object"]

numerical_cols = [cname for cname in X_ if 
        X_[cname].dtype in ['int64', 'float64', 'uint8']]

cols = categorical_cols + numerical_cols
X = X_[cols].copy()
X_test = X_test_[cols].copy()

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
    ])

In [42]:
# Define a Random Forest model using some default parameters
model = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state=0)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X, y)


# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_test)

In [43]:
def get_gini(X, my_pipeline=my_pipeline, numerical_cols=numerical_cols):
        
        "Calculates Gini importance of each feature"
        
        cat_features = my_pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_cols)

        num_features = numerical_cols
        feature_list = np.append(num_features, cat_features)
        imps = my_pipeline.steps[1][1].feature_importances_
        print("Gini importance of each feature:\n")
        for i in zip(feature_list, imps):
            print(i)


def get_cvs(my_pipeline, X, y):
    
        "Calculates Accuracy score for a list of features"
        
        cvs = 1 * cross_val_score(my_pipeline, X, y,
                              cv=5,
                              scoring='accuracy')

        print('\nAccuracy score: ', feature, cvs.mean(), '\n')

In [44]:
get_gini(X)
get_cvs(my_pipeline, X, y)

Gini importance of each feature:

('PassengerId', np.float64(0.042840400892992285))
('Pclass', np.float64(0.11678843614693533))
('Age', np.float64(0.05432331533135518))
('SibSp', np.float64(0.04146719365493314))
('Parch', np.float64(0.020968146202611906))
('Fare', np.float64(0.09036919293067613))
('Sex_female', np.float64(0.1915391176334786))
('Sex_male', np.float64(0.1436959758930339))
('Embarked_C', np.float64(0.011417513842479417))
('Embarked_Q', np.float64(0.007242775549603224))
('Embarked_S', np.float64(0.00960207484967599))
('Title_ Master', np.float64(0.016704672754957305))
('Title_ Miss', np.float64(0.03895350657974646))
('Title_ Mr', np.float64(0.17785740133929842))
('Title_ Mrs', np.float64(0.028865304799823667))
('Title_Other', np.float64(0.007364971598399196))

Accuracy score:  ['PassengerId', 'Name', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'] 0.824913690289373 

