In [2]:
!pip install sklearn pandas numpy -q

In [3]:
from sklearn.datasets import fetch_openml

In [5]:
X, y = fetch_openml("titanic",version=1,as_frame=True,return_X_y = True)

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    test_size=.2, 
                                                    random_state=200)

In [37]:
# Define categorical columns
categorical = list(X_train.select_dtypes('category').columns)
print(f"Categorical columns are: {categorical}")

# Define numerical columns
numerical = list(X_train.select_dtypes('number').columns)
print(f"Numerical columns are: {numerical}")

Categorical columns are: ['sex', 'embarked']
Numerical columns are: ['pclass', 'age', 'sibsp', 'parch', 'fare', 'body']


In [38]:
# Define categorical pipeline
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                     ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])

# Define numerical pipeline
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', MinMaxScaler())])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer(transformers=[('cat', cat_pipe, categorical),
                                               ('num', num_pipe, numerical)])

# Fit a pipeline with transformers and an estimator to the training data
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('model', RandomForestClassifier())])
pipe.fit(X_train, y_train)

# Predict training data
y_train_pred = pipe.predict(X_train)

# Predict test data
y_test_pred = pipe.predict(X_test)


In [39]:
scores = cross_val_score(pipe, X, y, cv=5)

In [40]:
scores

array([0.50381679, 0.75954198, 0.69465649, 0.6870229 , 0.64367816])

In [43]:
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_test_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_test_pred))
print('\n')
print("=== All AUC Scores ===")
print(scores)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ",scores.mean())

=== Confusion Matrix ===
[[138  23]
 [ 36  65]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.79      0.86      0.82       161
           1       0.74      0.64      0.69       101

    accuracy                           0.77       262
   macro avg       0.77      0.75      0.76       262
weighted avg       0.77      0.77      0.77       262



=== All AUC Scores ===
[0.50381679 0.75954198 0.69465649 0.6870229  0.64367816]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.6577432657716943
