In [474]:
import pandas as pd
import numpy as np

In [475]:
df_train = pd.read_csv('.//train.csv', index_col="PassengerId")
df_test = pd.read_csv('.//test.csv', index_col="PassengerId")


In [476]:
df_train['Name']

PassengerId
1                                Braund, Mr. Owen Harris
2      Cumings, Mrs. John Bradley (Florence Briggs Th...
3                                 Heikkinen, Miss. Laina
4           Futrelle, Mrs. Jacques Heath (Lily May Peel)
5                               Allen, Mr. William Henry
                             ...                        
887                                Montvila, Rev. Juozas
888                         Graham, Miss. Margaret Edith
889             Johnston, Miss. Catherine Helen "Carrie"
890                                Behr, Mr. Karl Howell
891                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

# Feature Engineering

In [477]:
df_train['Title'] = df_train['Name'].apply(lambda x : x.split(', ')[1].split('.')[0])
df_test['Title'] = df_test['Name'].apply(lambda x : x.split(', ')[1].split('.')[0])

df_train['Title']

df_train.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked', 'Title'],
      dtype='object')

# Filling up Nan's

In [478]:
# df_train.drop(columns=['Name'], inplace=True)
# df_test.drop(columns=['Name'], inplace=True)

In [479]:
print(df_train[df_train['Age'].isna()]['Age'])

df_train["Title"] = df_train["Title"].replace({
    "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",  # Convert French titles
    "Lady": "Mrs", "Countess": "Mrs", "Dona": "Mrs",
    "Sir": "Mr", "Don": "Mr", "Jonkheer": "Mr"
})

PassengerId
6     NaN
18    NaN
20    NaN
27    NaN
29    NaN
       ..
860   NaN
864   NaN
869   NaN
879   NaN
889   NaN
Name: Age, Length: 177, dtype: float64


In [486]:
mean_age_married = df_train[df_train['Title'].isin(['Mr', 'Mrs'])]['Age'].dropna().mean()
mean_age_unmarried = df_train[df_train['Title'].isin(['Miss'])]['Age'].dropna().mean()
print(mean_age_married, mean_age_unmarried)

df_train["Age"] = df_train.groupby("Title")["Age"].transform(lambda x: x.fillna(x.mean()))

df_train["Age"]


33.12128989735861 21.845637583892618


PassengerId
1      22.000000
2      38.000000
3      26.000000
4      35.000000
5      35.000000
         ...    
887    27.000000
888    19.000000
889    21.845638
890    26.000000
891    32.000000
Name: Age, Length: 891, dtype: float64

# Test Splitting

In [481]:
from sklearn.model_selection import train_test_split


#df_train.dropna(axis=0, subset=['Survived'], inplace=True)
y = df_train.Survived
df_train.drop(['Survived'], axis=1, inplace=True)

numerical_data = [i for i in df_train.columns if df_train[i].dtype in ["int64", "float64"]]
categorical_data = [i for i in df_train.columns if df_train[i].dtype == "object" and df_train[i].nunique() < 10]

X = df_train[numerical_data + categorical_data].copy()
X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size=0.8,
                                                      test_size = 0.2, random_state=0)
X_test = df_test[numerical_data + categorical_data].copy()

In [482]:
X_train.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Embarked'], dtype='object')

# Pipeline Construction and Hyperparameter Tuning

In [483]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

#from xgboost import XGBClassifier

numerical_transformer = SimpleImputer(strategy="mean")
categorical_transformer = Pipeline(steps=[("impute", SimpleImputer(strategy="most_frequent")),
                                         ("onehot", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer( transformers=[("num",numerical_transformer, numerical_data),
                                               ("cat", categorical_transformer, categorical_data )])




param_grid = {
    'n_estimators': [100, 300, 500],       
    'max_depth': [None, 10, 20],           
    'min_samples_split': [2, 5, 10],      
    'min_samples_leaf': [1, 2, 4],         
}
model2 = RandomForestClassifier(random_state=42)
model = GridSearchCV(model2, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


my_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                             ("model", model)
                             ])

In [484]:
my_pipeline.fit(X, y)

preds = my_pipeline.predict(X_valid)

accuracy = accuracy_score(y_valid , preds)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.9274


In [485]:
test_pred = my_pipeline.predict(df_test)
out = {'PassengerId': df_test.index, 'Survived': test_pred}
outdf = pd.DataFrame(data = out)
outdf.to_csv('submission.csv', index = False)
outdf

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
