In [1]:
import pandas as pd
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Folder and file name of the training CSV, relative to the working directory
dfolder = Path('./datasets')
train_data = 'train.csv'
# Load the training file into a DataFrame
df_train = pd.read_csv(dfolder.joinpath(train_data))

In [3]:
def process_dataset(data):
    """Impute missing values and one-hot encode 'Sex' and the cabin letter.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      1. Missing 'Cabin' values become the placeholder 'XXXXX'.
      2. 'Cabin1' is the first character of 'Cabin' ('X' for the placeholder).
      3. Missing 'Age' values become the median of the non-missing ages.
      4. 'Sex' and 'Cabin1' are one-hot encoded and the two source columns
         are dropped from the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Cabin', 'Age' and 'Sex'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed columns plus one float indicator column
        per category (named like 'Sex_male', 'Cabin1_C'), carrying the same
        index as `data`.

    Notes
    -----
    The encoder is fit on every call, so the set of indicator columns depends
    on the categories present in `data`.
    """
    # Copy first: assigning/filling below must not leak into the caller's frame.
    prepared = data.copy()

    # Assign the filled column back instead of `fillna(inplace=True)` on a
    # column selection, which does not update the frame under Copy-on-Write.
    prepared['Cabin'] = prepared['Cabin'].fillna('XXXXX')
    # Create new column with only first letter of Cabin
    prepared['Cabin1'] = prepared['Cabin'].str[0]
    # Replace nulls with median Age
    prepared['Age'] = prepared['Age'].fillna(prepared['Age'].median())

    # One Hot Encoder
    categorical_cols = ['Sex', 'Cabin1']
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_features = encoder.fit_transform(prepared[categorical_cols])
    # Reuse the source index: concat(axis=1) aligns on index labels, so a
    # default 0..n-1 index would misalign rows (and introduce NaNs) whenever
    # `data` does not itself have a default RangeIndex.
    encoded_df = pd.DataFrame(encoded_features,
                              columns=encoder.get_feature_names_out(categorical_cols),
                              index=prepared.index)
    result_df = pd.concat([prepared, encoded_df], axis=1)
    result_df = result_df.drop(columns=categorical_cols)

    # Column listings before and after encoding, kept for quick inspection
    print(prepared.columns)
    print(result_df.columns)

    return result_df

In [4]:
# Run the preprocessing step on the training frame
df = process_dataset(data=df_train)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Cabin1'],
      dtype='object')
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Sex_female', 'Sex_male',
       'Cabin1_A', 'Cabin1_B', 'Cabin1_C', 'Cabin1_D', 'Cabin1_E', 'Cabin1_F',
       'Cabin1_G', 'Cabin1_T', 'Cabin1_X'],
      dtype='object')


In [5]:
# Preview the first five processed rows
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Sex_male,Cabin1_A,Cabin1_B,Cabin1_C,Cabin1_D,Cabin1_E,Cabin1_F,Cabin1_G,Cabin1_T,Cabin1_X
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,XXXXX,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,XXXXX,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,XXXXX,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
# Columns left out of the modelling frame
drop_cols = [
    'PassengerId', 'SibSp', 'Name', 'Embarked', 'Parch',
    'Ticket', 'Fare', 'Cabin', 'Sex_female', 'Cabin1_X',
]
df1 = df.drop(drop_cols, axis=1)
# Show which columns remain
df1.columns

Index(['Survived', 'Pclass', 'Age', 'Sex_male', 'Cabin1_A', 'Cabin1_B',
       'Cabin1_C', 'Cabin1_D', 'Cabin1_E', 'Cabin1_F', 'Cabin1_G', 'Cabin1_T'],
      dtype='object')

In [7]:
# Create features dataframe
X = df1.copy()
# Pop dependent variable
y = X.pop('Survived')
print(X)

     Pclass   Age  Sex_male  Cabin1_A  Cabin1_B  Cabin1_C  Cabin1_D  Cabin1_E  \
0         3  22.0       1.0       0.0       0.0       0.0       0.0       0.0   
1         1  38.0       0.0       0.0       0.0       1.0       0.0       0.0   
2         3  26.0       0.0       0.0       0.0       0.0       0.0       0.0   
3         1  35.0       0.0       0.0       0.0       1.0       0.0       0.0   
4         3  35.0       1.0       0.0       0.0       0.0       0.0       0.0   
..      ...   ...       ...       ...       ...       ...       ...       ...   
886       2  27.0       1.0       0.0       0.0       0.0       0.0       0.0   
887       1  19.0       0.0       0.0       1.0       0.0       0.0       0.0   
888       3  28.0       0.0       0.0       0.0       0.0       0.0       0.0   
889       1  26.0       1.0       0.0       0.0       1.0       0.0       0.0   
890       3  32.0       1.0       0.0       0.0       0.0       0.0       0.0   

     Cabin1_F  Cabin1_G  Ca

In [8]:
# Print the features followed by the target, one after the other
print(X, y, sep='\n')

     Pclass   Age  Sex_male  Cabin1_A  Cabin1_B  Cabin1_C  Cabin1_D  Cabin1_E  \
0         3  22.0       1.0       0.0       0.0       0.0       0.0       0.0   
1         1  38.0       0.0       0.0       0.0       1.0       0.0       0.0   
2         3  26.0       0.0       0.0       0.0       0.0       0.0       0.0   
3         1  35.0       0.0       0.0       0.0       1.0       0.0       0.0   
4         3  35.0       1.0       0.0       0.0       0.0       0.0       0.0   
..      ...   ...       ...       ...       ...       ...       ...       ...   
886       2  27.0       1.0       0.0       0.0       0.0       0.0       0.0   
887       1  19.0       0.0       0.0       1.0       0.0       0.0       0.0   
888       3  28.0       0.0       0.0       0.0       0.0       0.0       0.0   
889       1  26.0       1.0       0.0       0.0       1.0       0.0       0.0   
890       3  32.0       1.0       0.0       0.0       0.0       0.0       0.0   

     Cabin1_F  Cabin1_G  Ca

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [10]:
def rf_class(X_list, y, **kwargs):
    """Fit a random forest on a train/test split and report test accuracy.

    Parameters
    ----------
    X_list : array-like or DataFrame
        Feature data, passed to ``train_test_split`` unchanged.
    y : array-like or Series
        Target values aligned with ``X_list``.
    **kwargs
        Optional overrides for the settings below; any other keyword is
        rejected.

        test_size : default 0.2, forwarded to ``train_test_split``.
        random_state : default 42, used for both the split and the model.
        n_estimators : default 100, forwarded to ``RandomForestClassifier``.

    Returns
    -------
    float
        Accuracy of the fitted model on the held-out split. The value is also
        printed, as before.

    Raises
    ------
    TypeError
        If an unrecognised keyword is supplied (e.g. a misspelled
        ``n_estimator``), instead of silently falling back to the default.
    """
    # Set default kwargs
    defaults = {
        'test_size': 0.2,
        'random_state': 42,
        'n_estimators': 100
    }

    # Reject unknown keys up front so typos do not silently use the defaults.
    unknown = sorted(set(kwargs) - set(defaults))
    if unknown:
        raise TypeError(
            "rf_class() got unexpected keyword argument(s): " + ", ".join(unknown))

    # Merge overrides into a fresh dict
    settings = {**defaults, **kwargs}

    X_train, X_test, y_train, y_test = train_test_split(
        X_list, y, test_size=settings['test_size'], random_state=settings['random_state'])

    model = RandomForestClassifier(
        n_estimators=settings['n_estimators'], random_state=settings['random_state'])

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    # Hand the score back so callers can store or compare it.
    return accuracy

In [11]:
# Preview the first five rows of the modelling frame
df1.head(n=5)

Unnamed: 0,Survived,Pclass,Age,Sex_male,Cabin1_A,Cabin1_B,Cabin1_C,Cabin1_D,Cabin1_E,Cabin1_F,Cabin1_G,Cabin1_T
0,0,3,22.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,38.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1,3,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1,35.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0,3,35.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Evaluate with a 50/50 split and a different seed than the defaults
rf_class(X, y, random_state=100, test_size=0.5)

0.7959641255605381


In [13]:
# List the columns of the modelling frame (still includes 'Survived')
df1.columns

Index(['Survived', 'Pclass', 'Age', 'Sex_male', 'Cabin1_A', 'Cabin1_B',
       'Cabin1_C', 'Cabin1_D', 'Cabin1_E', 'Cabin1_F', 'Cabin1_G', 'Cabin1_T'],
      dtype='object')