In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Data Analysis and Exploration

In [2]:
# if you want to display full dataset, change '20' to 'None'
pd.set_option('display.max_rows', 20)

# overview the training set
X_train = pd.read_csv('/kaggle/input/titanic/train.csv')
X_test = pd.read_csv('/kaggle/input/titanic/test.csv')

X_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Data Pre-Processing: Missing Value

In [3]:
# first check for the missings value
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# handle feature "age" missings value by zero
def missing_age(X):
    age_col = pd.DataFrame(X, columns=['Age'])
    X['Age'] = age_col.fillna(0)

missing_age(X_train)
missing_age(X_test)
    
X_train['Age'].isnull().sum()

0

In [5]:
# handle feature 'fare' missings value by social class mode
def missing_fare(X):
    fare_class = {
        1: X[X['Pclass'] == 1]['Fare'].mean(),
        2: X[X['Pclass'] == 2]['Fare'].mean(),
        3: X[X['Pclass'] == 3]['Fare'].mean()
    }
    for passenger_index in range(X.shape[0]):
        passenger_fare = X.iloc[passenger_index]['Fare']
        if pd.isna(passenger_fare):
            passenger_class = X.iloc[passenger_index]['Pclass']
            for class_type in fare_class:
                if passenger_class == class_type:
                    X.loc[passenger_index, 'Fare'] = fare_class[class_type]
                    
missing_fare(X_test)

X_test['Fare'].isnull().sum()

0

In [6]:
# handle feature 'cabin' missings value by social class mode and family name
def missing_cabin(X):
    
    fcabin_dict = {}
    
    for passenger_index in range(X.shape[0]):
        family_name = X.iloc[passenger_index]['Name'].split(' ', 1)[0]
        passenger_cabin = X.iloc[passenger_index]['Cabin']
        if family_name not in fcabin_dict:
            if pd.isna(passenger_cabin):
                fcabin_dict[family_name] = ''
            else:
                fcabin_dict[family_name] = passenger_cabin
            
    cabin_class = {
        1: X[X['Pclass'] == 1]['Cabin'].mode().iloc[0],
        2: X[X['Pclass'] == 2]['Cabin'].mode().iloc[0],
        3: X[X['Pclass'] == 3]['Cabin'].mode().iloc[0]
    }
    
    for passenger_index in range(X.shape[0]):
        passenger_cabin = X.iloc[passenger_index]['Cabin']
        if pd.isna(passenger_cabin):
            passenger_class = X.iloc[passenger_index]['Pclass']
            passenger_fname = X.iloc[passenger_index]['Name'].split(' ', 1)[0]
            for class_type in cabin_class:
                if passenger_class == class_type:
                    family_cabin = fcabin_dict[passenger_fname]
                    if family_cabin == '':
                        random_cabin = cabin_class[class_type] + str(np.random.randint(999))
                        X.loc[passenger_index, 'Cabin'] = random_cabin
                        fcabin_dict[passenger_fname] = random_cabin
                    else:
                        X.loc[passenger_index, 'Cabin'] = family_cabin

missing_cabin(X_train)
missing_cabin(X_test)

X_train['Cabin'].isnull().sum()

0

In [7]:
# handle feature "embarked" missings value by embarked mode
def missing_embarked(X):
    embarked_mode = X['Embarked'].mode().iloc[0]
    X['Embarked'] = X['Embarked'].fillna(embarked_mode)

missing_embarked(X_train)
missing_embarked(X_test)
    
X_train['Embarked'].isnull().sum()

0

In [8]:
# last check for the missings value
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Data Pre-Processing: Feature Increment

In [9]:
# first check for the features
X_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,G6522,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,G6865,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,G6275,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,D188,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,0.0,1,2,W./C. 6607,23.4500,G6809,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [10]:
# create new feature 'deck' by type of cabin classification, e.g. 'C321' to 'C'
def create_deck(X):
    X['Deck'] = ''
    for passenger_index in range(X.shape[0]):
        passenger_cabin = X.iloc[passenger_index]['Cabin']
        passenger_class = X.iloc[passenger_index]['Pclass']
        X.loc[passenger_index, 'Deck'] = list(passenger_cabin)[0]

create_deck(X_train)
create_deck(X_test) # a lots of nan value of its passenger cabin list

X_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,G6522,S,G
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,G6865,S,G
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,G6275,S,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,D188,S,D
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,B
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,0.0,1,2,W./C. 6607,23.4500,G6809,S,G
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,C


In [11]:
# simplify the feature 'name' by type of honorific title name, e.g. below
honorific_title = {
    'Mr': ['Mr.', 'Major.', 'Rev.', 'Dr.', 'Col.', 'Capt.', 'Countess.', 'Don.'],
    'Mrs': ['Mrs.', 'Ms.','Lady.', 'Mlle.'],
    'Master': ['Master.'],
    'Miss': ['Miss.']
}

def simplify_name(X):
    X['TitleName'] = ''
    for passenger_index in range(X.shape[0]):
        passenger_title = X.iloc[passenger_index]['Name'].split(' ', 2)[1]
        for title in honorific_title:
            if passenger_title in honorific_title[title]:
                X.loc[passenger_index, 'TitleName'] = title

# because we are modifying dataset metadata, so we must update the test dataset too
simplify_name(X_train)
simplify_name(X_test)


X_train['TitleName']

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886      Mr
887    Miss
888    Miss
889      Mr
890      Mr
Name: TitleName, Length: 891, dtype: object

In [12]:
# last check for the features
X_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck,TitleName
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,G6522,S,G,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,G6865,S,G,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,C,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,G6275,S,G,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,D188,S,D,Mr
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,B,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,0.0,1,2,W./C. 6607,23.4500,G6809,S,G,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,C,Mr


# Data Pre-Processing: Feature Decrement

In [13]:
# remove unnecessary feature like 'id'
X_ids = X_train.pop('PassengerId')
X_test_ids = X_test.pop('PassengerId')

# preparing target feature
y_train = X_train.pop('Survived')

len(y_train)

891

In [14]:
# last check for the features
X_train

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck,TitleName
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,G6522,S,G,Mr
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C,Mrs
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,G6865,S,G,Miss
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,C,Mrs
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,G6275,S,G,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,D188,S,D,Mr
887,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,B,Miss
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,0.0,1,2,W./C. 6607,23.4500,G6809,S,G,Miss
889,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,C,Mr


# Data Scaling

In [15]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pclass     891 non-null    int64  
 1   Name       891 non-null    object 
 2   Sex        891 non-null    object 
 3   Age        891 non-null    float64
 4   SibSp      891 non-null    int64  
 5   Parch      891 non-null    int64  
 6   Ticket     891 non-null    object 
 7   Fare       891 non-null    float64
 8   Cabin      891 non-null    object 
 9   Embarked   891 non-null    object 
 10  Deck       891 non-null    object 
 11  TitleName  891 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 83.7+ KB


In [16]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# categorical and numeric features pipeline
cat_col = X_train.select_dtypes(include=['object']).columns.tolist()
num_col = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
scaler = StandardScaler()

transformer = ColumnTransformer(
    transformers = [
        ('cat', encoder, cat_col),
        ('num', scaler, num_col)
    ]
)

# Data Splitting

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.80, random_state=42)

print(len(X_train), len(y_train))

712 712


# Model Training

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

linear_model = Pipeline(
    steps = [
        ('transformer', transformer),
        ('linear', LinearRegression())
    ]
)

linear_model.fit(X_train, y_train)

y_pred = linear_model.predict(X_val)

mse = mean_squared_error(y_val, y_pred)

mse

0.12597099102336495

# Test CSV Submission

In [19]:
test_id = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')['PassengerId']

test_pred = linear_model.predict(X_test)

pd.DataFrame({
    'PassengerId': test_id,
    'Survived': test_pred
}).to_csv('/kaggle/working/submission.csv', index=False)