# Importing Libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Getting Data

In [2]:
df = pd.read_csv("/kaggle/input/titanic/train.csv")
testing_data = pd.read_csv("/kaggle/input/titanic/test.csv")
if 'Survived' not in testing_data.columns:
    testing_data['Survived'] = 0

# EDA : Exploratory Data Analysis

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.shape

(891, 12)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
df.corr

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


<bound method DataFrame.corr of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                   

In [7]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Data Preprocessing 

## One-hot Encoding and Feature Engineering 

In [8]:
def Preprocess(df_test, df_train=pd.read_csv("/kaggle/input/titanic/train.csv")):    
    df = pd.concat([df_train, df_test], axis=0)
    df = df.drop('Name', axis = 1)
    df = df.drop('Ticket', axis = 1)
    df['Cabin'] = df['Cabin'].fillna('X000')
    df['Embarked'] = df['Embarked'].fillna('X')
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Apply the split_cabin_multiple function to each row
    df['cabin_letter'] = df['Cabin'].str.extract(r'([a-zA-Z]+)', expand=False)
    df['cabin_number'] = df['Cabin'].str.extract(r'(\d+)', expand=False)

    # One-hot encode the "cabin_letter" column
    df_encoded = pd.get_dummies(df, columns=['cabin_letter'], prefix='cabin')
    df_encoded = pd.get_dummies(df_encoded, columns=['Embarked'], prefix='Embarked')
    df_encoded = pd.get_dummies(df_encoded, columns=['Sex'])
    # Drop the original "cabin" column
    df_encoded = df_encoded.drop(columns=['Cabin'])
    df_encoded = df_encoded.drop(columns=['cabin_X'])
    df_encoded = df_encoded.drop(columns=['Embarked_X'])
    df = df_encoded
    df['cabin_number'] = df['cabin_number'].fillna(0)
    df['cabin_number'] = pd.to_numeric(df['cabin_number'])

    #Feature Engineering
    df['Pclass_bin_Fare'] = df['Fare'] // df['Pclass'] 
    df['Pclass_bin_Sex'] =  df['Pclass'] - df['Sex_female']

    df_train = df[:len(df_train)]
    df_test = df[len(df_train):]

    df_test = df_test.drop(columns=['Survived'])

    return df_train, df_test

In [9]:
df, testing_data = Preprocess(testing_data)

In [10]:
df.isna().sum()

PassengerId        0
Survived           0
Pclass             0
Age                0
SibSp              0
Parch              0
Fare               0
cabin_number       0
cabin_A            0
cabin_B            0
cabin_C            0
cabin_D            0
cabin_E            0
cabin_F            0
cabin_G            0
cabin_T            0
Embarked_C         0
Embarked_Q         0
Embarked_S         0
Sex_female         0
Sex_male           0
Pclass_bin_Fare    0
Pclass_bin_Sex     0
dtype: int64

In [11]:
df.corr()['Survived']

PassengerId       -0.005007
Survived           1.000000
Pclass            -0.338481
Age               -0.070323
SibSp             -0.035322
Parch              0.081629
Fare               0.257307
cabin_number       0.229756
cabin_A            0.022287
cabin_B            0.175095
cabin_C            0.114652
cabin_D            0.150716
cabin_E            0.145321
cabin_F            0.057935
cabin_G            0.016040
cabin_T           -0.026456
Embarked_C         0.168240
Embarked_Q         0.003650
Embarked_S        -0.155660
Sex_female         0.543351
Sex_male          -0.543351
Pclass_bin_Fare    0.267823
Pclass_bin_Sex    -0.533994
Name: Survived, dtype: float64

# Splitting Data

In [12]:
X = df.drop(columns=['Survived'])
y = df['Survived']
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, y, test_size=0.2)
Train_Y = np.reshape(Train_Y,(-1, 1))

# Trying Different Models

## Logistic Regression

In [13]:
model = LogisticRegression(max_iter=1000)
model.fit(Train_X, Train_Y)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
accuracy_score(Test_Y, model.predict(Test_X))

0.7932960893854749

## XGBoost

In [15]:
model_2 = XGBClassifier(enable_categorical = True)
model_2.fit(Train_X, Train_Y)

In [16]:
accuracy_score(Test_Y, model_2.predict(Test_X))

0.7932960893854749

## Decision Tree

In [17]:
model_3 = DecisionTreeClassifier()
model_3.fit(Train_X, Train_Y)

In [18]:
accuracy_score(Test_Y, model_3.predict(Test_X))

0.7430167597765364

## Random Forest

In [19]:
model_4 = RandomForestClassifier()
model_4.fit(Train_X, Train_Y.squeeze())


In [20]:
accuracy_score(Test_Y, model_4.predict(Test_X))

0.7988826815642458

# Selecting Model and Saving csv file

In [21]:
pred = model.predict(testing_data)

final = pd.DataFrame()
final['PassengerId'] = testing_data['PassengerId']
final['Survived'] = pred

final.to_csv('gender_submission.csv', index=False)