 Use the Titanic dataset. Perform the required pre-processing to clean the data and scale it to a range. Your task is to predict the survival chance of the passengers who boarded the ship. Build an ensemble model using decision tree, logistic regression, and support vector machine classifiers. Make the final prediction by combining the outputs of all three classifiers. Analyze the predictive ability of your model.


In [1]:
#Loading the Titanic passenger records from the hosted CSV into a DataFrame
import pandas as pd
#Source location of the dataset; kept in one place so it is easy to swap
TITANIC_CSV_URL = "https://raw.githubusercontent.com/kajalpanda1/DatasetRepository/master/Titanic.csv"
titanic_df = pd.read_csv(TITANIC_CSV_URL)

In [2]:
#Checking for NULL values: info() lists each column's dtype and non-null count,
#so any column whose count is below the row total contains missing entries
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  156 non-null    int64  
 1   Survived     156 non-null    int64  
 2   Pclass       156 non-null    int64  
 3   Name         156 non-null    object 
 4   Sex          156 non-null    object 
 5   Age          126 non-null    float64
 6   SibSp        156 non-null    int64  
 7   Parch        156 non-null    int64  
 8   Ticket       156 non-null    object 
 9   Fare         156 non-null    float64
 10  Cabin        31 non-null     object 
 11  Embarked     155 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 14.8+ KB


In [3]:
#Accessing first five records of the titanic dataset to eyeball the raw values
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
#Accessing column names of the titanic dataset (used to decide which columns to drop)
titanic_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
#Removing identifier/free-text columns and the mostly-empty Cabin column
#NOTE(review): Fare is numeric and fully populated in the info() output above;
#dropping it may discard useful signal - confirm this is intentional
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']
titanic_df = titanic_df.drop(columns=columns_to_drop)

In [6]:
#Accessing records after dropping columns to confirm only the kept columns remain
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [7]:
#Replacing missing ages with the median of the observed ages
median_age = titanic_df['Age'].median()
titanic_df['Age'] = titanic_df['Age'].fillna(median_age)

In [8]:
#Handling NULL values in Embarked column
#fillna('ffill') would store the literal text 'ffill' as if it were a port code,
#which later becomes a bogus extra category when the column is label-encoded.
#ffill() carries the previous row's port forward instead; the trailing bfill()
#only matters if the very first row is missing, which a forward fill cannot reach.
titanic_df['Embarked']=titanic_df['Embarked'].ffill().bfill()

In [9]:
#Again checking for any NULL values: every remaining column should now report
#a non-null count equal to the number of rows
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  156 non-null    int64  
 1   Pclass    156 non-null    int64  
 2   Sex       156 non-null    object 
 3   Age       156 non-null    float64
 4   SibSp     156 non-null    int64  
 5   Parch     156 non-null    int64  
 6   Embarked  156 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 8.7+ KB


In [10]:
#Converting the text columns Sex and Embarked into integer codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
#The single encoder is re-fitted per column, so after the loop it only
#remembers the classes of the last column processed (Embarked)
for categorical_column in ('Sex', 'Embarked'):
    titanic_df[categorical_column] = le.fit_transform(titanic_df[categorical_column])

In [11]:
#Previewing the records after encoding: Sex and Embarked now hold integer codes
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,1,22.0,1,0,2
1,1,1,0,38.0,1,0,0
2,1,3,0,26.0,0,0,2
3,1,1,0,35.0,1,0,2
4,0,3,1,35.0,0,0,2


In [12]:
#Separating the label from the predictors by column position
#Column 0 is Survived (the target); every column after it is a feature.
#Despite the _df suffix, .values makes both results NumPy arrays, not DataFrames.
target_df = titanic_df.iloc[:, 0].values
feature_df = titanic_df.iloc[:, 1:].values

In [13]:
#Sanity check: (rows, number of feature columns)
feature_df.shape

(156, 6)

In [14]:
#Sanity check: one target label per row
target_df.shape

(156,)

In [15]:
#Standardizing the features of the titanic dataset (per-column centring and scaling)
#NOTE(review): the scaler is fitted on all rows here, before the train/test split
#in the next cell, so test-set statistics leak into the scaling; consider
#splitting first and calling fit_transform on the training rows only.
#NOTE(review): the task statement asks for scaling "in a range"; StandardScaler
#does not bound values - confirm whether a min-max scaler was intended.
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
feature_df=ss.fit_transform(feature_df)

In [16]:
#Holding out 20% of the rows for testing; the fixed seed keeps the split repeatable
#NOTE(review): no stratify= is passed, so class balance may differ between the
#two splits on a dataset this small - confirm this is acceptable
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    feature_df,
    target_df,
    test_size=0.2,
    random_state=3,
)

In [17]:
#Printing the shape of each split in the order: train X, train y, test X, test y
for split_array in (x_train, y_train, x_test, y_test):
    print(split_array.shape)

(124, 6)
(124,)
(32, 6)
(32,)


In [18]:
#Training a depth-limited decision tree on the training split
from sklearn.tree import DecisionTreeClassifier
#max_depth caps tree growth; random_state fixes the tie-breaking for repeatability
titanic_tree = DecisionTreeClassifier(random_state=10, max_depth=5)
titanic_tree.fit(x_train, y_train)
#fit() hands back the estimator itself, so model_dt is the same fitted object
model_dt = titanic_tree

In [19]:
#Displaying the fitted decision tree's hyper-parameters
model_dt

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=10, splitter='best')

In [20]:
#Predicting test instances with decision tree model
y_pred_dt=model_dt.predict(x_test)

In [21]:
#Reporting decision tree accuracy on the training split and the held-out split
for split_name, split_x, split_y in (
    ("Training Accuracy:", x_train, y_train),
    ("Testing Accuracy:", x_test, y_test),
):
    print(split_name, model_dt.score(split_x, split_y))

Training Accuracy: 0.8548387096774194
Testing Accuracy: 0.875


In [22]:
#Training a logistic regression classifier on the training split
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression(random_state=10, solver='liblinear')
lgr.fit(x_train, y_train)
#fit() hands back the estimator itself, so model_lgr is the same fitted object
model_lgr = lgr

In [23]:
#Displaying the fitted logistic regression's hyper-parameters
model_lgr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=10, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
#Predicting test instances with logistic regressor model
y_pred_lgr=model_lgr.predict(x_test)

In [25]:
#Reporting logistic regression accuracy on the training split and the held-out split
for split_name, split_x, split_y in (
    ("Training Accuracy:", x_train, y_train),
    ("Testing Accuracy:", x_test, y_test),
):
    print(split_name, model_lgr.score(split_x, split_y))

Training Accuracy: 0.7983870967741935
Testing Accuracy: 0.84375


In [26]:
#Training a linear-kernel support vector classifier on the training split
from sklearn import svm
svc = svm.SVC(C=0.1, kernel='linear')
svc.fit(x_train, y_train)
#fit() hands back the estimator itself, so model_svc is the same fitted object
model_svc = svc

In [27]:
#Displaying the fitted support vector classifier's hyper-parameters
model_svc

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
#Predicting test instances with Support Vector Classifier model
y_pred_svc=model_svc.predict(x_test)

In [29]:
#Reporting support vector classifier accuracy on the training split and the held-out split
for split_name, split_x, split_y in (
    ("Training Accuracy:", x_train, y_train),
    ("Testing Accuracy:", x_test, y_test),
):
    print(split_name, model_svc.score(split_x, split_y))

Training Accuracy: 0.7983870967741935
Testing Accuracy: 0.84375


In [30]:
#ENSEMBLE VOTING CLASSIFIER
#Combines the three base classifiers; with voting='hard' the final label is the
#class predicted by the majority of them
from sklearn.ensemble import VotingClassifier
estimators = [
    ('decision tree', model_dt),
    ('logistic regression', model_lgr),
    ('svm', model_svc),
]
ensemble = VotingClassifier(estimators=estimators, voting='hard')

#Fitting model with training instances
#NOTE(review): VotingClassifier is understood to refit its own copies of the
#estimators here rather than reuse the already-fitted ones - confirm in the docs
ensemble.fit(x_train, y_train)

VotingClassifier(estimators=[('decision tree',
                              DecisionTreeClassifier(ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=5,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort='deprecated',
                                                     random_state=1

In [31]:
#Reporting voting classifier accuracy on the training split and the held-out split
for split_name, split_x, split_y in (
    ('Training Accuracy:', x_train, y_train),
    ('Testing Accuracy:', x_test, y_test),
):
    print(split_name, ensemble.score(split_x, split_y))

Training Accuracy: 0.8064516129032258
Testing Accuracy: 0.84375


In [32]:
#Predicting test instances with voting classifier
y_pred_vot=ensemble.predict(x_test)

In [33]:
#Per-class precision, recall and F1 of the voting ensemble on the test split
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=y_pred_vot))

              precision    recall  f1-score   support

           0       0.95      0.83      0.88        23
           1       0.67      0.89      0.76         9

    accuracy                           0.84        32
   macro avg       0.81      0.86      0.82        32
weighted avg       0.87      0.84      0.85        32

