# Import Libraries and Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("train.csv")

# Fill up the missing values - Embarked and Age
### Since most travellers embarked from Southampton so I replaced the missing value with S
### For age I have taken median as Max age was 80 and mean is not suitable 
#### Dropping the unwanted columns Ticket, PassengerID, Name etc

In [3]:
train.loc[train['Embarked'].isna(), 'Embarked']='S'
train.loc[train['Age'].isna(), 'Age']= train.Age.median()
train = train.drop(columns=['Ticket','PassengerId','Name','Cabin'], axis=1)

In [4]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Splitting of data into x-independent and y-dependent

In [5]:
x = train.drop('Survived', axis=1)
y = train.drop(train.iloc[:,1:], axis=1)
y = np.ravel(y)

In [6]:
x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [7]:
x.dtypes

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

# Encoding for ML algorithms to work

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
x.iloc[:,1] = le.fit_transform(x.iloc[:,1].values)
x.iloc[:,6] = le.fit_transform(x.iloc[:,6].values)
x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


# Feature Scalling

In [9]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
feature_scale = ['Age','Fare']
x[feature_scale] = sc.fit_transform(x[feature_scale])

In [10]:
x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,-0.565736,1,0,-0.502445,2
1,1,0,0.663861,1,0,0.786845,0
2,3,0,-0.258337,0,0,-0.488854,2
3,1,0,0.433312,1,0,0.42073,2
4,3,1,0.433312,0,0,-0.486337,2


# Splitting in train and test dataset

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [12]:
x_train.shape

(712, 7)

In [13]:
x_test.shape

(179, 7)

In [14]:
y_train.shape

(712,)

In [15]:
y_test.shape

(179,)

In [16]:
from sklearn.metrics import precision_score, accuracy_score
from sklearn.metrics import confusion_matrix

# Applying 7 Classification Algorithms to the train and test datasets

# 1. Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(
    penalty='l2',
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='lbfgs',
    max_iter=100,
    multi_class='auto',
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None,)
lr.fit(x_train,y_train)
y_pred = lr.predict(x_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('Precision: %.3f' % precision_score(y_test, y_pred))

Accuracy: 0.810
Precision: 0.786


# 2. Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB(priors=None, var_smoothing=1e-09)
nb.fit(x_train, y_train)
y_pred = nb.predict(x_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('Precision: %.3f' % precision_score(y_test, y_pred))

Accuracy: 0.777
Precision: 0.713


# 3.KNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier( 
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
    metric_params=None,
    n_jobs=None,)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('Precision: %.3f' % precision_score(y_test, y_pred))

Accuracy: 0.799
Precision: 0.750


# 4.Decision Tree

In [20]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=100,
    min_samples_split=2,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight=None,
    presort='deprecated',
    ccp_alpha=0.0,)
dtc.fit(x_train, y_train)
y_pred = dtc.predict(x_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('Precision: %.3f' % precision_score(y_test, y_pred))

Accuracy: 0.849
Precision: 0.851


# 5.Random Forrest

In [21]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=101,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('Precision: %.3f' % precision_score(y_test, y_pred))

Accuracy: 0.827
Precision: 0.852


# 6.Stochastic Gradient Descent

In [22]:
from sklearn.linear_model import SGDClassifier
sgd =  SGDClassifier(    
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    l1_ratio=0.15,
    fit_intercept=True,
    max_iter=1000,
    tol=0.001,
    shuffle=True,
    verbose=0,
    epsilon=0.1,
    n_jobs=None,
    random_state=None,
    learning_rate='optimal',
    eta0=0.0,
    power_t=0.5,
    early_stopping=False,
    validation_fraction=0.1,
    n_iter_no_change=5,
    class_weight=None,
    warm_start=False,
    average=False,)
sgd.fit(x_train, y_train)
y_pred=sgd.predict(x_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('Precision: %.3f' % precision_score(y_test, y_pred))

Accuracy: 0.726
Precision: 0.963


# 7. Support Vector Machines

In [23]:
from sklearn.svm import SVC
svm =  SVC(
    C=1.0,
    kernel='rbf',
    degree=3,
    gamma='scale',
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=0.001,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    break_ties=False,
    random_state=None,)
svm.fit(x_train, y_train)
y_pred=svm.predict(x_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('Precision: %.3f' % precision_score(y_test, y_pred))

Accuracy: 0.816
Precision: 0.806


# Since I have found the most accurate model from above, so I will apply the model on the whole train dataset and bring the test dataset to predict the unknown y value and submit the same to kaggle. 

In [24]:
test = pd.read_csv("test.csv")


test.loc[test['Fare'].isna(), 'Fare']=test.Fare.median()
test.loc[test['Age'].isna(), 'Age']= test.Age.median()
test1 = test.drop(columns=['Ticket','PassengerId','Name','Cabin'], axis=1)

In [25]:
test1.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [26]:
test1.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [27]:
test1.shape

(418, 7)

In [28]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
test1.iloc[:,1] = le.fit_transform(test1.iloc[:,1].values)
test1.iloc[:,6] = le.fit_transform(test1.iloc[:,6].values)
test1.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0,2
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,2
4,3,0,22.0,1,1,12.2875,2


In [29]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
feature_scale = ['Age','Fare']
test1[feature_scale] = sc.fit_transform(test1[feature_scale])

In [30]:
test1.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,0.386231,0,0,-0.497413,1
1,3,0,1.37137,1,0,-0.512278,2
2,2,1,2.553537,0,0,-0.4641,1
3,3,1,-0.204852,0,0,-0.482475,2
4,3,0,-0.598908,1,1,-0.417492,2


In [31]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=100,
    min_samples_split=2,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight=None,
    presort='deprecated',
    ccp_alpha=0.0,)
dtc.fit(x, y)
y_pred = dtc.predict(test1)


In [32]:
y_pred.shape

(418,)

In [33]:
y_pred

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,

In [34]:
submission = pd.DataFrame({
    "PassengerId" : test['PassengerId'],
    "Survived" : y_pred
})

In [35]:
submission.to_csv('titanic_pred', index=False)