In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Load and Inspect Data

In [63]:
train_df = pd.read_csv('train.csv')

In [64]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [65]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Clean Data

In [66]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [67]:
train_df.Age.fillna(value=train_df.Age.median(), inplace=True)

In [68]:
train_df.Sex = train_df.Sex.map({'male':1, 'female':0})

In [69]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [71]:
x = train_df[['Pclass', 'Age', 'SibSp', 'Parch', 'Sex']]
y = train_df['Survived']

# Train Test Split

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [83]:
x_tr_1, x_te_1, y_tr_1, y_te_1 = train_test_split(x, y)
print(x_tr_1)

     Pclass   Age  SibSp  Parch  Sex
504       1  16.0      0      0    0
175       3  18.0      1      1    1
483       3  63.0      0      0    0
22        3  15.0      0      0    0
108       3  38.0      0      0    1
..      ...   ...    ...    ...  ...
780       3  13.0      0      0    0
559       3  36.0      1      0    0
638       3  41.0      0      5    0
148       2  36.5      0      2    1
751       3   6.0      0      1    1

[668 rows x 5 columns]


In [74]:
scaler = StandardScaler()

Regularisation for Logistic Regression

In [84]:
train_features = scaler.fit_transform(x_tr_1)
test_features = scaler.transform(x_te_1)
print(train_features)

[[-1.51967017 -1.04653335 -0.47605684 -0.46924403 -1.36187402]
 [ 0.8356423  -0.89349958  0.44303556  0.76970858  0.73428231]
 [ 0.8356423   2.54976025 -0.47605684 -0.46924403 -1.36187402]
 ...
 [ 0.8356423   0.86638878 -0.47605684  5.725519   -1.36187402]
 [-0.34201394  0.5220628  -0.47605684  2.00866119  0.73428231]
 [ 0.8356423  -1.8117022  -0.47605684  0.76970858  0.73428231]]


In [76]:
model = LogisticRegression()

In [77]:
model.fit(train_features, y_tr_1)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [78]:
model.score(train_features, y_tr_1)

0.812874251497006

In [79]:
model.score(test_features, y_te_1)

0.7668161434977578

In [88]:
y_predictions = model.predict(test_features)

In [89]:
acc_score = accuracy_score(y_te_1, y_predictions)

In [90]:
print(acc_score)

0.852017937219731


# Load and Clean Test Data

In [85]:
test_df = pd.read_csv('test.csv')

In [86]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [87]:
test_df.Sex = test_df.Sex.map({'male':1, 'female':1})

In [91]:
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [92]:
test_df.Age.fillna(inplace=True, value=test_df.Age.median())

In [98]:
final_x_train_scaled = scaler.fit_transform(x)
final_x_test = test_df[['Pclass', 'Age', 'SibSp', 'Parch', 'Sex']]
final_x_test_scaled = scaler.transform(final_x_test)

In [100]:
model.fit(final_x_train_scaled, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [102]:
y_predictions = model.predict(final_x_test_scaled)

In [103]:
test_df['Survived'] = pd.Series(y_predictions)

In [104]:
final_df = test_df.filter(['PassengerId', 'Survived'])

In [107]:
final_df.head(20)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0


In [116]:
len(final_df[final_df.Survived == 1])

28