In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [37]:
#creating the training dataframe
df = pd.read_csv('train.csv')
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [38]:
#Changing Sex to binary values and dropping NaN values from Age and Sex
df.dropna(subset=['Sex', 'Age'])
sex = {'male': 0,'female': 1}
df.Sex = [sex[item] for item in df.Sex]

In [39]:
#computing statistics for each column of the dataframe
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,0.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,1.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


In [40]:
#Looking at correlation matrix of the dataframe
df.corr()
#This tells us that Sex has the highest positive correlation with Survived. 
#We can also see that Fare has a weak positive correlation with Survived, and Pclass has a weak negative correlation with Survived

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,0.543351,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.369226,0.083081,0.018443,-0.5495
Sex,-0.042939,0.543351,-0.1319,1.0,-0.093254,0.114631,0.245489,0.182333
Age,0.036847,-0.077221,-0.369226,-0.093254,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,0.114631,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,0.245489,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.182333,0.096067,0.159651,0.216225,1.0


In [44]:
X = df[['Sex', 'Fare', 'Pclass']]
y = df['Survived']
#Splitting training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [45]:
#creating the logistic regression model and fitting the training data to it
model = LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [48]:
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
accuracy

0.753731343283582

In [100]:
#Creating the test dataframe and changing Sex to binary values and dropping NaN values from Sex
test_df = pd.read_csv('test.csv')
test_df.head(10)
sex = {'male': 0,'female': 1}
test_df.Sex = [sex[item] for item in test_df.Sex]
test_df.dropna(subset=['Sex'])
#Replacing NaN values in Fare with the median value
test_df["Fare"].fillna(test_df["Fare"].median(skipna=True), inplace=True)

In [106]:
train = df[['Sex', 'Fare', 'Pclass']]
test = test_df[['Sex', 'Fare', 'Pclass']]
test_id = test_df.PassengerId
model = LogisticRegression()
model.fit(train, y)
submission = pd.DataFrame({"PassengerId": test_id, "Survived": model.predict(test)})
submission.to_csv('submission.csv', index=False)



In [107]:
#Evaluating the model
model_pred = model.predict(X_test)
print(model.score(X_test,y_test))
print(classification_report(y_test, model_pred))

0.7574626865671642
              precision    recall  f1-score   support

           0       0.76      0.84      0.80       153
           1       0.76      0.64      0.69       115

    accuracy                           0.76       268
   macro avg       0.76      0.74      0.75       268
weighted avg       0.76      0.76      0.75       268

