In [None]:
#Kaggle Titenic Prediction
#Jibran Karim
#2:43
#11/9/2018

In [69]:
#importing models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree

In [2]:
#importing other libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [18]:
#importing libraries to check accuracy of models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
#importing the data sets
train_set = pd.read_csv("train.csv")
test_set = pd.read_csv("test.csv")

In [4]:
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
#changing the male feature to numeric value so we can use it for prediction
#we make a new feature named Gender on both sets and store numeric values of sex in it
train_set['Gender'] = pd.factorize(train_set['Sex'])[0]
test_set['Gender'] = pd.factorize(test_set['Sex'])[0]

In [58]:
#to estimate missing Age values we will use Gender and Pclass, we may also use name to elicit a tile to get more details about
#the age so we will extract name title from each name and store it in list for both data sets
train_set['Title'] = train_set.Name.apply(lambda x: x.split(",")[1].split(".")[0].strip())
test_set['Title'] = test_set.Name.apply(lambda x: x.split(",")[1].split(".")[0].strip())

#model can only understand numbers so we factorize titles to numbers as well
train_set['Title'] = pd.factorize(train_set['Title'])[0]
test_set['Title'] = pd.factorize(test_set['Title'])[0]
test_set.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,1


In [66]:
#now we can use Pclass, gender and title to fill the missing values of age according to the selected features

grouped_features_train = train_set.groupby(['Pclass','Gender','Title'])
grouped_features_test = test_set.groupby(['Pclass','Gender','Title'])

##filling avegrage values of age for the grouped features

train_set.Age = grouped_features_train.Age.apply(lambda x: x.fillna(x.median()))
test_set.Age = grouped_features_train.Age.apply(lambda y: y.fillna(y.median()))

In [60]:
train_set.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
Gender         False
Title          False
dtype: bool

In [65]:
test_set.isnull().any()

PassengerId    False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
Gender         False
Title          False
dtype: bool

In [67]:
# now we have our desired data without any missing values
#we can now make our test and train sets from train_set to train our model and check accuracy score

X = train_set[['Pclass','Gender','Age','Title']]
y = train_set.Survived
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size = 0.25)

In [70]:
#now we can check multiple models for greater accuracy one by one

#for knn we find best tuning perameter
scores = []
for k in range(1,31):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(xtrain,ytrain)
    prediction = knn.predict(xtest)
    scores.append(accuracy_score(prediction,ytest))

#now cross validating knn model

knn = KNeighborsClassifier(n_neighbors=4)
accuracy = cross_val_score(knn,xtrain,ytrain,cv=10)
print("Accuracy for KNN: ",accuracy.mean())

#now cross validating for logistic regression model

logRig = LogisticRegression()
accuracy = cross_val_score(logRig,xtrain,ytrain,cv=10)
print("Accuracy for Logistic Regression: ",accuracy.mean())

#now cross validation for Tree
tree = tree.DecisionTreeClassifier()
accuracy = cross_val_score(tree,xtrain,ytrain,cv=10)
print("Accuracy for Tree: ",accuracy.mean())

Accuracy for KNN:  0.7440843908798255
Accuracy for Logistic Regression:  0.7949018277595977
Accuracy for Tree:  0.7844075079149705


In [85]:
# we see, logistic regression seems most accurate, we will fit our train data to it and predict values for test data

logRig = LogisticRegression()
logRig.fit(X,y)

# we need another data frame for test data as we made for train data which is ready to use
# we also need another list to store passenger IDs so we can map it to prediction

XTest = test_set[['Pclass','Gender','Age','Title']]
PassID = test_set.PassengerId

survived=logRig.predict(XTest)

#now we can combine passenger id and survied 
Prediction_set = pd.DataFrame({'PassengerId': PassID, 'Survived': survived})


Prediction_set["Survived"].value_counts()

#now we can write a csv file

Prediction_set.to_csv("New prediction Set.csv",index=False)
