# First Attempt at Titanic Dataset from Kaggle

In [33]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd

In [34]:
import pandas as pd
import numpy as np
train_data = pd.read_csv('datasets/train.csv')
test_data = pd.read_csv('datasets/test.csv')

## Exploring the dataset

Taking a look at the training data provided

In [35]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [36]:
train_data.shape

(891, 12)

The above shows us all the details of the data including the number of features and the amount of data available

## Starting with training the model

Setting target and train columns
For my first attempt I am choosing the features Pclass, Age and Sex to train my model

In [37]:
column_target = ['Survived']
column_train = ['Pclass', 'Sex', 'Age']
column_test = ['PassengerId', 'Survived']

Now I am assigning the training features to X axis and the target to Y axis

In [38]:
X = train_data[column_train]
Y = train_data[column_target]
X_train = test_data[column_train]
Y_train = train_data[column_test]

Now we will check is there are any NaN values in the features we have selected

In [39]:
X['Pclass'].isnull().sum()

0

In [40]:
X['Sex'].isnull().sum()

0

In [41]:
X['Age'].isnull().sum()

177

Now as the Age feature has NaN values we need to do something with those empty rows in the Age feature

For my first attempt I am going to consider the mean value of age to fill in the NaN Age rows

In [47]:
X['Age'] = X['Age'].fillna(X['Age'].mean())
#X_train['Age'] = X_train['Age'].fillna(X['Age'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [48]:
#Confirming the NaN values in Age feature now
X['Age'].isnull().sum()

0

Now as the Sex feature is not in int values so we will preprocess the data by setting 0 for males and 1 for females

In [49]:
d = {'male': 0, 'female': 1}
X['Sex'] = X['Sex'].apply(lambda x:d[x])
X_train['Sex'] = X_train['Sex'].apply(lambda x:d[x])
X['Sex'].head()

KeyError: 0

## Generating results using the LinearSVC model

In [50]:
#training using the complete train file now
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X,Y)
#test_data[['PassengerId', 'Survived']].to_csv('datasets/titanic/men_died.csv', index=False)

  y = column_or_1d(y, warn=True)


SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [51]:
test_data['Survived'] = clf.predict(X_train)
test_data[['PassengerId', 'Survived']].to_csv('datasets/first_attempt.csv', index=False)
print (test_data[['PassengerId', 'Survived']])

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
5            897         0
6            898         0
7            899         0
8            900         0
9            901         0
10           902         0
11           903         0
12           904         1
13           905         0
14           906         0
15           907         1
16           908         0
17           909         0
18           910         0
19           911         0
20           912         0
21           913         0
22           914         1
23           915         0
24           916         0
25           917         0
26           918         1
27           919         0
28           920         0
29           921         0
..           ...       ...
388         1280         0
389         1281         0
390         1282         0
391         1283         0
392         1284         0
3