In [125]:
import pandas as pd
import matplotlib.pyplot as plt
import os.path as path
%matplotlib inline 

plt.style.use("fivethirtyeight")

data_folder = '/Users/GraysTECH/BigQLabs/portfolio/machine-learning/kaggle-titanic/data/'

files = {"training_file":"train.csv", "testing_file":"test.csv"}

train_file = path.join(data_folder,files["training_file"])
test_file = path.join(data_folder,files["testing_file"])

#print(train_file, "\n",test_file)

titanic = pd.read_csv(train_file)
print(titanic.head(5))
print(titanic.describe())
print("Original columns in the dataset: {0} ".format(titanic.columns.tolist()))
print("Columns in the describe dataset: {0} ".format(titanic.describe().columns.tolist()))


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  



#### Missing values Strategy
Notice from above that the describe method return a count of 714 for the Age column, compare that to the count of 891 for all the other columns. This means that the Age column has missing values. This means that the data isnt perfectly clean and we are going to clean it ourselves.
We don't want to remove the rows with the missing values because more the data the more it helps us to train the algorithm better. The other option is to not consider the Age column and leave it out but that is not a good option because the age of the person might affect the chances of survival. Our strategy to address the missing value will be to assign the median values. 

In [126]:
# assign meadian age to the Age column
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
titanic["Age"].describe()

count    891.000000
mean      29.361582
std       13.019697
min        0.420000
25%       22.000000
50%       28.000000
75%       35.000000
max       80.000000
Name: Age, dtype: float64

In [127]:
print("Cabin column count: ",titanic["Cabin"].count())

Cabin column count:  204


When we used describe above it's clear that not all columns were shown as part of the summary table. The missing columns had non-numerical data as a result could not be summarized, the missing columns were : Name, Sex, Cabin, Ticket, Embarked. We will ignore the Ticket, Cabin and Embarked column. There are only 204 values in the Cabin column and it likely isn't a particularly informative field. The Ticket and Embarked columns are unlikely to give us much information without the domain knowledge about what the ticket numbers represent and how do they corelate with the Name column.

The Sex column is non-numeric but we want to keep it because it could be very informative. We can convert it to a numeric column by assigning unique numeric codes to the male and female categories.
We will assign a code of 0 for male and 1 for female.
We can select all the male values in the Sex column using
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0


In [128]:
# Find the unique values in the sex column, the reason we want to do this exercise is to find out if the there 
# are any values other than male and female assigned to the column

print("Unique values in the Sex column: ", titanic["Sex"].unique())

# Encode the male and femal values in the Sex column with 0 and 1 values respectively
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1

print("Unique values in the Sex column after encoding: ", titanic["Sex"].unique())



Unique values in the Sex column:  ['male' 'female']
Unique values in the Sex column after encoding:  [0 1]


The same way we can encode the values in the Embarked column to numeric codes thereby converting the text values to numeric.

In [129]:
# Find unique values in the Embarked column
print("Unique values in the Embarked column: ", titanic["Embarked"].unique())

print("Total frequency of the S value: ",titanic[titanic["Embarked"]=="S"]["Embarked"].count())

#print("Use Describe to get the summary statistics on the Embarked series:\r",titanic["Embarked"].describe())

# Notice there are nan Not a Number value associated with the certain rows, we have to fix this by assigning 
# the most common port in the dataset which is "S"
titanic["Embarked"] = titanic["Embarked"].fillna('S')

#print("Use Describe to get the summary statistics on the Embarked series after filling na's:\n",titanic["Embarked"].describe())

# We will assign the code 0 to S, 1 to C and 2 to Q
embarked_codes = {"S":0, "C":1, "Q":2}

titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2
#for k,v in embarked_codes.items():
    #titanic.loc[titanic["Embarked"] == k, "Embarked"] == v
    
print("Use Describe to get the summary statistics on the Embarked series after encoding:\n",titanic["Embarked"].describe())

Unique values in the Embarked column:  ['S' 'C' 'Q' nan]
Total frequency of the S value:  644
Use Describe to get the summary statistics on the Embarked series after encoding:
 count     891
unique      3
top         0
freq      646
Name: Embarked, dtype: int64


#### Linear Regression:
If we wanted to predict if somebody survived in the titanic crash based on their age then we could use a technique called Linear regression, the algorithm takes the form y = mx + b where m is the coefficient called the slope of the line, b is the Y intercept and x is the predictor and y is the outcome we are trying to predict.
Linear regression is a very powerful algorithm if the data is linear corelation but it has a few downsides, for e.g. 1. If the column and the outcome is not linearly related the prediction cannot be accurate. 
2. It wont give you probabilities of the outcome only absolute probabilities.

#### Cross Validation:
We want to train the data on a different data set than we want to make the predictions on. This is critical if we want to avoid overfitting. Overfitting is what happens when the model fits itself to noise rather than signal, Every dataset has its own quirks that do not exist in the full populations. For example if you were asked to predict the top speed of a car by its horsepower and other charateristics and gave you a dataset that randomly had cars with very high speeds, the resulting model will always overstate the speed. The way to figure out if your model is doing this is to evaluate the performance of the model on data that it has not been trained on.

Every Machine Learning algorithm can overfit, although linear regression is less prone to it. Cross validation is a technique used to avoid overfitting, you split the data into parts or folds, lets say 3 for example, you then do this:
1. Combine the first two parts, train the model, make predictions on the third part.
2. Combine the first and third parts, train the model and make predictions on the second part.
3. Combine the second and third parts, train the modeland make predictions on the first part.

This way we generate predictions on the entire dataset without ever evaluating the accuracy on the same data.

In [130]:
# Making Predictions : We will be using scikit-learn to make the predictions
from sklearn.linear_model import LinearRegression

# Sklearn has a helper class that makes it easy to do cross-validation
from sklearn.cross_validation import KFold

# We will be using the following columns to preict the outcome
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]

# Initialize the algorithm
linreg = LinearRegression()

# Generate cross-validation folds for the titanic dataset. It returns row indices corresponding to the train and test
# datsets
# We set random_state to ensure we get the same splits everytime we execute the code
kf = KFold(titanic.shape[0],n_folds = 3, random_state = 1)
predictions = []
print("kfolds: ",kf)
for train, test in kf:
    # The predictors we are using to train the algorithm, Note how we only take the rows in the train folds.
    train_predictors = (titanic[predictors].iloc[train,:])
    # The target we are using to train our algorithm
    train_target = (titanic["Survived"].iloc[train])
    
    # Training the algorithm using the predictors and the target
    linreg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = linreg.predict(titanic[predictors].iloc[test,:])
    predictions.append(test_predictions)

kfolds:  sklearn.cross_validation.KFold(n=891, n_folds=3, shuffle=False, random_state=1)


#### Evaluating the Performance of the model:
Now that we have the predictions, we need to evaluate the error to measure the performance of the model. From the Kaggle Comptetition description the error metric is a percentage of correct predictions. We will use the same metric to evaluate our performance locally.

The metric will basically involve in finding the number of values in predictions that have the same exact value in the titanic["Survived"], then by dividing by the number of passengers.

Before we do this we need to combine the 3 sets of predictions into 1 column. Since each set of predictions is a numpy array, we can use a numpy function concatenate them into one.


In [131]:
import numpy as np

# The predictions are in three separate numpy arrays.  Concatenate them into one.  
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <=.5] = 0
accuracy = sum(predictions[predictions == titanic["Survived"]])/len(predictions)
print("Accuracy: {0}".format(accuracy))

Accuracy: 0.7833894500561167




We have our first predictions! The accuracy is not that great it's about 78.3%. To improve the accuracy we will use Logistic Regression to instead output values between 0 and 1. One good way to think about the logistic regression is that it takes the output of the linear regression and maps it into a probability between 0 and 1. The mapping is done by the logit function. Passing any value to the logit function will map it to a value between 0 and 1 by squeezing the extreme values. This is good for us since we only care about two outcomes.

sklearn has a class for logistic regression that we can use. We will also make things easier by using an sklearn helper function to do all of our cross-validation and evaluation for us.


In [132]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

#Initialize our algorithm
alg = LogisticRegression(random_state = 1)

#Compute the accuracy scores for all the cross-validation folds.
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores)
print(scores.mean())


[ 0.78451178  0.78787879  0.79124579]
0.787878787879


#### Process the Test dataset and predict the survival of the passenger



In [133]:
titanic_test = pd.read_csv(test_file)

# assign meadian age to the Age column
titanic_test["Age"] = titanic_test["Age"].fillna(titanic_test["Age"].median())

# Encode the male and femal values in the Sex column with 0 and 1 values respectively
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1

# Notice there are nan Not a Number value associated with the certain rows, we have to fix this by assigning 
# the most common port in the dataset which is "S"
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")

titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0

titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2
print("Total records in test set: {0}".format(titanic_test.shape))
titanic_test.describe()

Total records in test set: (418, 11)




count    417.000000
mean      35.627188
std       55.907576
min        0.000000
25%             NaN
50%             NaN
75%             NaN
max      512.329200
Name: Fare, dtype: float64