In [None]:
import pandas as pd
from pandas import Series, DataFrame

In [None]:
titanic_df = pd.read_csv('/root/hackerday/01_titanic/train.csv')

Read more about this function by running `pd.read_csv?`

In [None]:
# Accessing Columns
print titanic_df['Name'].head() # one column
print titanic_df[['Name', 'Pclass']].head() # Two columns

In [None]:
titanic_df.shape

---
### Looking at the data

- We know that women and children were more likely to survive. Thus, Age and Sex are probably good predictors. 

- It's also logical to think that passenger class might affect the outcome, as first class cabins were closer to the deck of the ship. 

- Fare is tied to passenger class, and will probably be highly correlated with it, but might add some additional information. 

- Number of siblings and parents/children will probably be correlated with survival one way or the other, as either there are more people to help you, or more people to think about and try to save.

### Some more information from inspecting the data

- Age, Cabin have missing values

---

### Steps in the Data Science Process

1. Data Inspection
2. Data Understanding
3. Data Preparation (cleaning - treat missing values, treat outliers and other bad data, Normalization)
4. Split the Data
5. Visualize
6. Train the Model
7. Test the Model (deploy on unseen data)
8. Evaluate the performance of the Model

---

### Statistical Summaries using `describe()`

In [None]:
titanic_df.describe()

In [None]:
# Numeric Series - Binary
titanic_df['Survived'].describe()

In [None]:
# Numeric Series - Float
titanic_df['Fare'].describe()

In [None]:
# Categorical Series
titanic_df['Sex'].describe()

In [None]:
titanic_df['Pclass'].value_counts()

---

### The `plot()` method to inspect data visually

In [None]:
import matplotlib.pyplot as plt
%pylab inline

# This will enable plotting functionality
# and make sure the plots are displayed inside the notebook

In [None]:
titanic_df['Fare'].plot(kind="hist")

In [None]:
# Task 1
#
# MAKE A HISTOGRAM FOR AGE - Make a note of what you see
#
#

In [None]:
titanic_df['Fare'].plot(kind='hist')
# Fare is an example of a SKEWED variable

In [None]:
print titanic_df.Fare.mean()
print titanic_df.Fare.median()

In [None]:
titanic_df['Age'].plot(kind="hist")
# Age is an example of a (approximately) Normally Distributed Variable

In [None]:
# Task 2
#
# MAKE A BAR CHART FOR Parch - Make a note of what you see
#
#

In [None]:
titanic_df['Parch'].value_counts().plot(kind='bar')

In [None]:
titanic_df['SibSp'].value_counts().plot(kind='bar')

---

### The `groupby()` method to inspect relationships in the data

- Pretty much the same as the SQL group-by.
- Can also interpret it as having the same functionality as Pivot Tables in Excel.

In [None]:
titanic_df[['Survived', 'Age', 'Fare']].groupby('Survived').mean()

In [None]:
titanic_df[['Pclass', 'Survived', 'Age', 'Fare']].groupby('Pclass').median()

In [None]:
# Task 3
#
# Group by number of siblings and see how much they paid on average for a ticket
#
#

In [None]:
titanic_df[['SibSp', 'Fare']].groupby('SibSp').mean()

In [None]:
# Task 4
#
# What is the % of survivors by Pclass
#
#

In [None]:
titanic_df.Survived.mean() == titanic_df['Survived'].mean()

In [None]:
titanic_df[['Pclass', 'Survived']].groupby('Pclass').mean()

---

### Handling Missing Data with `fillna()`

As we can see that PassengerId counr is 891 whereas Age count is 714. It means there are 77 rows in which Age is missing.

This means that the data isn't perfectly clean, and we're going to have to clean it ourselves. 

Note:

- We don't want to have to remove the rows with missing values, because more data helps us train a better algorithm. 
- We also don't want to get rid of the whole column, as age is probably fairly important to our analysis.

There are many strategies for cleaning up missing data, but a simple one is to just fill in all the missing values with the median of all the values in the column. 

---
#### Fill Missing Values for the Age Variable (Numeric)

In [None]:
titanic_df['Age'].isnull().tail()

In [None]:
# we can get the median of the column by applying median function on it.
print titanic_df["Age"].median()
print titanic_df["Age"].mean()

In [None]:
Age_median = titanic_df["Age"].median()

In [None]:
# Using the fillna method to impute datab
titanic_df["Age"].fillna(Age_median, inplace=True)
# Alternate method
titanic_df["Age"] = titanic_df["Age"].fillna(Age_median)

In [None]:
titanic_df['Age'].tail()

In [None]:
titanic_df.Age.describe()

---

#### Fill Missing Values for the Embarked Variable (Categorical)

In [None]:
# Task 5
# Find some rows that have missing values for variable Embarked
titanic_df[titanic_df.Embarked.isnull()].head()

In [None]:
# To find the mode
titanic_df.Embarked.describe()

In [None]:
# Task 6
# Find the mode of Embarked
titanic_df.Embarked.value_counts()

In [None]:
# Task 7
# Use the .fillna() method to impute missing values
titanic_df.Embarked.fillna('S', inplace=True)

---

### Convert Categoricals into Binary Variables

In [None]:
titanic_df.loc[titanic_df['Sex'] == 'male', 'Sex'] = 0
titanic_df.loc[titanic_df['Sex'] == 'female', 'Sex'] = 1

In [None]:
titanic_df['Sex'].unique()

In [None]:
# Task 8
#
# Convert Embarked into a binary variable

In [None]:
titanic_df.Embarked.value_counts()

In [None]:
# convert "S" to 0, "C" to 1 and "Q" to 2 in Embarked column
titanic_df.loc[titanic_df["Embarked"] == "S", "Embarked"] = 0
titanic_df.loc[titanic_df["Embarked"] == "C", "Embarked"] = 1
titanic_df.loc[titanic_df["Embarked"] == "Q", "Embarked"] = 1

#print the unique values of Embarked column
print(titanic_df["Embarked"].unique())

---
### Essentials of Modeling (Overfitting and Cross Validation)

> The aim of all machine learning is generalization.

We want to train the algorithm on different data than we make predictions on. This is critical if we want to avoid overfitting. Overfitting is what happens when a model fits itself to "noise", not signal. Every dataset has its own quirks that don't exist in the full population. For example, if I asked you to predict the top speed of a car from its horsepower and other characteristics, and gave you a dataset that randomly had cars with very high top speeds, you would create a model that overstated speed. The way to figure out if your model is doing this is to evaluate its performance on data it hasn't been trained using.

Every machine learning algorithm can overfit, although some (like linear regression) are much less prone to it. If you evaluate your algorithm on the same dataset that you train it on, it's impossible to know if it's performing well because it overfit itself to the noise, or if it actually is a good algorithm.

Luckily, cross validation is a simple way to avoid overfitting. To cross validate, you split your data into some number of parts (or "folds"). Lets use 3 as an example. You then do this:
* Combine the first two parts, train a model, make predictions on the third.

* Combine the first and third parts, train a model, make predictions on the second.

* Combine the second and third parts, train a model, make predictions on the first.

This way, we generate predictions for the whole dataset without ever evaluating accuracy on the same data we train our model using.

In [None]:
titanic_df.columns.values

In [None]:
# The columns we'll use to predict the target
predictors_dim = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

A short note on using Scikit Learn

- There are model families from which we import Estimators
- Declaring an estimator object exposes its methods to us
- These include fit, transform and predict
- Paramters are defined when declaring the Estimator object

## Logistic Regression

One good way to think of logistic regression is that it takes the output of a linear regression, and maps it so it is between 0 and 1. We will do this with the logit function. Passing any value through the logit function will map it to a value between 0 and 1 by "squeezing" the extreme values. This is perfect for us, because we only care about two outcomes.

Sklearn has a class for logistic regression that we can use. We'll also make things easier by using an sklearn helper function to do all of our cross validation and evaluation for us.

In [None]:
algo2

In [None]:
from sklearn import cross_validation as cv
from sklearn.linear_model import LogisticRegression

# Initialize our algorithm
algo2 = LogisticRegression(random_state=1)

# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)

scores = cv.cross_val_score(algo2, titanic_df[predictors_dim], titanic_df["Survived"], cv=5)
# Take the mean of the scores (because we have one for each fold)

print(scores.mean())

In [None]:
algo2.

In [None]:
# Support Vector Machines
from sklearn.svm import SVC
svc_obj = SVC(kernel='linear')
svc_scores = cv.cross_val_score(svc_obj, titanic_df[predictors_dim], titanic_df["Survived"], cv=5)
print svc_scores.mean()

---
### Lets process the test case

Process titanic_test the same way we processed titanic.

This involved:

Replace the missing values in the "Age" column with the median age from the train set. The age has to be the exact same value we replaced the missing ages in the training set with (it can't be the median of the test set, because this is different). You should use titanic["Age"].median() to find the median.

Replace any male values in the Sex column with 0, and any female values with 1.

Fill any missing values in the Embarked column with S.

In the Embarked column, replace S with 0, C with 1, and Q with 2.

We'll also need to replace a missing value in the Fare column. Use .fillna with the median of the column in the test set to replace this. There are no missing values in the Fare column of the training set, but test sets can sometimes be different.

In [None]:
titanic_test = pd.read_csv('/root/hackerday/01_titanic/test.csv')
titanic_test["Age"] = titanic_test["Age"].fillna(titanic_df["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0 
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")

titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2


In [None]:
# Initialize the algorithm class
alg = LogisticRegression(random_state=1)

# Train the algorithm using all the training data
alg.fit(titanic_df[predictors_dim], titanic_df["Survived"])

# Make predictions using the test set.
predictions = alg.predict(titanic_test[predictors_dim])

# Create a new dataframe 
submission = pd.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })

print submission.head()

In [165]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [166]:
titanic_test.head(1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,2


In [163]:
alg.coef_

array([[-0.99196504,  2.60300021, -0.0341005 , -0.30937218, -0.07846287,
         0.00329483,  0.23985491]])

In [164]:
predictors_dim

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

y = -0.99 * Pclass + 2.60 * Sex - 0.03 * Age - ....