# Jupyter notebooks

This is an IPython notebook, running a python kernel underneath.

The state of the python interpreter, all the variables, class and function definitions are preserved between the cell executions. In case you want to reset that state (e.g. to check whether your code works correctly when the cells are executed in the direct order), go to `Kernel`->`Restart kernel...`

The most useful features of Jupyter:
 - contextual help (hit `TAB`)
 - quick access to documentation (`SHIFT+TAB`)

# Numpy and vectorized computing

Almost any machine learning model requires some computational heavy lifting usually involving linear algebra problems. Unfortunately, raw python is terrible at this because each operation is interpreted at runtime.

So instead, we'll use `numpy` - a library that lets you run blazing fast computation with vectors, matrices and other tensors. The god object here is `numpy.ndarray`:

**please keep running all the code cells as you read**

In [1]:
import numpy as np

a = np.array([1,2,3,4,5])
b = np.array([5,4,3,2,1])
print("a = ", a)
print("b = ", b)

# math and boolean operations can be applied to each element of an array
print("a + 1 =", a + 1)
print("a * 2 =", a * 2)
print("a == 2", a == 2)
# ... or corresponding elements of two (or more) arrays
print("a + b =", a + b)
print("a * b =", a * b)

a =  [1 2 3 4 5]
b =  [5 4 3 2 1]
a + 1 = [2 3 4 5 6]
a * 2 = [ 2  4  6  8 10]
a == 2 [False  True False False False]
a + b = [6 6 6 6 6]
a * b = [5 8 9 8 5]


In [5]:
# Your turn: compute half-products of a and b elements (halves of products)

def half_product(a, b):
    """Return half of the product of ``a`` and ``b``.

    The operands are combined with ``*`` and the result is divided by 2,
    so for numpy arrays the computation is done element by element.
    """
    product = a * b
    return product / 2

Make sure the automatic checks are passed:

In [6]:
np.testing.assert_almost_equal(
    half_product(
        np.array([ 1.3549439 , -1.69765972,  0.32111273, -0.2703243 , -0.38928596]),
        np.array([-1.50693732,  0.69443195,  0.90948151,  1.1717395 ,  1.4069652 ])
    ),
    np.array([-1.02090776, -0.58945457,  0.14602305, -0.15837483, -0.2738559 ])
)

print("a =", a)
print("b =", b)
print("a * b / 2 = ", half_product(a, b))

a = [1 2 3 4 5]
b = [5 4 3 2 1]
a * b / 2 =  [2.5 4.  4.5 4.  2.5]


In [7]:
# compute elementwise quotient between squared a and (b plus 1), return the result

def a_squared_over_b_plus_1(a, b):
    """Return ``a`` squared divided by ``b + 1``.

    For numpy arrays every step (squaring, adding 1, dividing) is applied
    element by element.
    """
    numerator = a ** 2
    denominator = b + 1
    return numerator / denominator

Make sure the automatic checks are passed:

In [8]:
np.testing.assert_almost_equal(
    a_squared_over_b_plus_1(
        np.array([ 1.3549439 , -1.69765972,  0.32111273, -0.2703243 , -0.38928596]),
        np.array([-1.50693732,  0.69443195,  0.90948151,  1.1717395 ,  1.4069652 ])
    ),
    np.array([-3.62149895,  1.70089364,  0.05400072,  0.03364825,  0.06296043])
)

print("a = ", a)
print("b = ", b)
print("a**2 / (b + 1)", a_squared_over_b_plus_1(a, b))

a =  [1 2 3 4 5]
b =  [5 4 3 2 1]
a**2 / (b + 1) [ 0.16666667  0.8         2.25        5.33333333 12.5       ]


---------

There are a number of functions to create arrays of zeros, ones, ascending/descending numbers etc.:

In [None]:
np.zeros(shape=(3, 4))

In [None]:
np.ones(shape=(2, 5))

In [None]:
np.arange(3, 15, 2.5) # start, stop, step

In [None]:
np.linspace(0, 10, 11) # divide [0, 10] interval into 11 points

You can easily reshape arrays:

In [None]:
np.arange(24).reshape(2, 3, 4)

Array dimensions are automatically broadcast when doing mathematical operations:

In [None]:
np.arange(3).reshape(1, 3) - np.arange(3).reshape(3, 1)

You can use broadcasting in many ways. E.g. to raise a matrix to a set of powers elementwise:

In [None]:
A = np.array([
    [1, 2],
    [3, 5]
])
powers = np.arange(1, 5)  # the exponents 1, 2, 3, 4

# Shapes (1, 2, 2) and (4, 1, 1) broadcast to (4, 2, 2): slice k of the
# result is A with every element raised to the power k + 1.
A.reshape(1, 2, 2)**powers.reshape(4, 1, 1)

There are also a number of ways to stack arrays together. E.g. `np.concatenate` joins arrays along **an existing** axis:

In [None]:
matrix1 = np.arange(18).reshape(6, 3) # array of shape (6, 3)
matrix2 = -np.arange(12).reshape(6, 2) # array of shape (6, 2)

np.concatenate([matrix1, matrix2], axis=1) # result of shape (6, 5)

While `np.stack` adds **a new dimension** to the result:

In [None]:
np.stack([
    np.arange(5), # array of shape (5,)
    np.linspace(0, 1, 5) # array of shape (5,)
], axis=1) # result of shape (5, 2)

Any matrix can be transposed easily:

In [None]:
print(matrix2)
print('---')
print(matrix2.T)

In [None]:
print('matrix2.shape =', matrix2.shape)
print('matrix2.T.shape =', matrix2.T.shape)

In [None]:
# Your turn!
# Using the methods you've learned (array initialization,
# reshaping and concatenation), write a function that composes
# and returns the following matrix:
#
#              0   7  14  21  28  0  0
#              1   8  15  22  29  0  0
#              2   9  16  23  30  0  0
#              3  10  17  24  31  3  3
#              4  11  18  25  32  3  3
#              5  12  19  26  33  3  3
#              6  13  20  27  34  3  3


def compose_matrix():
    """Build and return the 7x7 integer matrix requested by the exercise.

    The first five columns hold the numbers 0..34 laid out column by
    column; the last two columns hold zeros in the top three rows and
    threes in the bottom four rows.

    Returns:
        numpy.ndarray of shape (7, 7).
    """
    # Filling a (5, 7) array row by row and transposing it puts
    # 0..6 in the first column, 7..13 in the second, and so on.
    left = np.arange(35).reshape(5, 7).T

    # Three rows of zeros stacked on top of four rows of threes.
    right = np.concatenate([
        np.zeros(shape=(3, 2), dtype=int),
        3 * np.ones(shape=(4, 2), dtype=int),
    ], axis=0)

    # Glue the two parts side by side: (7, 5) + (7, 2) -> (7, 7).
    return np.concatenate([left, right], axis=1)

In [None]:
np.testing.assert_equal(
    compose_matrix(),
    np.array([[0,   7,  14,  21,  28,  0,  0],
              [1,   8,  15,  22,  29,  0,  0],
              [2,   9,  16,  23,  30,  0,  0],
              [3,  10,  17,  24,  31,  3,  3],
              [4,  11,  18,  25,  32,  3,  3],
              [5,  12,  19,  26,  33,  3,  3],
              [6,  13,  20,  27,  34,  3,  3]])
)

Linear algebra:

In [None]:
a = np.arange(3)
B = np.arange(12).reshape(4, 3)
print("a =", a)
print("B =\n", B)
print("Dot product (a*a):", a @ a) # or: np.dot(a, a)
print("Matrix-vector (B*a):", B @ a) # or: np.matmul(B, a)
print("Matrix-matrix (B*B.T):\n", B @ B.T) # or: np.matmul(B, B.T)

There's also a bunch of pre-implemented operations including logarithms, trigonometry and aggregations.

In [None]:
a = np.array([1,2,3,4,5])
b = np.array([5,4,3,2,1])
print("numpy.sum(a) = ", np.sum(a))
print("numpy.mean(a) = ", np.mean(a))
print("numpy.min(a) = ",  np.min(a))
print("numpy.argmin(b) = ", np.argmin(b))  # index of minimal element
print("numpy.dot(a,b) = ", np.dot(a, b))      # dot product. Also used for matrix/tensor multiplication
print("numpy.unique(['male','male','female','female','male']) = ", np.unique(['male','male','female','female','male']))

# and tons of other stuff. see http://bit.ly/2u5q430 .

In [None]:
# most of these functions are also implemented as methods of numpy arrays, e.g.:
print('a.min() =', a.min())
print('a.mean() =', a.mean())

# pandas

Pandas is a library that helps you load the data, prepare it and perform some lightweight analysis. It is built on top of numpy. The god object here is the `pandas.DataFrame` - a 2d table with batteries included.

In the cell below we use it to read the data on the infamous Titanic shipwreck.

In [None]:
# check out the file contents:
!head ../../data/1.1.4-DataHandling/train.csv

In [None]:
import pandas as pd
data = pd.read_csv("../../data/1.1.4-DataHandling/train.csv", index_col='PassengerId') # this yields a pandas.DataFrame

In [None]:
# Selecting rows
head = data[:10]
head

### About the data
Here are some of the columns:

 - Name - a string with person's full name
 - Survived - 1 if a person survived the shipwreck, 0 otherwise.
 - Pclass - passenger class. Pclass == 3 is cheap'n'cheerful, Pclass == 1 is for moneybags.
 - Sex - a person's gender
 - Age - age in years, if available
 - SibSp - number of siblings/spouses on a ship
 - Parch - number of parents/children on a ship
 - Fare - ticket cost
 - Embarked - port where the passenger embarked
   - C = Cherbourg; Q = Queenstown; S = Southampton

In [None]:
# table dimensions
print("len(data) = ", len(data))
print("data.shape = ", data.shape)

In [None]:
# select a single row
print(data.loc[4])

In [None]:
# select a single column.
ages = data["Age"]
print(ages[:10])  # alternatively: data.Age

In [None]:
# select several columns and rows at once
data.loc[5:10, ("Fare", "Pclass")]    # alternatively: data[["Fare","Pclass"]].loc[5:10]

Some columns contain NaN values - this means that there is no data there. For example, the passenger in row 5 (`data.iloc[5]` below) has unknown age. To simplify the future data analysis, we'll replace NaN values by using the pandas `fillna` method.

**Important note: we do this so easily because it's a tutorial. In general, you should think twice before you modify data like this.**

In [None]:
data.iloc[5]

In [None]:
data['Age'] = data['Age'].fillna(value=data['Age'].mean())
data['Fare'] = data['Fare'].fillna(value=data['Fare'].mean())

In [None]:
data.iloc[5]

The functions `max`, `min`, `mean`, etc. are also available:

In [None]:
print("Max ticket price: ", data["Fare"].max())

print("\nThe guy who paid the most:\n", data.loc[data['Fare'].idxmax()])

Boolean operations produce boolean arrays:

In [None]:
data['Age'] < 30

Boolean indexing comes very handy for quickly selecting subsets of data:

In [None]:
print("data[(data['Age'] < 18) & (data['Sex'] == 'male')] = (below)") # select male children
data.loc[(data['Age'] < 18) & (data['Sex'] == 'male')]

# Plots and matplotlib

Using python to visualize the data is covered by yet another library: `matplotlib`.

Just like python itself, matplotlib has an awesome tendency of keeping simple things simple while still allowing you to write complicated stuff with convenience (e.g. super-detailed plots or custom animations).

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# line plot
plt.plot([0,1,2,3,4,5],[0,1,4,9,16,25]);

In [None]:
#scatter-plot
x = np.arange(5)
print("x =", x)
print("x**2 =", x**2)
print("plotting x**2 vs x:")
plt.scatter(x, x**2)

plt.show()  # show the first plot and begin drawing next one
plt.plot(x, x**2);

In [None]:
# histogram - showing data density
plt.hist([0,1,1,1,2,2,3,3,3,3,3,4,4,5,5,5,6,7,7,8,9,10])
plt.show()

plt.hist([0,1,1,1,2,2,3,3,3,3,3,4,4,5,5,5,6,7,7,8,9,10], bins=5);

In [None]:
# plot a histogram of age and a histogram of ticket fares on separate plots

ages = data["Age"]
fares = data["Fare"]

# One figure per column: plt.show() finishes the age histogram so that the
# fare histogram is drawn on a separate plot.
plt.hist(ages)
plt.show()

plt.hist(fares);

In [None]:
plt.scatter(data['Age'],
            np.log1p(data['Fare']),
            c=data['Survived'], # maps different colors depending on whether a person survived
            s=2, # change the marker size
            cmap='bwr'); # select colormap

# machine learning with scikit-learn

Scikit-learn is *the* tool for simple machine learning pipelines.

It's a single library that unites a whole bunch of models under the common interface:

 - Create: **`model = sklearn.whatever.ModelNameHere(parameters_if_any)`**
 - Train: **`model.fit(X,y)`**
 - Predict: **`model.predict(X_test)`**

It also contains utilities for feature extraction, quality estimation or cross-validation.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


# Start from two columns; .copy() gives `features` its own data so that
# columns can be added to it without modifying `data`.
features = data[['Age', 'SibSp']].copy()

# your code here
# (exercise placeholder: the cell stops at this raise until it is replaced)
raise NotImplementedError


# Target column the classifier is trained to predict.
answers = data["Survived"]

model = KNeighborsClassifier(n_neighbors=5)
# Fit on every row except the last 100 ...
model.fit(features[:-100], answers[:-100])

# ... and measure accuracy on those held-out last 100 rows.
test_predictions = model.predict(features[-100:])
print("Test accuracy:", accuracy_score(answers[-100:], test_predictions))

**Final quest:** tune n_neighbors and add more features to achieve accuracy of at least 0.75

**Hint 1:** for string features like "Sex" or "Embarked" you will have to compute some kind of numeric representation. For example, 1 if male and 0 if female or vice versa

**Hint 2:** features like "Age" contain NaN values. You can replace them with `replaced = data["Age"].fillna(some_number)`.

In [None]:
model.fit(features[:-100], answers[:-100])

test_predictions = model.predict(features[-100:])
score = accuracy_score(answers[-100:], test_predictions)
assert score >= 0.75, score

In [None]:
model.fit(features[:-100], answers[:-100])

test_predictions = model.predict(features[-100:])
score = accuracy_score(answers[-100:], test_predictions)
assert score >= 0.8, score

In [None]:
model.fit(features[:-100], answers[:-100])

test_predictions = model.predict(features[-100:])
score = accuracy_score(answers[-100:], test_predictions)
assert score >= 0.85, score

# Bonus part

In [None]:
from matplotlib.ticker import ScalarFormatter

# Assign every passenger to one of 20 quantile-based fare bins.
data['qFare'] = pd.qcut(data.Fare, 20)

# Survival rate per fare bin and its standard error (std / sqrt(count)).
survived_by_bin = data.groupby('qFare').Survived
sur_vs_price = survived_by_bin.mean()
sur_vs_price_e = survived_by_bin.std() / survived_by_bin.count()**0.5

# Horizontal error bars cover half the width of each fare bin.
fare_bins = pd.IntervalIndex(sur_vs_price.index)
half_widths = (fare_bins.right - fare_bins.left) / 2

fig = plt.figure(figsize=(12, 7))
plt.errorbar(
    x=sur_vs_price.index.categories.mid,
    y=sur_vs_price.values,
    yerr=sur_vs_price_e.values,
    xerr=half_widths,
    fmt='o',
)

# Logarithmic fare axis labelled with plain numbers at hand-picked ticks.
axes = plt.gca()
axes.set_xscale('log')
axes.xaxis.set_major_formatter(ScalarFormatter())
tick_positions = [*range(3, 10), *range(10, 100, 10), *range(100, 700, 100)]
axes.set_xticks(tick_positions)

plt.xlabel('Fare')
plt.ylabel('Survival probability');