# Numpy

In [None]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

## Creating numpy arrays

In [None]:
v = np.array([1,2,3,4])  # a vector
v

In [None]:
M = np.array([ [1,2],[3,4] ]) # a matrix
M

In [None]:
print(type(v))
print(type(M))

In [None]:
print(v.shape)
print(M.shape) # the shape is different

In [None]:
print(v.size)
print(M.size) # same number of elements

In [None]:
print(v.dtype) # data type: all elements of an array share one dtype (an integer type here)
v[0] = "hello" # therefore, does not work: raises ValueError because "hello" cannot be converted to an integer

In [None]:
z = np.array(['1', '2', '3']) # does work
z

In [None]:
v = np.array([1,2,3], dtype='str') # also works: dtype forces the element type (common types are: int, float, complex, bool, object)
v = v.astype(int) # astype returns a new array converted to the requested dtype
v

## Array generating functions

In [None]:
np.arange(0, 10, 1) # creates a range. arguments: start, stop (exclusive: the last value is stop - step), step

In [None]:
np.linspace(0, 10, 26) # creates evenly spaced values, both endpoints included. arguments: start, stop, num

In [None]:
np.logspace(0, 10, 10, base=np.e) # creates values evenly spaced in log-space, from base**start to base**stop. arguments: start, stop, num

In [None]:
np.random.rand(5,5) # uniform random numbers in [0,1) (1 itself is excluded). arguments: shape

In [None]:
np.random.randn(5,5) # standard normal distributed random numbers. arguments: shape

In [None]:
np.diag([1,2,3]) # a diagonal matrix. arguments: array (if 1D, it becomes the diagonal; if 2D, its diagonal is returned)

In [None]:
np.diag([1,2,3], k=1) # diagonal with offset from the main diagonal. arguments: array, offset

In [None]:
np.zeros((3,3)) # zeros

In [None]:
np.ones((3,3)) # ones

In [None]:
np.zeros((3,3), dtype=bool) # boolean data type

## Manipulating arrays - Indexing

In [None]:
# Example data for indexing: a one-dimensional and a two-dimensional array.
v = np.array([1, 2, 3, 4])
M = np.array([[1, 2],
              [3, 4]])
# One call prints v, an empty line and M, each on its own line.
print(v, '', M, sep='\n')

In [None]:
v[0] # v is a vector, and has only one dimension, taking one index

In [None]:
M[1,1] # M is a matrix, or a 2 dimensional array, taking two indices 

In [None]:
M[1,:] # row 1

In [None]:
M[:,1] # column 1

In [None]:
M[0,0] = 10 # assign new values to elements in an array using indexing
M

In [None]:
M[1,:] = 0 # also works for rows
M

In [None]:
M[:,1] = -1 # and columns
M

## Manipulating arrays - Slicing

In [None]:
v = np.arange(8)
v

In [None]:
v[1:3] # slice syntax is lower:upper:step; upper is exclusive, so this selects indices 1 and 2

In [None]:
v[1:3] = [-2,-3] # assign new values to elements in an array using slicing
v

In [None]:
v[::] # lower, upper, step all take the default values

In [None]:
v[::2] # step is 2, lower and upper defaults to the beginning and end of the array

In [None]:
v[:3] # first three elements

In [None]:
v[3:] # elements from index 3

In [None]:
v[-1] # the last element in the array

In [None]:
v[-3:] # the last three elements

In [None]:
M = np.array([[1,2,3,4],[5,6,7,8],[9,10,11,12],[13,14,15,16]])
M

In [None]:
M[1:3, 1:3] # a block from the original array

In [None]:
M[::2, ::2] # strides

## Fancy indexing (makes a copy)

In [None]:
row_indices = [1, 2, 3] # fancy indexing is the name for when an array or list is used in place of an index
M[row_indices,:] # selects rows 1, 2 and 3 (all columns)

In [None]:
col_indices = [1, 2, -1] # remember, index -1 means the last element
M[row_indices, col_indices] # the two index lists are paired element-wise: picks M[1,1], M[2,2] and M[3,-1]

In [None]:
# We can also use index masks: if the index mask is a NumPy array of data type bool,
# then an element is selected (True) or not (False) depending on the value of the
# index mask at the position of each element:

mask = np.array([1,0,1,0,0,1,1,0], dtype=bool) # 1 becomes True, 0 becomes False
print(v)
print(mask)
print(v[mask])

In [None]:
mask = (M>2) & (M < 13) # conditionally select elements using comparison operators; the parentheses are needed because & binds more tightly than > and <
print(mask)
print(M[mask]) # boolean indexing returns the selected elements as a 1D array

In [None]:
indices = np.where(mask) # the index mask can be converted to position indices using the where function
print(indices) # a tuple with one index array per dimension (row indices, column indices)
print(M[indices])

## Element-wise array-array operations

In [None]:
# Arithmetic on NumPy arrays is element-wise by default; in Matlab one needs the
# dotted operators (such as .*) for element-wise operations.

v = np.arange(4)
M = np.arange(1, 17).reshape(4, 4)  # the integers 1..16 laid out row by row

# One call prints v, an empty line and M, each on its own line.
print(v, '', M, sep='\n')

In [None]:
M * 2 # use the usual arithmetic operators to multiply, add, subtract, and divide arrays with scalar numbers

In [None]:
M + 2

In [None]:
M * M # when adding, subtracting, multiplying and dividing arrays with each other, the default behaviour is element-wise operations

In [None]:
M + M

In [None]:
M * v # why does this work? (hint: v has shape (4,) and is broadcast across every row of M, which has shape (4,4))

In [None]:
M + v

## Broadcasting rules

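Broadcasting is the set of rules NumPy uses to perform arithmetic between arrays of different shapes. Shapes are compared dimension by dimension, starting from the last one; two dimensions are compatible when they are equal or when one of them is 1 (a missing leading dimension counts as 1).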

In [None]:
# scalar and one-dimensional: the scalar b is combined with every element of a
a = np.array([1, 2, 3])
b = 2
# A single print call writes each heading, value and blank separator on its own line.
print('a = ', a, '', 'b = ', b, '', 'a + b = ', a + b, sep='\n')

In [None]:
# scalar and two-dimensional: the scalar b is combined with every element of a
a = np.array([[1, 2, 3],
              [1, 2, 3]])
b = 2
# A single print call writes each heading, value and blank separator on its own line.
print('a = ', a, '', 'b = ', b, '', 'a + b = ', a + b, sep='\n')

In [None]:
# one-dimensional and two-dimensional: b (shape (3,)) is added to every row of a (shape (2, 3))
a = np.array([[1, 2, 3],
              [1, 2, 3]])
b = np.array([1, 2, 3])
# A single print call writes each heading, value and blank separator on its own line.
print('a = ', a, '', 'b = ', b, '', 'a + b = ', a + b, sep='\n')

In [None]:
# mismatch: shapes (3, 2) and (3,) cannot be broadcast, because the last
# dimensions (2 and 3) differ and neither of them is 1
a = np.ones((3, 2))
b = np.arange(3)
print('a = ')
print(a)
print()
print('b = ')
print(b)
print()
print('a + b = ')
print(a+b) # raises ValueError (operands could not be broadcast together)

## Linear algebra

In [None]:
# Example data for the linear algebra cells: a length-4 vector and a 4x4 matrix.
v = np.arange(4)
M = np.arange(1, 17).reshape(4, 4)  # the integers 1..16 laid out row by row
# The blank line between the two arrays comes from the separator.
print(v, M, sep='\n\n')

In [None]:
np.dot(M, M) # matrix-matrix multiplication

In [None]:
a = np.dot(M, v) # matrix-vector multiplication
print(type(a))
print(a)

In [None]:
np.dot(v, v) # inner vector multiplication

In [None]:
M = np.matrix(M) # cast to matrix, so that * means matrix multiplication instead of element-wise multiplication
# NOTE(review): the NumPy documentation no longer recommends np.matrix; regular arrays
# with the @ operator are the suggested alternative -- confirm before reusing this pattern
M

In [None]:
v = np.matrix(v).T # cast to matrix; make it a column vector via transpose
v

In [None]:
M * M # matrix-matrix multiplication

In [None]:
M * v # matrix-vector multiplication

In [None]:
v.T * v # inner vector multiplication

In [None]:
v * M # if we try to add, subtract or multiply objects with incompatible shapes we get an error (here v is 4x1 and M is 4x4)

In [None]:
np.linalg.det(M) # determinant (the rows of this M are linearly dependent, so expect 0 up to rounding error)

In [None]:
np.linalg.inv(M) # inverse (this M is singular, so expect a LinAlgError or numerically meaningless entries)

## Data processing

In [None]:
M = np.arange(16).reshape((8,2))
M

In [None]:
print(np.mean(M)) # mean over all elements

print(M.mean()) # the same, written as a method call

print(np.mean(M, axis=0)) # mean along axis 0: one value per column (axis may also be a tuple of several axes)

print(M.mean(axis=0)) # the same, written as a method call

print(M.mean(axis=1)) # mean along axis 1: one value per row

In [None]:
print(M)

print(np.median(M)) # median of all elements

print(np.std(M)) # standard deviation (population form, ddof=0, by default)

print(np.var(M)) # variance (population form, ddof=0, by default)

print(np.min(M)) # smallest element

print(np.max(M)) # largest element

print(np.sum(M)) # sum up all elements

print(np.cumsum(M)) # cumulative sum (computed over the flattened array)

print(np.prod(M + 1)) # product of all elements (+ 1 because M contains a 0, which would make the product 0)

## Reshaping, resizing and stacking arrays

In [None]:
# Example data for reshaping and stacking: two row vectors of shape (1, 4) and a 4x4 matrix.
v = np.array([[1, 2, 3, 4]])
v2 = np.array([[11, 12, 13, 14]])
M = np.arange(1, 17).reshape(4, 4)  # the integers 1..16 laid out row by row

# One call prints the three arrays separated by empty lines.
print(v, '', v2, '', M, sep='\n')

In [None]:
n, m = M.shape # number of rows and number of columns
M.reshape((1,n*m)) # make into a row vector: a 2D array of shape (1, n*m)

In [None]:
M = np.arange(1,17).reshape(4,4)
M

In [None]:
# Only the last expression of a notebook cell is displayed, so the two flattened
# results are printed explicitly.
print(M.flatten()) # make into vector; flatten always returns a copy
print(M.ravel()) # make into vector; ravel returns a view of M when possible instead of a copy
M # the original array keeps its (4, 4) shape

In [None]:
M.reshape((2,8)) # reshape into 2 rows, 8 columns

In [None]:
print(np.repeat(v, 3)) # repeat each element 3 times (without an axis argument the result is flattened to 1D)

print(np.tile(v, 3)) # tile the whole array 3 times along its last axis

In [None]:
print(np.concatenate((v, v2))) # concatenate 2 vectors (default across first dimension)
print()
print(np.concatenate((v, v2), axis=1)) # concatenate 2 vectors across second dimension

In [None]:
np.vstack((M,v)) # stack vertically

In [None]:
np.hstack((M,v)) # stack horizontally --> dimension mismatch: raises ValueError because M has 4 rows but v has only 1

In [None]:
np.hstack((M,v.T)) # stack horizontally 

# Scipy + Matplotlib

## Curve fitting

In [None]:
from scipy.optimize import curve_fit

def fitFunc(t, a, b, c):
    """Exponential decay model ``a * exp(-b * t) + c``.

    ``t`` may be a scalar or a NumPy array (evaluated element-wise); ``a`` is
    the amplitude, ``b`` the decay rate and ``c`` the constant offset.
    """
    decay = np.exp(-b * t)
    return a * decay + c
    
# Simulate a measurement: evaluate the model at 50 time points and add Gaussian noise.
noise_sigma = 0.25 # standard deviation of the simulated noise; also used for the error bars

t = np.linspace(0,4,50)
temp = fitFunc(t, 5.0, 1.5, 0.5)
noisy = temp + noise_sigma*np.random.normal(size=len(temp))

plt.figure()
plt.errorbar(t, noisy, fmt='ro', yerr=noise_sigma) # plot the data as red circles; the vertical errorbars show the actual noise level
plt.ylabel('Temperature (C)')
plt.xlabel('time (s)')
plt.xlim(0,4.1)

# curve_fit returns the best-fit parameters (a, b, c) and their covariance matrix
fitParams, fitCovariances = curve_fit(fitFunc, t, noisy)
print(' fit coefficients:\n', fitParams)
print(' Covariance matrix:\n', fitCovariances)
plt.plot(t, fitFunc(t, *fitParams)) # now plot the best fit curve

## Interpolation

In [None]:
from scipy.interpolate import interp1d

def f(x):
    """Return the sine of ``x``, element-wise for arrays (the noise-free signal)."""
    clean_signal = np.sin(x)
    return clean_signal
    
n = np.arange(0, 10) # positions of the 10 simulated measurements
x = np.linspace(0, 9, 100) # dense grid on which the interpolants are evaluated

# np.random.randn replaces sp.randn: the NumPy aliases in the top-level scipy
# namespace are deprecated.
y_meas = f(n) + 0.1 * np.random.randn(len(n)) # simulate measurement with noise
y_real = f(x)

linear_interpolation = interp1d(n, y_meas) # piecewise linear by default
y_interp1 = linear_interpolation(x)

cubic_interpolation = interp1d(n, y_meas, kind='cubic')
y_interp2 = cubic_interpolation(x)

plt.figure()
plt.plot(n, y_meas, 'bs', label='noisy data')
plt.plot(x, y_real, 'k', lw=2, label='true function')
plt.plot(x, y_interp1, 'r', label='linear interp')
plt.plot(x, y_interp2, 'g', label='cubic interp')
plt.legend(loc=3)

## Fourier transform

In [None]:
from scipy.fft import fft # scipy.fft supersedes the legacy scipy.fftpack module

N = 300 # number of sample points
T = 1 / 1000.0 # sample spacing (s), i.e. a sampling rate of 1000 Hz
x = np.linspace(0.0, N*T, N, endpoint=False) # endpoint excluded so that the spacing is exactly T
y = np.sin(50.0 * 2.0*np.pi*x) + 0.5*np.sin(80.0 * 2.0*np.pi*x) # 50 Hz and 80 Hz components

plt.figure()
plt.plot(x,y)
plt.title('Signal')
plt.xlabel('Time (s)')
plt.ylabel('Amplitude (a.u.)')

yf = fft(y)
xf = np.fft.fftfreq(N, d=T)[:N//2] # frequencies (Hz) of the non-negative half of the spectrum

plt.figure()
plt.plot(xf, 2.0/N * np.abs(yf[:N//2])) # 2/N scaling turns the one-sided spectrum into sine amplitudes
plt.title('Fourier transform')
plt.xlabel('Frequency (Hz)')
plt.ylabel('Amplitude (a.u.)')

## Statistics

In [None]:
from scipy import stats

# T-TEST:

# two samples of 50 normally distributed values; the mean of x2 is shifted by 0.5
x1 = np.random.normal(loc=0, scale=1, size=50)
x2 = np.random.normal(loc=.5, scale=1, size=50)

plt.figure()
plt.hist(x1, bins=10, alpha=0.5, label='x1')
plt.hist(x2, bins=10, alpha=0.5, label='x2')
plt.legend()

# The tests are called through the imported `stats` name instead of `sp.stats`,
# which only resolves once the submodule has been imported.

# independent samples: x1 and x2 are treated as two unrelated groups
t, p = stats.ttest_ind(x1, x2)
print('x1 vs x2\nind. t-test: t = {}; p = {}'.format(round(t,4), round(p,4)))

# paired samples: x1[i] and x2[i] are treated as a pair
t, p = stats.ttest_rel(x1, x2)
print('x1 vs x2\npaired t-test: t = {}; p = {}'.format(round(t,4), round(p,4)))

# one sample: is the mean of x1 different from 0?
t, p = stats.ttest_1samp(x1, 0)
print('x1 vs 0\n1 sample t-test: t = {}; p = {}'.format(round(t,4), round(p,4)))

# one sample: is the mean of x2 different from 0?
t, p = stats.ttest_1samp(x2, 0)
print('x2 vs 0\n1 sample t-test: t = {}; p = {}'.format(round(t,4), round(p,4)))

In [None]:
# REGRESSION:

# imported here so that the cell does not depend on another cell having loaded scipy.stats
from scipy import stats

# y depends linearly on x (slope 1) plus Gaussian noise with standard deviation 2
x = np.random.normal(loc=0, scale=1, size=50)
y = x + np.random.normal(loc=0, scale=2, size=50)

r, p = stats.pearsonr(x, y) # linear correlation
print('correlation x and y\npearson r = {}; p = {}'.format(round(r,4), round(p,4)))

r, p = stats.spearmanr(x, y) # rank correlation
print('correlation x and y\nspearman r = {}; p = {}'.format(round(r,4), round(p,4)))

plt.figure()
plt.plot(x,y,'o')
plt.xlabel('x (a.u.)')
plt.ylabel('y (a.u.)')

# calc the trendline: least-squares fit of a degree-1 polynomial
coefs = np.polyfit(x, y, 1)
poly = np.poly1d(coefs)
plt.plot(x, poly(x), "r")

In [None]:
import pandas as pd

# Titanic passenger data, read directly from a GitHub URL (needs network access).
# The data set can also be obtained from Kaggle: https://www.kaggle.com/c/titanic
df = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')

df.head() # show the first rows of the table

In [None]:
df['Pclass']

In [None]:
df['Pclass'].unique()

In [None]:
(df.loc[df['Pclass']==1, 'Sex'] == 'male').mean() # share of rows with Pclass == 1 whose Sex is 'male' (the mean of a boolean Series is the fraction of True values)

In [None]:
# Mean age of the rows with Survived == 1, followed by that of the rows with Survived == 0.
age = [df.loc[df['Survived'] == s, 'Age'].mean() for s in (1, 0)]
age

In [None]:
# Mean age for every (Survived, Sex) combination, collected in one list per sex.
# Both lists follow the order in which the Survived values first appear in df.
age_male = []
age_female = []
ages_by_sex = {'male': age_male, 'female': age_female}
for survived in df['Survived'].unique():
    for sex in df['Sex'].unique():
        bucket = ages_by_sex.get(sex)
        if bucket is None:
            continue  # any other value in the Sex column is ignored
        bucket.append(df.loc[(df['Survived']==survived)&(df['Sex']==sex), 'Age'].mean())
print(age_male)
print(age_female)

In [None]:
# Mean of every numeric column per (Survived, Sex) group. numeric_only=True is
# required because text columns cannot be averaged (TypeError in pandas >= 2.0).
df.groupby(['Survived', 'Sex']).mean(numeric_only=True)

In [None]:
cuts = np.linspace(0, 100, 11) # bin edges 0, 10, ..., 100, i.e. ten bins of width 10
print(cuts)

pd.cut(df.Age, cuts) # assign every age to its bin (by default an interval that is open on the left and closed on the right)

In [None]:
def plot_age_by_survived(df, bins, label=None):
    """Plot the mean of ``Survived`` (as a percentage) against the mean age per age bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``Age`` and ``Survived``; assumes ``Survived``
        is coded 0/1 so that its mean is a rate -- TODO confirm.
    bins : sequence of numbers
        Bin edges passed on to ``pd.cut``.
    label : str, optional
        Legend label for the curve; the legend is only drawn when a label is given.
    """
    age_bins = pd.cut(df.Age, bins)
    # numeric_only=True: text columns cannot be averaged (TypeError in pandas >= 2.0).
    # observed=False keeps empty bins as NaN rows, the long-standing default.
    k = df.groupby(age_bins, observed=False).mean(numeric_only=True)
    plt.errorbar(k.Age, k.Survived*100, fmt='-o', label=label)
    if label is not None:
        plt.legend()
    plt.xlabel('Age (years)')
    plt.ylabel('Survived (%)')

plot_age_by_survived(df, cuts)

In [None]:
# one curve per sex, drawn into the same axes; passing a label makes the function draw the legend
for sex in ['female', 'male']:
    plot_age_by_survived(df.loc[df['Sex']==sex,:], bins=cuts, label=sex)