In [2]:
import numpy as np
import pandas as pd

import scipy.special

import bokeh.plotting
import bokeh.io

import bokeh_catplot

bokeh.io.output_notebook()

# Introduction to numpy and scipy

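NumPy provides a fast array type and a large collection of mathematical functions that operate on it. SciPy builds on NumPy and adds special functions and other tools for scientific computing.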

# numpy

The most important numpy data type: the numpy array.

In [4]:
# Make a numpy array from a list

my_ar = np.array([1, 2, 3, 4])

In [5]:
my_ar

array([1, 2, 3, 4])

In [6]:
type(my_ar)

numpy.ndarray

In [9]:
# Get data type of entries in the array

my_ar.dtype

dtype('int64')

In [12]:
my_ar = my_ar.astype(float)

In [13]:
my_ar

array([1., 2., 3., 4.])

In [14]:
my_ar.dtype

dtype('float64')

In [15]:
my_ar.max()

4.0

In [16]:
my_ar.min()

1.0

In [17]:
my_ar.mean()

2.5

In [18]:
my_ar.std()

1.118033988749895

In [19]:
# Function form of my_ar.std(); by default both use ddof=0 (population standard deviation)
np.std(my_ar)

1.118033988749895

In [21]:
n = 10

np.zeros(n)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [22]:
np.ones(n)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [23]:
# Make an empty array (initially populated with whatever was in memory)

np.empty(n)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [24]:
my_ar

array([1., 2., 3., 4.])

In [25]:
my_ar.shape

(4,)

In [27]:
# Make a 2D array from a list of lists

my_ar = np.array([[1, 2], [3, 4]])

In [28]:
my_ar.shape

(2, 2)

In [29]:
my_ar

array([[1, 2],
       [3, 4]])

You can also make a NumPy array from part of a dataframe. This is useful when you need to do heavy calculations on the entries of a dataframe, because working directly on a NumPy array can make the computation more efficient.

In [33]:
df = pd.read_csv('data/c_elegans_egg_xa.csv', comment='#')

In [34]:
df.head()

Unnamed: 0,food,area (sq. um)
0,high,1683
1,high,2061
2,high,1792
3,high,1852
4,high,2091


In [37]:
# Build an ECDF plot of the 'area (sq. um)' column, with 'food' passed as the categorical column
p = bokeh_catplot.ecdf(
    data=df,
    cats='food',
    val='area (sq. um)',
    y_axis_label='ECDF',
)

# Render the figure (output was directed to the notebook by bokeh.io.output_notebook())
bokeh.io.show(p)

In [43]:
# Pull out a column from the dataframe as a numpy array: add .to_numpy() at the end
# (pandas recommends .to_numpy() over .values, because .values is not guaranteed
# to give back a numpy array for every column dtype)

xa_high = df.loc[df['food'] == 'high', 'area (sq. um)'].to_numpy()
xa_low = df.loc[df['food'] == 'low', 'area (sq. um)'].to_numpy()

In [42]:
type(xa_high)

numpy.ndarray

In [41]:
xa_high

array([1683, 2061, 1792, 1852, 2091, 1781, 1912, 1802, 1751, 1731, 1892,
       1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800, 1833, 1683,
       1671, 1680, 1692, 1800, 1821, 1882, 1642, 1749, 1712, 1661, 1701,
       2141, 1863, 1752, 1740, 1721, 1660, 1930, 2030, 1851, 2131, 1828])

In [44]:
xa_low

array([1840, 2090, 2169, 1988, 2212, 2339, 1989, 2144, 2290, 1920, 2280,
       1809, 2158, 1800, 2133, 2060, 2160, 2001, 2030, 2088, 1951, 2460,
       2021, 2010, 2139, 2160, 2106, 2171, 2113, 2179, 1890, 2179, 2021,
       1969, 2150, 1900, 2267, 1711, 1901, 2114, 2112, 2361, 2130, 2061,
       2121, 1832, 2210, 2130, 2153, 2009, 2100, 2252, 2143, 2252, 2222,
       2121, 2409])

In [45]:
# Indexing

xa_high[8]

1751

In [46]:
# Slicing

xa_high[::-1]

array([1828, 2131, 1851, 2030, 1930, 1660, 1721, 1740, 1752, 1863, 2141,
       1701, 1661, 1712, 1749, 1642, 1882, 1821, 1800, 1692, 1680, 1671,
       1683, 1833, 1800, 1930, 1910, 1821, 1840, 1787, 1683, 1809, 1951,
       1892, 1731, 1751, 1802, 1912, 1781, 2091, 1852, 1792, 2061, 1683])

In [47]:
xa_high[3::5]

array([1852, 1751, 1683, 1930, 1680, 1642, 2141, 1660, 1828])

Numpy arrays also support fancy indexing:

In [50]:
xa_high[[1, 19, 6]]

array([2061, 1800, 1912])

In [51]:
# Boolean indexing

xa_high[xa_high > 2000]

array([2061, 2091, 2141, 2030, 2131])

Numpy arrays are **mutable**.

In [52]:
my_ar = np.array([1, 2, 3, 4])

my_ar[2] = 6

my_ar

array([1, 2, 6, 4])

In [54]:
# Assignment does not copy: my_ar2 is just a second name for the same array object
my_ar2 = my_ar

my_ar2[3] = 9

my_ar # also gets changed!!

array([1, 2, 6, 9])

This has interesting consequences if you use them in functions.

In [55]:
my_ar = np.array([1, 2, 3, 4]).astype(float)

def normalize(x):
    """Divide ``x`` by the sum of its entries, in place.

    The array passed in is modified directly; nothing is returned.
    """
    total = np.sum(x)
    x /= total
    
normalize(my_ar)

my_ar

array([0.1, 0.2, 0.3, 0.4])

`my_ar` is modified after being passed through the function even though the function does not return anything!

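Python lists are mutable too, so a function that modifies a list in place has the same effect. Where lists and NumPy arrays differ is in slicing: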

In [56]:
my_list = [1, 2, 3, 4, 5, 6, 7]
# Slicing a list builds a new list object
my_list_slice = my_list[1:-1]

# This assignment only touches the new list; my_list keeps its original entries
my_list_slice[2] = 75

my_list_slice

[2, 3, 75, 5, 6]

In [57]:
my_list

[1, 2, 3, 4, 5, 6, 7]

In [60]:
my_ar = np.array([1, 2, 3, 4, 5, 6, 7])
# Slicing an array returns a view onto the same data
my_ar_slice = my_ar[1:-1]

# Writing through the view also changes my_ar (its entry at index 3)
my_ar_slice[2] = 75

my_ar_slice

array([ 2,  3, 75,  5,  6])

In [61]:
my_ar

array([ 1,  2,  3, 75,  5,  6,  7])

A slice of an array is a **view**, not a copy.

A slice of a list is a **copy**.

In [62]:
# Make a copy: np.copy returns a new array, so edits to it leave xa_high unchanged

xa_high_copy = np.copy(xa_high)

xa_high_copy[10] = 20000

xa_high_copy

array([ 1683,  2061,  1792,  1852,  2091,  1781,  1912,  1802,  1751,
        1731, 20000,  1951,  1809,  1683,  1787,  1840,  1821,  1910,
        1930,  1800,  1833,  1683,  1671,  1680,  1692,  1800,  1821,
        1882,  1642,  1749,  1712,  1661,  1701,  2141,  1863,  1752,
        1740,  1721,  1660,  1930,  2030,  1851,  2131,  1828])

In [63]:
xa_high

array([1683, 2061, 1792, 1852, 2091, 1781, 1912, 1802, 1751, 1731, 1892,
       1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800, 1833, 1683,
       1671, 1680, 1692, 1800, 1821, 1882, 1642, 1749, 1712, 1661, 1701,
       2141, 1863, 1752, 1740, 1721, 1660, 1930, 2030, 1851, 2131, 1828])

In [64]:
# Doing math with arrays

# Element-wise division
np.array([5, 6, 7, 8]) / np.array([1, 2, 3, 4])

array([5.        , 3.        , 2.33333333, 2.        ])

In [65]:
# Multiply by a scalar

-4 * xa_high

array([-6732, -8244, -7168, -7408, -8364, -7124, -7648, -7208, -7004,
       -6924, -7568, -7804, -7236, -6732, -7148, -7360, -7284, -7640,
       -7720, -7200, -7332, -6732, -6684, -6720, -6768, -7200, -7284,
       -7528, -6568, -6996, -6848, -6644, -6804, -8564, -7452, -7008,
       -6960, -6884, -6640, -7720, -8120, -7404, -8524, -7312])

In [67]:
# Element-wise exponent
xa_high**2

array([2832489, 4247721, 3211264, 3429904, 4372281, 3171961, 3655744,
       3247204, 3066001, 2996361, 3579664, 3806401, 3272481, 2832489,
       3193369, 3385600, 3316041, 3648100, 3724900, 3240000, 3359889,
       2832489, 2792241, 2822400, 2862864, 3240000, 3316041, 3541924,
       2696164, 3059001, 2930944, 2758921, 2893401, 4583881, 3470769,
       3069504, 3027600, 2961841, 2755600, 3724900, 4120900, 3426201,
       4541161, 3341584])

In [68]:
xa_high.shape

(44,)

In [70]:
# Reshape

my_ar = xa_high.reshape((11, 4))

my_ar

array([[1683, 2061, 1792, 1852],
       [2091, 1781, 1912, 1802],
       [1751, 1731, 1892, 1951],
       [1809, 1683, 1787, 1840],
       [1821, 1910, 1930, 1800],
       [1833, 1683, 1671, 1680],
       [1692, 1800, 1821, 1882],
       [1642, 1749, 1712, 1661],
       [1701, 2141, 1863, 1752],
       [1740, 1721, 1660, 1930],
       [2030, 1851, 2131, 1828]])

In [71]:
list_of_lists = [[1, 2], [3, 4]]

In [72]:
list_of_lists[0][1]

2

In [74]:
# Index a numpy array like a matrix

my_ar[0, 1]

2061

In [75]:
# Get a row

my_ar[2, :]

array([1751, 1731, 1892, 1951])

In [76]:
# Get the values > 2000

my_ar[my_ar > 2000] # returns a 1D array

array([2061, 2091, 2141, 2030, 2131])

In [86]:
# Get the indices for values > 2000
# (the result is a tuple with one index array per dimension: rows, then columns)

np.nonzero(my_ar > 2000)

(array([ 0,  1,  8, 10, 10]), array([1, 0, 1, 0, 2]))

In [77]:
# Concatenate arrays

np.concatenate((xa_high, xa_low))

array([1683, 2061, 1792, 1852, 2091, 1781, 1912, 1802, 1751, 1731, 1892,
       1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800, 1833, 1683,
       1671, 1680, 1692, 1800, 1821, 1882, 1642, 1749, 1712, 1661, 1701,
       2141, 1863, 1752, 1740, 1721, 1660, 1930, 2030, 1851, 2131, 1828,
       1840, 2090, 2169, 1988, 2212, 2339, 1989, 2144, 2290, 1920, 2280,
       1809, 2158, 1800, 2133, 2060, 2160, 2001, 2030, 2088, 1951, 2460,
       2021, 2010, 2139, 2160, 2106, 2171, 2113, 2179, 1890, 2179, 2021,
       1969, 2150, 1900, 2267, 1711, 1901, 2114, 2112, 2361, 2130, 2061,
       2121, 1832, 2210, 2130, 2153, 2009, 2100, 2252, 2143, 2252, 2222,
       2121, 2409])

In [78]:
# Other mathematical functions

np.exp(xa_high / 1000)

array([5.38167681, 7.8538197 , 6.00144336, 6.37255189, 8.09300412,
       5.93578924, 6.76660849, 6.06175887, 5.76036016, 5.64629738,
       6.63262067, 7.03571978, 6.10434004, 5.38167681, 5.97151103,
       6.29653826, 6.1780334 , 6.7530888 , 6.88951024, 6.04964746,
       6.2526164 , 5.38167681, 5.31748262, 5.36555597, 5.43033051,
       6.04964746, 6.1780334 , 6.56662499, 5.16549017, 5.74885095,
       5.54003047, 5.26457279, 5.47942408, 8.50794132, 6.44303692,
       5.7661234 , 5.69734342, 5.59011579, 5.25931084, 6.88951024,
       7.61408636, 6.36618252, 8.42328589, 6.22143134])

In [79]:
np.cos(xa_high)

array([ 0.62656192,  0.9933696 ,  0.27501843,  0.03112568,  0.26681725,
       -0.96021239, -0.33430744,  0.29228295, -0.42404251, -0.99984597,
        0.72399324, -0.99748325,  0.84865001,  0.62656192, -0.84393482,
        0.56257847,  0.43231386,  0.99610114,  0.48702972, -0.99122275,
       -0.11903049,  0.62656192,  0.94691648, -0.73027654, -0.24968607,
       -0.99122275,  0.43231386, -0.98275172, -0.49500319, -0.64703425,
       -0.98592179, -0.61963892, -0.17156886,  0.00460656, -0.99936794,
        0.53296056,  0.90375673,  0.82939405,  0.3256673 ,  0.48702972,
        0.86222727, -0.824246  ,  0.5401501 ,  0.91834245])

In [80]:
np.sqrt(xa_high)

array([41.02438299, 45.39823785, 42.33202098, 43.03486958, 45.72745346,
       42.20189569, 43.72642222, 42.44997055, 41.84495191, 41.60528813,
       43.49712634, 44.17012565, 42.53234064, 41.02438299, 42.27292278,
       42.89522118, 42.67317659, 43.70354677, 43.93176527, 42.42640687,
       42.81354926, 41.02438299, 40.87786687, 40.98780306, 41.1339276 ,
       42.42640687, 42.67317659, 43.38202393, 40.52159918, 41.82104733,
       41.37632173, 40.75536774, 41.24318125, 46.27094121, 43.16248371,
       41.85689907, 41.71330723, 41.48493703, 40.74309757, 43.93176527,
       45.0555213 , 43.02324953, 46.16275555, 42.75511665])

In [81]:
# Vector dot product

np.dot(xa_high, xa_high)

146360195

In [82]:
np.pi

3.141592653589793

# Scipy functions (with numpy arrays)

In [83]:
scipy.special.erf(xa_high / 2000)

array([0.76597747, 0.8549794 , 0.7948931 , 0.80965587, 0.86074212,
       0.79209865, 0.8236209 , 0.79740973, 0.78433732, 0.77904847,
       0.81905337, 0.83227948, 0.79915793, 0.76597747, 0.7936263 ,
       0.80676772, 0.8021292 , 0.82316805, 0.8276577 , 0.79690821,
       0.80506817, 0.76597747, 0.76262579, 0.76514271, 0.76846912,
       0.79690821, 0.8021292 , 0.81673693, 0.7543863 , 0.78381257,
       0.77393853, 0.75980693, 0.77094188, 0.86995276, 0.81227529,
       0.78459935, 0.78143985, 0.77636944, 0.75952376, 0.8276577 ,
       0.84883448, 0.80941641, 0.86814949, 0.80384751])

In [84]:
scipy.special.j1(xa_high / 2000)

array([0.38458998, 0.44981526, 0.40452127, 0.4151153 , 0.45450465,
       0.40254955, 0.42543268, 0.40630586, 0.3971264 , 0.39347409,
       0.4220247 , 0.43198746, 0.40755061, 0.38458998, 0.40362616,
       0.41301841, 0.40967583, 0.42509329, 0.42847294, 0.40594955,
       0.41179014, 0.38458998, 0.38234311, 0.38402922, 0.38626839,
       0.40594955, 0.40967583, 0.42030899, 0.37687111, 0.39676249,
       0.38997736, 0.38046292, 0.38794099, 0.46215228, 0.41702773,
       0.39730824, 0.39512126, 0.39163698, 0.38027451, 0.42847294,
       0.44489108, 0.41494098, 0.46063968, 0.41091051])

NumPy lets you do math in a highly optimized way.

In [87]:
xa_high + 2 * xa_high

array([5049, 6183, 5376, 5556, 6273, 5343, 5736, 5406, 5253, 5193, 5676,
       5853, 5427, 5049, 5361, 5520, 5463, 5730, 5790, 5400, 5499, 5049,
       5013, 5040, 5076, 5400, 5463, 5646, 4926, 5247, 5136, 4983, 5103,
       6423, 5589, 5256, 5220, 5163, 4980, 5790, 6090, 5553, 6393, 5484])

In [88]:
np.sum(xa_high)

80053

Use NumPy and SciPy to do things instead of coding them yourself.

In [89]:
%load_ext watermark
%watermark -v -p numpy,scipy,pandas,bokeh,bokeh_catplot,jupyterlab

CPython 3.7.7
IPython 7.13.0

numpy 1.18.1
scipy 1.4.1
pandas 0.24.2
bokeh 2.0.2
bokeh_catplot 0.1.7
jupyterlab 1.2.6
