#NumPy

**--> We know lists, sets, tuples, dictionaries.**

a. Very inefficient for numerical computation, typically 10x to 100x slower than NumPy arrays.

b. Designed to store heterogeneous data.

c. No low-level h/w mechanisms to accelerate operations on lists.



**--> NumPy is intended to bring performance and functionality improvements.**

1. NumPy at user (programming) level:

  1. Provide implementations of many functions across linear algebra, statistics,...
  2. Efficiently broadcast operations across dimensions.

2. NumPy at programming languages level:

  1.   Enable other packages to use numpy arrays as an efficient data interface. (written in C)
  2.   Efficiently process data without type-checking overhead.


3. NumPy at hardware and interface level:

  1. Enable easy file save and load of n-d arrays.
  2. Efficiently store n-d arrays in vectorised form to benefit from DRAM locality.

In [None]:
import numpy as np

##Comparing performance of Lists, etc.

In [None]:
N = 10000000

In [None]:
%%time
list_ = list(range(N))
for i in range(N):
  list_[i] = list_[i] * list_[i]

CPU times: user 3.02 s, sys: 178 ms, total: 3.2 s
Wall time: 3.21 s


In [None]:
%%time
list_ = list(range(N))
list_ = [item * item for item in list_]    # List comprehension: roughly 2x faster than the explicit index loop (wall time 1.5 s vs 3.21 s)

CPU times: user 1.14 s, sys: 356 ms, total: 1.5 s
Wall time: 1.5 s


In [None]:
%%time
list_ = list(range(N))
# map() applies the function to every element of the list; a lambda lets us pass any inline function.
# In Python 3, map() returns a lazy iterator, so it has to be wrapped in list() to actually
# perform the N multiplications. Without list() the cell only times the creation of the
# iterator, which is why it appeared to be ~5x faster than the explicit loop.
list_ = list(map(lambda x: x * x, list_))

CPU times: user 265 ms, sys: 163 ms, total: 428 ms
Wall time: 430 ms


In [None]:
%%time
list_ = list(range(N))
list_sum = 0
for item in list_:
  list_sum += item

CPU times: user 1.45 s, sys: 158 ms, total: 1.61 s
Wall time: 1.61 s


In [None]:
%%time
list_ = list(range(N))
list_sum = sum(list_)   #predefined functions are way faster

CPU times: user 325 ms, sys: 163 ms, total: 488 ms
Wall time: 487 ms


In [None]:
%%time
arr = np.arange(N)
arr = arr * arr          #Multiply item wise for any dimensions

CPU times: user 46.2 ms, sys: 2.89 ms, total: 49.1 ms
Wall time: 54.2 ms


In [None]:
%%time
arr = np.arange(N)
arr_sum = np.sum(arr)

CPU times: user 31.1 ms, sys: 3.88 ms, total: 35 ms
Wall time: 36.7 ms


##High Dimensional Array

Dimensions (axes) are indexed in the reverse of the order in which we added them: the most recently added, outermost dimension is axis 0.

**For 3-d array:**

Dim 2 -> x-axis

Dim 1 -> y-axis (downwards)

Dim 0 -> z-axis (along which we get multiple matrix)

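Shape of array = tuple giving the size along each dimension (the `size` attribute is the total number of elements, i.e. the product of the shape)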

In [None]:
arr = np.arange(5)
print(arr, type (arr))

[0 1 2 3 4] <class 'numpy.ndarray'>


In [None]:
arr = np.array([0, 2, 4, 6, 8])
arr

array([0, 2, 4, 6, 8])

In [None]:
arr.dtype  #If arr has any one element as float, then whole array is of type float64

dtype('int64')

In [None]:
arr.ndim

1

In [None]:
arr.shape

(5,)

In [None]:
arr.size

5

In [None]:
arr.itemsize

8

In [None]:
arr2d = np.array([
                  [1, 2, 3],
                  [4, 5, 6]
])
arr2d

array([[1, 2, 3],
       [4, 5, 6]])

In [None]:
arr2d.ndim

2

In [None]:
arr2d.shape

(2, 3)

In [None]:
arr2d.size

6

In [None]:
arr3d = np.array([
                  [
                   [1, 2, 3],
                   [4, 5, 6]
                  ],
                  [
                   [7, 8, 9],
                   [10, 11, 12]
                  ]
])
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [None]:
arr3d.ndim

3

In [None]:
arr3d.size

12

In [None]:
arr3d.shape

(2, 2, 3)

In [None]:
np.ones((2, 3, 4))

array([[[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]],

       [[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]]])

In [None]:
172 * np.ones((2, 3))   #multiply by broadcasting

array([[172., 172., 172.],
       [172., 172., 172.]])

In [None]:
np.zeros((2, 3))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [None]:
np.random.randn(2, 3)   #random number of normal distribution with mean = 0 and sd = 1

array([[-0.8298848 , -0.23685863,  1.77352052],
       [-1.01977864,  1.02954827, -1.59436032]])

In [None]:
np.random.rand(2, 3)   # uniformly distributed numbers in [0, 1)

array([[0.9515171 , 0.15094756, 0.09665455],
       [0.63606107, 0.14670837, 0.17702847]])

In [None]:
np.random.randint(0, 100, (2, 3))  # random ints from 0 (inclusive) to 100 (exclusive), with shape (2, 3)

array([[78, 79, 71],
       [30, 33,  0]])

In [None]:
np.arange(7, 71, 10)   # values from 7 up to (but not including) 71 in steps of 10

array([ 7, 17, 27, 37, 47, 57, 67])

In [None]:
np.linspace(7, 70, 10)  # 10 evenly spaced floats from 7 to 70, both endpoints included

array([ 7., 14., 21., 28., 35., 42., 49., 56., 63., 70.])

In [None]:
np.array([True, False, True])   #np arrays can have bool too

array([ True, False,  True])

In [None]:
str_arr = np.array(['1.4', '2.1', '1.1'])    #np array of strings of 3 unit length

In [None]:
arr = np.array(str_arr, dtype = 'float')  #converting str arr to float arr

In [None]:
arr

array([1.4, 2.1, 1.1])

##Indexing

In [None]:
print(arr3d)

[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]]


In [None]:
arr3d[0, 1, 2]

6

In [None]:
i = 1
j = 0
k = 2
arr3d[i, j, k]

9

In [None]:
arr3d[1, :, :]

array([[ 7,  8,  9],
       [10, 11, 12]])

In [None]:
arr3d[:, 1, :]

array([[ 4,  5,  6],
       [10, 11, 12]])

In [None]:
arr3d[:, :, 0:2]

array([[[ 1,  2],
        [ 4,  5]],

       [[ 7,  8],
        [10, 11]]])

In [None]:
arr3d % 2 == 0

array([[[False,  True, False],
        [ True, False,  True]],

       [[False,  True, False],
        [ True, False,  True]]])

In [None]:
arr3d[arr3d % 2 == 0]

array([ 2,  4,  6,  8, 10, 12])

In [None]:
arr3d[(arr3d % 2 == 1) & (arr3d > 3)]

array([ 5,  7,  9, 11])

In [None]:
arr_slice = arr3d[:,:,0:2]

In [None]:
print(type(arr_slice))  #but it is not a deep copy, but just the reference copy. So any change to it will be shown in original.

<class 'numpy.ndarray'>


In [None]:
arr_slice.ndim

3

In [None]:
arr_slice.shape

(2, 2, 2)

In [None]:
arr_slice[0, 0, 1] = 69

In [None]:
arr_slice

array([[[ 1, 69],
        [ 4,  5]],

       [[ 7,  8],
        [10, 11]]])

In [None]:
arr3d

array([[[ 1, 69,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [None]:
arr_slice = np.copy(arr3d[:,:,0:2])

In [None]:
arr_slice

array([[[ 1, 69],
        [ 4,  5]],

       [[ 7,  8],
        [10, 11]]])

In [None]:
arr_slice[0, 0, 1] = 2
arr_slice

array([[[ 1,  2],
        [ 4,  5]],

       [[ 7,  8],
        [10, 11]]])

In [None]:
arr3d

array([[[ 1, 69,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [None]:
arr = np.random.randint(0, 10, 5)
arr

array([8, 2, 6, 4, 1])

In [None]:
my_indices = [1, 3, 4]

In [None]:
arr[my_indices]

array([2, 4, 1])

##NumPy Operations

In [None]:
arr1 = np.random.rand(3, 4)
arr2 = np.random.rand(3, 4)

In [None]:
arr1

array([[0.45735825, 0.5497882 , 0.77187229, 0.46436311],
       [0.70994811, 0.14399094, 0.97481363, 0.95949001],
       [0.0752398 , 0.2914936 , 0.34935899, 0.69111993]])

In [None]:
arr2

array([[0.69990227, 0.62011333, 0.33815225, 0.84714904],
       [0.63718533, 0.49396816, 0.82513979, 0.14634831],
       [0.5191014 , 0.37023482, 0.88292126, 0.15783079]])

In [None]:
arr1 + arr2

array([[1.15726052, 1.16990153, 1.11002454, 1.31151215],
       [1.34713344, 0.63795909, 1.79995342, 1.10583832],
       [0.5943412 , 0.66172842, 1.23228026, 0.84895072]])

In [None]:
arr1 -arr2

array([[-0.24254402, -0.07032513,  0.43372003, -0.38278594],
       [ 0.07276278, -0.34997722,  0.14967383,  0.8131417 ],
       [-0.44386159, -0.07874122, -0.53356227,  0.53328914]])

In [None]:
arr1 * arr2

array([[0.32010608, 0.34093099, 0.26101035, 0.39338476],
       [0.45236852, 0.07112694, 0.80435751, 0.14041975],
       [0.03905709, 0.10792108, 0.30845648, 0.10908001]])

In [None]:
arr1/arr2

array([[0.65346016, 0.8865931 , 2.2826176 , 0.54814806],
       [1.11419406, 0.29149842, 1.18139209, 6.55620816],
       [0.1449424 , 0.78732087, 0.39568533, 4.37886626]])

In [None]:
np.exp(arr1)

array([[1.57989478, 1.73288595, 2.16381374, 1.59100057],
       [2.03388572, 1.15487364, 2.65067315, 2.61036487],
       [1.07814266, 1.33842507, 1.41815821, 1.9959496 ]])

In [None]:
np.log(np.exp(arr1))

array([[0.45735825, 0.5497882 , 0.77187229, 0.46436311],
       [0.70994811, 0.14399094, 0.97481363, 0.95949001],
       [0.0752398 , 0.2914936 , 0.34935899, 0.69111993]])

In [None]:
np.sin(arr1)

array([[0.44157941, 0.52250665, 0.69747815, 0.44785344],
       [0.65179442, 0.14349388, 0.82759729, 0.81889897],
       [0.07516883, 0.28738314, 0.34229559, 0.63740052]])

In [None]:
np.sqrt(arr1)

array([[0.67628267, 0.74147704, 0.87856262, 0.68144193],
       [0.84258419, 0.37946138, 0.9873265 , 0.97953561],
       [0.27429875, 0.53990147, 0.59106598, 0.83133623]])

In [None]:
arr_inv = 1 /  arr1

In [None]:
arr_inv

array([[ 2.1864698 ,  1.81888226,  1.29555111,  2.15348718],
       [ 1.40855364,  6.94488157,  1.02583712,  1.04222034],
       [13.29083743,  3.43060706,  2.86238516,  1.44692688]])

In [None]:
arr_inv = 1 / np.zeros((3, 4))
arr_inv

  """Entry point for launching an IPython kernel.


array([[inf, inf, inf, inf],
       [inf, inf, inf, inf],
       [inf, inf, inf, inf]])

In [None]:
np.isinf(arr_inv)

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True]])

##Exercise on finding number of points outside n-dimensional sphere 

Find one quarter of the area of the square that is not covered by the circle (square side = 2 units, circle radius = 1 unit). By symmetry we only need to look at the first quadrant, where this equals 1 - π/4.

In [None]:
ndim = 2

In [None]:
npoints = 100000

In [None]:
points = np.random.rand(npoints, ndim)

In [None]:
points[0:2, :]

array([[0.35557206, 0.8870308 ],
       [0.26132088, 0.78257173]])

In [None]:
dfo = np.zeros((npoints, 1))  #distance from origin
outside_point = 0

In [None]:
%%time
# Pure-Python baseline: count the points whose distance from the origin exceeds 1.
# dfo and outside_point are initialised in the previous cell, so re-running only
# this cell keeps accumulating into the old values.
for i in range(npoints):
  for j in range(ndim):
    dfo[i] += points[i, j] ** 2   # accumulate the squared coordinates of point i
  dfo[i] = np.sqrt(dfo[i])        # Euclidean distance of point i from the origin
  if (dfo[i] > 1):
    outside_point += 1            # point lies outside the unit circle

CPU times: user 1.37 s, sys: 131 ms, total: 1.5 s
Wall time: 1.38 s


In [None]:
print("Fraction of points outside is: ", outside_point/npoints)

Fraction of points outside is:  0.21293


In [None]:
# 1 - (pi/4)
1- 3.14/4

0.21499999999999997

In [None]:
%%time
sq_pts = points * points      # faster way --> NumPy: square every coordinate at once
dfo = np.sum(sq_pts, axis= 1)  # squared distance from the origin, one value per point
# No sqrt is needed: for non-negative d, d > 1 exactly when d**2 > 1.
out_pts = np.sum(dfo > 1)      # summing the boolean mask counts the True entries

CPU times: user 3.76 ms, sys: 1.01 ms, total: 4.76 ms
Wall time: 7.41 ms


In [None]:
print("Fraction of points outside is: ", out_pts / npoints)

Fraction of points outside is:  0.21293


In [None]:
%%time
# One-line version of the same computation. The comparison must be applied to the
# squared distances, not wrapped inside np.sqrt: np.sqrt of a boolean mask yields
# float16 values, and summing more than 65504 of them overflows to inf.
# No sqrt is needed at all, since d > 1 exactly when d**2 > 1 for non-negative d.
outside_points = np.sum(np.sum(points * points, axis = 1) > 1)/npoints

CPU times: user 7.42 ms, sys: 118 µs, total: 7.54 ms
Wall time: 6.66 ms


In [None]:
def area_outside_circle (npoints, ndim):
  """Estimate the fraction of the unit hypercube lying outside the unit hypersphere.

  Draws ``npoints`` points uniformly from [0, 1)^ndim and returns the fraction
  whose Euclidean distance from the origin is greater than 1.

  Args:
    npoints: number of random points to sample.
    ndim: number of dimensions of each point.

  Returns:
    The fraction of sampled points outside the unit hypersphere, between 0 and 1.
  """
  points = np.random.rand (npoints, ndim)
  # Squared distance of every point from the origin; sqrt is unnecessary because
  # d > 1 exactly when d**2 > 1 for non-negative d.
  sq_dist = np.sum(points * points, axis = 1)
  # Count the boolean mask directly. Taking np.sqrt of the mask (as an earlier
  # version did) produces float16 values whose sum overflows to inf once more
  # than 65504 points are outside.
  return np.sum(sq_dist > 1) / npoints

In [None]:
area_outside_circle(100000, 2)

0.21632

In [None]:
for i in range(2, 11):
  print(i, area_outside_circle(100000, i))
  # As the dimension grows, almost all random points in the unit hypercube fall outside the unit hypersphere,
  # which means we need to think differently in ML/DL when working in high dimensions.
  # NOTE: a fraction can never exceed 1, so any `inf` in the recorded output is an
  # artifact (overflow in the summation), not a real result.

2 0.2136
3 0.47552
4 inf
5 inf
6 inf
7 inf
8 inf
9 inf
10 inf


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


##Broadcasting

NumPy compares the shapes backward, starting from the last dimension (the x-axis). For each dimension it checks whether the sizes of the two tensors match; if one tensor is missing that dimension, or its size along that dimension is 1, NumPy replicates that tensor along the dimension so that the operation can be performed. This is **Broadcasting**. Broadcasting can expand more than one dimension at a time.

E.g. adding two matrices of shapes (4, 1) and (1, 5) gives a result of shape (4, 5).

In [None]:
arr1 = np.arange(6)
arr1

array([0, 1, 2, 3, 4, 5])

In [None]:
arr1.shape

(6,)

In [None]:
arr1 = arr1.reshape((3, 2))
arr1

array([[0, 1],
       [2, 3],
       [4, 5]])

In [None]:
arr1.shape

(3, 2)

In [None]:
arr2 = np.arange(6).reshape((3, 2))
arr2

array([[0, 1],
       [2, 3],
       [4, 5]])

In [None]:
arr1 + arr2

array([[ 0,  2],
       [ 4,  6],
       [ 8, 10]])

In [None]:
arr2[0].reshape((1, 2))

array([[0, 1]])

In [None]:
arr2[0]

array([0, 1])

In [None]:
arr1 + arr2[0].reshape((1, 2))   #(3, 2) + (1, 2)

array([[0, 2],
       [2, 4],
       [4, 6]])

In [None]:
arr2[:, 0].reshape((3, 1))

array([[0],
       [2],
       [4]])

In [None]:
arr1 + arr2[:, 0].reshape((3, 1))     # (3, 2) + (3, 1)

array([[0, 1],
       [4, 5],
       [8, 9]])

In [None]:
arr1+ 1

array([[1, 2],
       [3, 4],
       [5, 6]])

In [None]:
arr1 = np.arange(24).reshape((2, 3, 4))
arr1

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])

In [None]:
arr2 =  np.ones((1, 4))
arr2

array([[1., 1., 1., 1.]])

In [None]:
arr1 + arr2  #(2, 3, 4) + (1, 4)

array([[[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]],

       [[13., 14., 15., 16.],
        [17., 18., 19., 20.],
        [21., 22., 23., 24.]]])

In [None]:
arr1 = np.arange(4)
arr1

array([0, 1, 2, 3])

In [None]:
arr1.shape

(4,)

In [None]:
arr2 = np.arange(5)
arr2

array([0, 1, 2, 3, 4])

In [None]:
arr2.shape

(5,)

In [None]:
arr1 + arr2   #can't be broadcast

ValueError: ignored

In [None]:
arr1.reshape((4, 1)) + arr2      #(4, 1) + (5)

array([[0, 1, 2, 3, 4],
       [1, 2, 3, 4, 5],
       [2, 3, 4, 5, 6],
       [3, 4, 5, 6, 7]])

In [None]:
arr = np.random.rand(3, 3)
arr

array([[0.9490919 , 0.75639235, 0.6384193 ],
       [0.39477865, 0.7963173 , 0.67152948],
       [0.12203281, 0.80189301, 0.08262947]])

In [None]:
arr.T   #Taking transpose

array([[0.9490919 , 0.39477865, 0.12203281],
       [0.75639235, 0.7963173 , 0.80189301],
       [0.6384193 , 0.67152948, 0.08262947]])

##File Handling

In [None]:
planets_small = np.loadtxt("planets_small.txt", skiprows = 1, 
                           usecols = [1, 2, 3, 4, 5, 6, 7, 8, 9])    #numpy cant read strings

In [None]:
planets_small

array([[3.3000e-01, 4.8700e+00, 5.9700e+00, 6.4200e-01, 1.8980e+03,
        5.6800e+02, 8.6800e+01, 1.0200e+02, 1.4600e-02],
       [5.7900e+01, 1.0820e+02, 1.4960e+02, 2.2790e+02, 7.7860e+02,
        1.4335e+03, 2.8725e+03, 4.4951e+03, 5.9064e+03],
       [4.2226e+03, 2.8020e+03, 2.4000e+01, 2.4700e+01, 9.9000e+00,
        1.0700e+01, 1.7200e+01, 1.6100e+01, 1.5330e+02]])

In [None]:
planets_small.shape

(3, 9)

In [None]:
#planets = np.loadtxt("planets.txt", skiprows = 1, 
                           #usecols = [1, 2, 3, 4, 5, 6, 7, 8, 9])  #It has unknown values, so it wont work 

planets = np.genfromtxt("planets.txt", skip_header= 1, usecols = [1, 2, 3, 4, 5, 6, 7, 8, 9]) #It convert text value to "nan"

In [None]:
planets

array([[ 3.30000e-01,  4.87000e+00,  5.97000e+00,  7.30000e-02,
         6.42000e-01,  1.89800e+03,  5.68000e+02,  8.68000e+01,
         1.02000e+02],
       [ 4.87900e+03,  1.21040e+04,  1.27560e+04,  3.47500e+03,
         6.79200e+03,  1.42984e+05,  1.20536e+05,  5.11180e+04,
         4.95280e+04],
       [ 5.42700e+03,  5.24300e+03,  5.51400e+03,  3.34000e+03,
         3.93300e+03,  1.32600e+03,  6.87000e+02,  1.27100e+03,
         1.63800e+03],
       [ 3.70000e+00,  8.90000e+00,  9.80000e+00,  1.60000e+00,
         3.70000e+00,  2.31000e+01,  9.00000e+00,  8.70000e+00,
         1.10000e+01],
       [ 4.30000e+00,  1.04000e+01,  1.12000e+01,  2.40000e+00,
         5.00000e+00,  5.95000e+01,  3.55000e+01,  2.13000e+01,
         2.35000e+01],
       [ 1.40760e+03, -5.83250e+03,  2.39000e+01,  6.55700e+02,
         2.46000e+01,  9.90000e+00,  1.07000e+01, -1.72000e+01,
         1.61000e+01],
       [ 4.22260e+03,  2.80200e+03,  2.40000e+01,  7.08700e+02,
         2.47000e+01,  9.90000

In [None]:
planets.shape

(20, 9)

In [None]:
np.isnan(planets)

array([[False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, 

In [None]:
planets_new = np.nan_to_num(planets)
planets_new

array([[ 3.30000e-01,  4.87000e+00,  5.97000e+00,  7.30000e-02,
         6.42000e-01,  1.89800e+03,  5.68000e+02,  8.68000e+01,
         1.02000e+02],
       [ 4.87900e+03,  1.21040e+04,  1.27560e+04,  3.47500e+03,
         6.79200e+03,  1.42984e+05,  1.20536e+05,  5.11180e+04,
         4.95280e+04],
       [ 5.42700e+03,  5.24300e+03,  5.51400e+03,  3.34000e+03,
         3.93300e+03,  1.32600e+03,  6.87000e+02,  1.27100e+03,
         1.63800e+03],
       [ 3.70000e+00,  8.90000e+00,  9.80000e+00,  1.60000e+00,
         3.70000e+00,  2.31000e+01,  9.00000e+00,  8.70000e+00,
         1.10000e+01],
       [ 4.30000e+00,  1.04000e+01,  1.12000e+01,  2.40000e+00,
         5.00000e+00,  5.95000e+01,  3.55000e+01,  2.13000e+01,
         2.35000e+01],
       [ 1.40760e+03, -5.83250e+03,  2.39000e+01,  6.55700e+02,
         2.46000e+01,  9.90000e+00,  1.07000e+01, -1.72000e+01,
         1.61000e+01],
       [ 4.22260e+03,  2.80200e+03,  2.40000e+01,  7.08700e+02,
         2.47000e+01,  9.90000

In [None]:
np.savetxt('planets_new.txt', planets_new, delimiter= ',')  #a new file is created in human readable format

In [None]:
np.save("planets_new", planets_new)    #a new file is created in only computer readable format

In [None]:
!ls -lh     #to see the size of files, we see .npy file is of much smaller size than .txt file

total 24K
-rw-r--r-- 1 root root 1.6K Aug  5 12:14 planets_new.npy
-rw-r--r-- 1 root root 4.5K Aug  5 12:11 planets_new.txt
-rw-r--r-- 1 root root  254 Aug  5 11:51 planets_small.txt
-rw-r--r-- 1 root root 1.5K Aug  5 12:03 planets.txt
drwxr-xr-x 1 root root 4.0K Jul 16 13:20 sample_data


In [None]:
arr1 = np.random.rand(1000, 10)
arr2 = np.random.rand(2000, 5)
arr3 = np.random.rand(20, 10000)

In [None]:
np.savez("many_arrs",arr1, arr2, arr3)  #To save multiple ndarrays in a np file

In [None]:
!ls -lh

total 1.8M
-rw-r--r-- 1 root root 1.7M Aug  5 12:19 many_arrs.npz
-rw-r--r-- 1 root root 1.6K Aug  5 12:14 planets_new.npy
-rw-r--r-- 1 root root 4.5K Aug  5 12:11 planets_new.txt
-rw-r--r-- 1 root root  254 Aug  5 11:51 planets_small.txt
-rw-r--r-- 1 root root 1.5K Aug  5 12:03 planets.txt
drwxr-xr-x 1 root root 4.0K Jul 16 13:20 sample_data


In [None]:
arrs = np.load("many_arrs.npz")

In [None]:
print(type(arrs))

<class 'numpy.lib.npyio.NpzFile'>


In [None]:
arrs.files

['arr_0', 'arr_1', 'arr_2']

In [None]:
arrs['arr_0'].shape

(1000, 10)

In [None]:
#compressing data
arr1 = np.zeros((10000, 10000))
np.savez("zeros", arr1)
np.savez_compressed('zeros_compressed', arr1)

In [None]:
!ls -lh

total 766M
-rw-r--r-- 1 root root 1.7M Aug  5 12:19 many_arrs.npz
-rw-r--r-- 1 root root 1.6K Aug  5 12:14 planets_new.npy
-rw-r--r-- 1 root root 4.5K Aug  5 12:11 planets_new.txt
-rw-r--r-- 1 root root  254 Aug  5 11:51 planets_small.txt
-rw-r--r-- 1 root root 1.5K Aug  5 12:03 planets.txt
drwxr-xr-x 1 root root 4.0K Jul 16 13:20 sample_data
-rw-r--r-- 1 root root 760K Aug  5 12:23 zeros_compressed.npz
-rw-r--r-- 1 root root 763M Aug  5 12:23 zeros.npz


##Stats with NumPy

In [None]:
arr = np.random.rand(100000,)   #uniformly distributed
#For normal distributed, use randn

In [None]:
np.amin(arr)

1.3911039276059967e-07

In [None]:
np.amax(arr)

0.9999959505563843

In [None]:
np.mean(arr)

0.5017275452699348

In [None]:
np.var(arr)

0.08323685181648972

In [None]:
np.std(arr)

0.2885079753082915

In [None]:
np.median(arr)

0.5026523762382655

In [None]:
np.percentile(arr, 50)

0.5026523762382655

In [None]:
np.percentile(arr, 10)  # close to 0.1, since arr is uniformly distributed on [0, 1)

0.10160578503072504

In [None]:
#inter-quartile range
iqr = np.percentile(arr, 75) - np.percentile(arr, 25)
iqr

0.5001794207194195

In [None]:
quartiles = np.percentile(arr, [25, 75])
quartiles

array([0.25182949, 0.75200891])

In [None]:
iqr = quartiles[1] - quartiles[0]
iqr

0.5001794207194195

In [None]:
z_scores = (arr - np.mean(arr))/np.std(arr)  #how left or right a value is from mean
z_scores

array([-0.31514832,  0.42672519,  0.87719473, ...,  0.29611161,
        1.51724158, -1.44356143])

In [None]:
np.histogram(arr)    #It will give 10 bins by default, and their ranges, which are almost equal
#first array gives num of points in each bin

(array([ 9979, 10010, 10067,  9936,  9998,  9878, 10118,  9996, 10000,
        10018]),
 array([2.53921680e-05, 1.00021316e-01, 2.00017240e-01, 3.00013163e-01,
        4.00009087e-01, 5.00005011e-01, 6.00000935e-01, 6.99996858e-01,
        7.99992782e-01, 8.99988706e-01, 9.99984630e-01]))

In [None]:
np.histogram(arr, bins=5)

(array([19989, 20003, 19876, 20114, 20018]),
 array([2.53921680e-05, 2.00017240e-01, 4.00009087e-01, 6.00000935e-01,
        7.99992782e-01, 9.99984630e-01]))

In [None]:
np.histogram(arr, bins=[0, 0.25, 0.5, 0.75, 1])

(array([24936, 25054, 24931, 25079]), array([0.  , 0.25, 0.5 , 0.75, 1.  ]))

In [None]:
bins = [0, 0.25, 0.5, 0.75, 1]
#it gives 4 bins of range
#(0-0.25) - bin 1
#(0.25-0.5) - bin 2
#(0.5-0.75) - bin 3
#(0.75-1) - bin 4

In [None]:
np.digitize(arr, bins) #to check in which bin a particular point will come

array([1, 2, 2, ..., 2, 2, 4])

In [None]:
arr1 = np.random.randint(0, 10, (10))
arr1

array([1, 5, 9, 9, 7, 9, 8, 7, 1, 7])

In [None]:
bins = [0, 7, 10]

In [None]:
np.digitize(arr1, bins)  #(0-7) bin will not contain 7 in it (like in range)

array([1, 1, 2, 2, 2, 2, 2, 2, 1, 2])

In [None]:
np.digitize(arr1, bins, right = True)  # right=True makes each bin include its right edge, so 7 falls in the first bin here

array([1, 1, 2, 2, 1, 2, 2, 1, 1, 1])

In [None]:
arr1 = np.random.randint(50, 80, 100)  #create 100 ints between 50 and 80
arr1  #weight

array([53, 72, 68, 55, 52, 71, 52, 66, 65, 51, 61, 64, 51, 78, 58, 62, 71,
       64, 55, 72, 62, 72, 63, 78, 74, 77, 65, 75, 70, 76, 74, 75, 73, 71,
       59, 67, 55, 58, 76, 71, 67, 67, 74, 75, 57, 50, 76, 58, 65, 56, 62,
       75, 58, 61, 57, 75, 65, 60, 56, 70, 51, 69, 62, 79, 75, 53, 59, 75,
       51, 59, 59, 60, 76, 74, 51, 57, 55, 79, 76, 66, 74, 69, 56, 78, 68,
       55, 79, 67, 69, 73, 63, 69, 72, 57, 59, 74, 51, 57, 71, 58])

In [None]:
arr2 = np.random.randint(150, 185, 100)
arr2   #heights

array([164, 164, 155, 159, 184, 151, 174, 163, 153, 151, 181, 168, 175,
       168, 155, 172, 184, 182, 158, 162, 180, 164, 157, 180, 156, 152,
       166, 161, 168, 176, 181, 163, 177, 163, 161, 157, 174, 165, 165,
       180, 171, 171, 151, 156, 181, 177, 164, 181, 181, 172, 158, 177,
       159, 169, 150, 157, 162, 183, 170, 181, 172, 169, 166, 160, 175,
       161, 158, 177, 182, 150, 182, 166, 175, 163, 158, 164, 155, 182,
       171, 157, 166, 167, 154, 172, 172, 172, 182, 171, 152, 184, 178,
       159, 156, 166, 177, 178, 151, 159, 177, 156])

In [None]:
arr3 = np.random.randint(17, 22, 100)
arr3   #age

array([21, 17, 17, 19, 18, 20, 18, 21, 19, 17, 20, 19, 19, 18, 19, 20, 17,
       18, 21, 19, 17, 21, 21, 17, 21, 19, 18, 17, 19, 21, 20, 17, 18, 18,
       17, 17, 20, 19, 21, 21, 18, 18, 18, 19, 21, 19, 19, 21, 17, 21, 19,
       18, 20, 19, 20, 17, 19, 19, 19, 20, 19, 19, 20, 21, 17, 20, 20, 17,
       21, 18, 21, 17, 19, 20, 19, 20, 21, 18, 19, 19, 17, 19, 20, 19, 18,
       21, 21, 19, 17, 17, 20, 17, 17, 19, 20, 20, 17, 18, 19, 19])

In [None]:
np.concatenate((arr1, arr2, arr3)).shape  #it will stack all arrays horizontally

(300,)

In [None]:
np.vstack((arr1, arr2, arr3)).shape   #it will stack arrays vertically

(3, 100)

In [None]:
arr2d = np.vstack((arr1, arr2, arr3))

In [None]:
np.amin(arr2d, axis=1)    # minimum of each row (weight, height, age), taken across the 100 columns

array([ 50, 150,  17])

In [None]:
np.amax(arr2d, axis=1) 

array([ 79, 184,  21])

In [None]:
np.mean(arr2d, axis=1) 

array([ 65.21, 167.32,  18.96])

##Rules of Statistics

###Mean subtracted array has zero means

In [None]:
arr = np.random.rand(1000)

In [None]:
mean = np.mean(arr)

In [None]:
arr1 = arr - mean

In [None]:
np.mean(arr1)   #it will be zero

1.5543122344752193e-17

###Computing mean with smaller set of values

In [None]:
arr = np.random.rand(1000)

In [None]:
for k in range(1, 50):
  arr1 = arr[0:k]
  print(k, np.mean(arr1))   #mean near to 0.5

1 0.4867397933688532
2 0.5013749112340372
3 0.4784672252174231
4 0.5554358210692614
5 0.520359632270045
6 0.4975471071517734
7 0.49312110458547426
8 0.47743456235556925
9 0.4410861489418647
10 0.43880125763486316
11 0.4811686787455541
12 0.48455635931517477
13 0.45677490161510964
14 0.4256277136045612
15 0.41637570761583015
16 0.44682334924363254
17 0.4307759547007176
18 0.4416614826616449
19 0.43755176904778265
20 0.4650741555185588
21 0.4873969457411753
22 0.4719100696595273
23 0.46375091210546165
24 0.44554718166349555
25 0.4572339678088515
26 0.44339615943675936
27 0.42838413844473955
28 0.4241962687513824
29 0.4152745323461184
30 0.40541253923720616
31 0.4065561290038529
32 0.423496038415929
33 0.41607006326817014
34 0.4242051248693257
35 0.41338666827409803
36 0.4137244075640465
37 0.4249556656329141
38 0.4299975964091281
39 0.44328024051547404
40 0.4526040096221193
41 0.46489707829641536
42 0.4714313814040432
43 0.4616923259652535
44 0.46732328549313207
45 0.4698624928584901
46 

In [None]:
#cumulative sum
means = np.cumsum(arr)/np.arange(1,1001)
means[0:15]   #near to 0.5

array([0.48673979, 0.50137491, 0.47846723, 0.55543582, 0.52035963,
       0.49754711, 0.4931211 , 0.47743456, 0.44108615, 0.43880126,
       0.48116868, 0.48455636, 0.4567749 , 0.42562771, 0.41637571])

###Effect of outliers on mean and median

In [None]:
arr = np.random.randint(1, 100, 100)

In [None]:
np.mean(arr)

42.83

In [None]:
np.median(arr)

42.0

In [None]:
arr = np.append(arr, [1000, 2000])

In [None]:
arr.shape

(102,)

In [None]:
np.mean(arr)    #sensitive to outliers

71.40196078431373

In [None]:
np.median(arr)     #not sensitive of outliers

43.5

###Effect of scaling arrays on mean and median

In [None]:
arr = np.random.rand(100)

In [None]:
np.mean(arr)

0.5296047884818493

In [None]:
np.median(arr)

0.5773814739907634

In [None]:
arr1 = 2.5 * arr + 0.65   #random scaling and adding offset

In [None]:
print(np.mean(arr1), 2.5*np.mean(arr)+0.65)   
#checking the case if the scaled mean is similar to the mean of scaled array

1.9740119712046234 1.9740119712046234


In [None]:
print(np.median(arr1), 2.5*np.median(arr)+0.65)  #holds true for median too

2.0934536849769088 2.0934536849769088


In [None]:
print(np.var(arr1), 2.5*2.5*np.var(arr))  #holds true for variance too
#in this, scaled term is squared and offset term will not considered  

0.534128288548747 0.5341282885487473


In [None]:
print(np.std(arr1), 2.5*np.std(arr))  #holds true for std deviation too

0.7308408093071617 0.7308408093071618


In [None]:
arr1 = np.random.rand(100)
arr2 = np.random.rand(100)

In [None]:
print(np.mean(0.21 * arr1 - 0.75 *arr2), 0.21 * np.mean(arr1) - 0.75 * np.mean(arr2))

-0.26116904521199413 -0.261169045211994


#Case Study

In [None]:
!head cric_data-200320-181217.tsv

	Sachin Tendulkar	Rahul Dravid	India
0	100	78	342
1	11	62	191
2	8	85	252
3	71	24	307
4	104	17	229
5	18	104	246
6	8	76	226
7	86	74	288
8	12	60	216




1.   Find mean, median, IQR for Sachin, Rahul and India.
2.   Find the histogram of Sachin's scores with 10 bins.
3. Find mean of Sachin's scores grouped by 25 matches.
4. Find mean of Sachin's scores when he has scored a century.
5. Find mean for Sachin's score when Rahul has scored less than 10.
6. Find mean for Sachin's scores based on which quartile India's score falls in.
7. For every match find out who has scored more - Sachin or Rahul.
8. How many more runs does Sachin score on average after having scored x runs.
9. How many matches did Sachin take to score the  first 1000 runs, next 1000 runs,...



In [None]:
runs = np.loadtxt('cric_data-200320-181217.tsv', skiprows=1, usecols=[1,2,3])
runs

array([[100.,  78., 342.],
       [ 11.,  62., 191.],
       [  8.,  85., 252.],
       [ 71.,  24., 307.],
       [104.,  17., 229.],
       [ 18., 104., 246.],
       [  8.,  76., 226.],
       [ 86.,  74., 288.],
       [ 12.,  60., 216.],
       [ 85.,  12., 224.],
       [ 18.,  63., 161.],
       [  4., 107., 276.],
       [  7.,  76., 283.],
       [ 37.,   4., 297.],
       [ 14.,   5., 139.],
       [  0.,  33., 224.],
       [  4.,   7., 178.],
       [  0.,   0.,   0.],
       [ 21.,  36., 193.],
       [  1.,  66., 231.],
       [ 62.,   0., 134.],
       [  0., 123., 246.],
       [138.,  39., 299.],
       [ 38.,   9., 242.],
       [  2.,  11., 214.],
       [ 46.,  14., 152.],
       [ 65.,   0., 104.],
       [  0.,   0.,   4.],
       [ 39.,  26., 155.],
       [ 48.,   4., 168.],
       [141.,  48., 282.],
       [ 62.,   7., 228.],
       [ 12.,  73., 231.],
       [  1.,  86., 238.],
       [ 41.,  32., 255.],
       [ 11.,  82., 273.],
       [  3.,  25., 143.],
 

In [None]:
runs.shape

(225, 3)

In [None]:
#ques1
iqr = np.percentile(runs, 75, axis=0) - np.percentile(runs, 25, axis=0)
print(np.mean(runs, axis=0), np.median(runs, axis=0), iqr)

[ 39.87555556  32.06222222 220.79555556] [ 27.  22. 216.] [57. 46. 98.]


In [None]:
#ques1 - Way 2
def stats(col):
  """Print the mean, median and inter-quartile range (Q3 - Q1) of ``col``."""
  average = np.mean(col)
  middle = np.median(col)
  upper_quartile = np.percentile(col, 75)
  lower_quartile = np.percentile(col, 25)
  # Emit one "label value" line per statistic, in a fixed order.
  for label, value in (
      ("Mean: ", average),
      ("Median: ", middle),
      ("IQR: ", upper_quartile - lower_quartile),
  ):
    print(label, value)

In [None]:
sachin = runs[:, 0]
rahul = runs[:, 1]
india = runs[:, 2]

In [None]:
stats(sachin)

Mean:  39.87555555555556
Median:  27.0
IQR:  57.0


In [None]:
stats(rahul)

Mean:  32.062222222222225
Median:  22.0
IQR:  46.0


In [None]:
stats(india)

Mean:  220.79555555555555
Median:  216.0
IQR:  98.0


In [None]:
#ques2
np.histogram(runs[:,0])

(array([99, 36, 28, 16, 11, 17,  8,  8,  1,  1]),
 array([  0. ,  18.6,  37.2,  55.8,  74.4,  93. , 111.6, 130.2, 148.8,
        167.4, 186. ]))

In [None]:
#ques3
sac = runs[:,0]
mean_ = []
n = len(sac)
i = 0
while (i < n):
  mean_.append(np.mean(sac[i:i+25]))
  i += 25
mean_

[33.96, 49.4, 38.48, 40.16, 39.36, 38.2, 44.6, 39.52, 35.2]

In [None]:
#ques3 - Way 2
# Reshape into rows of 25 matches each; -1 lets NumPy infer the number of groups
# (9 for 225 matches) instead of hard-coding it. The number of matches must still
# be a multiple of 25, otherwise reshape raises ValueError.
sachin25 = sac.reshape(-1, 25)
np.mean(sachin25, axis=1)   # mean of every 25-match group

array([33.96, 49.4 , 38.48, 40.16, 39.36, 38.2 , 44.6 , 39.52, 35.2 ])

In [None]:
#ques4
century = sac[sac >= 100]
np.mean(century)
#print(century)

125.0

In [None]:
#ques5
rah10 = runs[runs[:, 1] < 10]
np.mean(rah10[:,0])

#another way:   np.mean(sachin[rahul<10])

40.74285714285714

In [None]:
#ques6
qrs = np.percentile(india, [25, 50, 75, 100])
qrs

array([175., 216., 273., 499.])

If 0 <= India < 175, Sachin's avg. ...

If 175 <= India < 216, Sachin's avg. ...

If 216 <= India < 273, Sachin's avg. ...

If India >= 273, Sachin's avg. ... 

In [None]:
india.shape

(225,)

In [None]:
qrs.shape

(4,)

In [None]:
qrs = qrs.reshape(4, 1)

In [None]:
# Build one boolean row per quartile range of India's score:
#   row 0: India < Q1, row 1: Q1 <= India < Q2, row 2: Q2 <= India < Q3, row 3: India >= Q3.
# A plain `india < qrs` would give cumulative masks instead (row 1 would also contain
# every match of row 0, and so on), and its last row would drop the match with the
# maximum score because the comparison is strict.
edges = np.concatenate(([-np.inf], qrs.ravel()[:-1], [np.inf]))
indices = (india >= edges[:-1, None]) & (india < edges[1:, None])

In [None]:
indices.shape    #4 rows with all India value telling true/false

(4, 225)

In [None]:
sachin[indices[0, :]]   # Sachin's scores where India is below its 25th percentile (first quartile)
# Next step: take the mean of Sachin's scores for every quartile.

array([18., 14.,  0., 62., 46., 65.,  0., 39., 48.,  3., 11., 65., 27.,
       28.,  3.,  4., 15., 40.,  5.,  8., 89.,  0.,  0.,  1.,  0.,  0.,
       81., 13.,  2., 36., 12., 19.,  0.,  6., 35.,  0., 44.,  3., 47.,
       17., 35., 33.,  7.,  9.,  2., 11., 17.,  1., 10.,  0., 23.,  1.,
        2., 25.,  0.])

In [None]:
# Mean of Sachin's score inside each quartile band of India's total:
#   [-inf, q25), [q25, q50), [q50, q75), [q75, +inf)
# Rows 1-3 of `indices` are cumulative (india < q), so the disjoint bands are
# rebuilt here from `qrs` instead of being read from `indices`.
upper = qrs.flatten()      # accepts qrs of shape (4,) or (4, 1); flatten returns a copy
upper[-1] = np.inf         # open-ended top band so the maximum total is not dropped
lower = np.concatenate(([-np.inf], upper[:-1]))
for i, (lo, hi) in enumerate(zip(lower, upper)):
  in_band = (india >= lo) & (india < hi)
  print(i, np.mean(sachin[in_band]))

0 19.672727272727272
1 28.18018018018018
2 31.688622754491018
3 39.799107142857146


In [None]:
#ques7
# Label each position 'Sachin' where sachin is strictly greater than rahul,
# otherwise 'Rahul' (so ties are labelled 'Rahul').
np.where(np.greater(sachin, rahul), 'Sachin', 'Rahul')

array(['Sachin', 'Rahul', 'Rahul', 'Sachin', 'Sachin', 'Rahul', 'Rahul',
       'Sachin', 'Rahul', 'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Sachin',
       'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Rahul', 'Rahul', 'Sachin',
       'Rahul', 'Sachin', 'Sachin', 'Rahul', 'Sachin', 'Sachin', 'Rahul',
       'Sachin', 'Sachin', 'Sachin', 'Sachin', 'Rahul', 'Rahul', 'Sachin',
       'Rahul', 'Rahul', 'Sachin', 'Rahul', 'Sachin', 'Sachin', 'Sachin',
       'Sachin', 'Rahul', 'Sachin', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Sachin', 'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Sachin', 'Sachin', 'Sachin', 'Sachin', 'Rahul', 'Sachin', 'Rahul',
       'Rahul', 'Sachin', 'Sachin', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Rahul', 'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Rahul', 'Rahul',
       'Rahul', 'Sachin', 'Sachin', 'Sachin', 'Rahul', 'Sachin', 'Sachin',
       'Sachin', 'Rahul', 'Sachin', 'Sachin', '

In [None]:
#ques8
# Thresholds x to check: 0, 5, 10, ..., 100 (21 values).
x_arr = 5 * np.arange(21)

In [None]:
# Make the thresholds a column (one row per threshold) so they broadcast
# against the 1-D `sachin` array.
x_arr = np.reshape(x_arr, (x_arr.shape[0], 1))

In [None]:
# Row i flags the entries of `sachin` that are at least the i-th threshold in `x_arr`.
indices = np.greater_equal(sachin, x_arr)

In [None]:
# One row per threshold in `x_arr`, one column per entry of `sachin`.
indices.shape

(21, 225)

In [None]:
sachin[indices[1]]  # Sachin's scores that are >= 5 (row 1 corresponds to x = 5)

array([100.,  11.,   8.,  71., 104.,  18.,   8.,  86.,  12.,  85.,  18.,
         7.,  37.,  14.,  21.,  62., 138.,  38.,  46.,  65.,  39.,  48.,
       141.,  62.,  12.,  41.,  11., 186.,  11.,  27.,  27.,  51.,  18.,
        32., 146.,   5.,  45., 141.,  12.,  65.,  27.,   7.,  16.,  28.,
         6., 123., 120.,   7.,  81.,  54., 122.,  14., 100.,  15.,  57.,
        99.,  37.,  38.,  32.,  21.,  32.,  40.,   5.,   8.,   5.,  50.,
        30.,  37.,  89.,  98.,  83.,  93.,  52., 152.,   8.,  93.,  45.,
        26.,  16.,  47.,  89.,  53.,  16.,  81.,  14.,  78.,   6., 105.,
       122.,   9.,   8.,  28.,  35.,  69.,  13.,  97.,  93.,  36.,  39.,
        29.,  12.,  19.,  34., 100.,  44.,  82.,  79.,   6.,   9.,   8.,
        23.,  93.,  35.,  63.,  74.,   8., 117.,  39.,  49.,  64.,  43.,
        72.,   5.,  17.,  65.,  20., 141.,  28.,  44.,  27.,  60.,  68.,
       139.,  31.,  44.,  47.,   6.,  17.,  35.,  88., 114.,   7., 127.,
        45.,  33., 110., 146.,   7.,  25.,   9.,  1

In [None]:
# For every threshold x, print x and how many runs beyond x Sachin makes on
# average in the innings where he reached at least x.
for i, reached in enumerate(indices):
  x = x_arr[i, 0]
  # x is subtracted from the mean to report the MORE runs rather than the TOTAL runs
  print(x, np.mean(sachin[reached]) - x)

0 39.87555555555556
5 45.61363636363637
10 47.48026315789474
15 47.45255474452555
20 46.824
25 44.10084033613445
30 45.13461538461539
35 43.24742268041237
40 44.05882352941177
45 43.41558441558442
50 43.98529411764706
55 42.317460317460316
60 38.67213114754098
65 37.654545454545456
70 37.08163265306122
75 34.347826086956516
80 30.75
85 28.650000000000006
90 27.400000000000006
95 26.433333333333337
100 25.0


In [None]:
#ques9
# Running total of `sachin`: entry k is the sum of the first k+1 scores.
s_cumsum = sachin.cumsum()
s_cumsum

array([ 100.,  111.,  119.,  190.,  294.,  312.,  320.,  406.,  418.,
        503.,  521.,  525.,  532.,  569.,  583.,  583.,  587.,  587.,
        608.,  609.,  671.,  671.,  809.,  847.,  849.,  895.,  960.,
        960.,  999., 1047., 1188., 1250., 1262., 1263., 1304., 1315.,
       1318., 1504., 1515., 1542., 1569., 1620., 1638., 1670., 1816.,
       1821., 1866., 2007., 2019., 2084., 2111., 2118., 2134., 2136.,
       2164., 2170., 2293., 2413., 2420., 2423., 2423., 2504., 2506.,
       2560., 2682., 2686., 2700., 2700., 2800., 2815., 2815., 2872.,
       2971., 3008., 3046., 3078., 3099., 3131., 3171., 3171., 3176.,
       3184., 3189., 3189., 3239., 3269., 3306., 3395., 3399., 3497.,
       3580., 3673., 3673., 3725., 3877., 3878., 3886., 3979., 4024.,
       4050., 4050., 4051., 4051., 4067., 4114., 4203., 4206., 4207.,
       4260., 4276., 4276., 4357., 4371., 4449., 4455., 4560., 4682.,
       4691., 4699., 4727., 4762., 4831., 4844., 4941., 5034., 5036.,
       5072., 5111.,

In [None]:
# Edges 0, 1000, ..., 9000 define nine bands of 1000 runs each.
band_edges = np.arange(0, 10000, 1000)
# The first returned array counts how many entries of the running total fall in
# each band, i.e. the number of matches spent in each 1000-run band.
np.histogram(s_cumsum, bins=band_edges)

(array([29, 18, 26, 25, 26, 26, 23, 22, 30]),
 array([   0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]))