# Numpy

Lists are slower while working as they store Hetrogeneous data so we use numpy arrays to process data faster.
<br>
Numpy stores n-d arrays in vectorised form to benefit form the DRAM locality.
Enable easy file save and load of n-d arrays.
<br>
Efficienty process without type-checking overhead and also enables other package to use numpy arrays as an efficient data interface.
<br>
Efficiently Broadcast operations across dimensions and provide implementations for many functions across linear algebra and statistics.

In [1]:
N = 10

In [2]:
%%time
list_ = list(range(N))
list_ = map(lambda x:x*x, list_)

Wall time: 0 ns


In [3]:
import numpy as np

In [4]:
%%time
arr = np.arange(N)
arr = arr * arr

Wall time: 0 ns


In [5]:
%%time
arr = np.arange(N)
arr = np.sum(arr)

Wall time: 0 ns


## Creating np arrays

In [6]:
arr = np.arange(5)
print(arr, type(arr))

[0 1 2 3 4] <class 'numpy.ndarray'>


In [7]:
arr = np.array([0,2,4,6])
print(arr, type(arr))

[0 2 4 6] <class 'numpy.ndarray'>


In [8]:
arr.dtype # data type of individual element of array

dtype('int32')

In [9]:
arr.ndim # no. of dimentions in an array

1

In [10]:
arr.shape # shape of the array

(4,)

In [11]:
arr.size # size of array

4

In [12]:
arr.itemsize # size required to store each item of an array

4

In [13]:
arr2d = np.array([[2,3,4],[1.2,3,4]])

In [14]:
arr2d

array([[2. , 3. , 4. ],
       [1.2, 3. , 4. ]])

In [15]:
arr2d.dtype

dtype('float64')

In [16]:
arr2d.ndim


2

In [17]:
arr2d.shape

(2, 3)

In [18]:
arr2d.size

6

In [19]:
arr2d.itemsize

8

In [20]:
arr3d = np.array([
                [
                    [1,2,4],
                    [4,5,6]
                ],
                [
                    [2,6,7],
                    [3,6,7]
                ]]
)

In [21]:
arr3d.shape

(2, 2, 3)

In [22]:
arr3d.size

12

In [23]:
arr3d.ndim

3

In [24]:
arr3d.dtype

dtype('int32')

In [25]:
arr3d

array([[[1, 2, 4],
        [4, 5, 6]],

       [[2, 6, 7],
        [3, 6, 7]]])

In [26]:
np.ones((2,3,4)) # To create array containing one's

array([[[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]],

       [[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]]])

In [27]:
12*np.ones((2,3,4)) # 12 scale is broadcasted to each element of array

array([[[12., 12., 12., 12.],
        [12., 12., 12., 12.],
        [12., 12., 12., 12.]],

       [[12., 12., 12., 12.],
        [12., 12., 12., 12.],
        [12., 12., 12., 12.]]])

In [28]:
np.zeros((4,2)) # To create array containing zero's (shape as argument)

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [29]:
np.random.randn(3,2,3)  # .randn() is used to give a series of random standardise numbers(normalised) with mean = 0 and std. var = 1 (shape as argument)

array([[[ 1.01796272, -0.63272476, -1.30131685],
        [-1.23073509,  1.31036729, -1.01147876]],

       [[-0.13007568, -1.76864223,  0.47384567],
        [ 0.67863252, -0.7897129 , -0.04968516]],

       [[ 1.46117004,  0.94211513, -0.64250905],
        [-0.80112449,  1.45494943,  0.39480778]]])

In [30]:
np.random.rand(10,2) # .rand() to give a series of numbers b/w 0 and 1 (shape as argument)

array([[0.92700568, 0.86350683],
       [0.41185958, 0.83868222],
       [0.22311033, 0.99091846],
       [0.05810124, 0.80645077],
       [0.6778284 , 0.96214919],
       [0.40506094, 0.40914725],
       [0.26155866, 0.75806037],
       [0.52233395, 0.48020457],
       [0.13892393, 0.26628097],
       [0.66680441, 0.7416787 ]])

In [31]:
np.random.randint(0, 100, (2,3)) # argument(min, max, shape)

array([[ 0, 40,  9],
       [13,  5,  1]])

In [32]:
np.arange(7,71,7)

array([ 7, 14, 21, 28, 35, 42, 49, 56, 63, 70])

In [33]:
np.linspace(7,70,10) # (last argument is number of vales between [7,70])

array([ 7., 14., 21., 28., 35., 42., 49., 56., 63., 70.])

In [34]:
str_arr = np.array(['1.2','2.3','3.4'])
print(str_arr, str_arr.dtype)

['1.2' '2.3' '3.4'] <U3


In [35]:
arr = np.array(str_arr, dtype = 'float')
print(arr, arr.dtype)
arr

[1.2 2.3 3.4] float64


array([1.2, 2.3, 3.4])

## Indexing

In [36]:
arr3d = np.array([
                [
                    [1,2,4],
                    [4,5,6]
                ],
                [
                    [2,6,7],
                    [3,6,7]
                ]]
)
arr3d

array([[[1, 2, 4],
        [4, 5, 6]],

       [[2, 6, 7],
        [3, 6, 7]]])

In [37]:
arr3d[0,0,0]

1

In [38]:
arr3d[0,:,:]

array([[1, 2, 4],
       [4, 5, 6]])

In [39]:
arr3d[:,:,1:3]

array([[[2, 4],
        [5, 6]],

       [[6, 7],
        [6, 7]]])

In [40]:
arr3d % 2 == 0 # broadcasting one operation to every element of array

array([[[False,  True,  True],
        [ True, False,  True]],

       [[ True,  True, False],
        [False,  True, False]]])

In [41]:
arr3d[arr3d % 2 == 0] # To print all the values which are even

array([2, 4, 4, 6, 2, 6, 6])

In [42]:
arr3d[arr3d == 2]

array([2, 2])

In [43]:
arr3d[(arr3d % 2 == 0) & (arr3d > 2)]

array([4, 4, 6, 6, 6])

### Copy array

In [44]:
arr_slice = arr3d[:,:,2]
arr_slice

array([[4, 6],
       [7, 7]])

In [45]:
arr_slice[0,0] = 1729
arr_slice

array([[1729,    6],
       [   7,    7]])

In [46]:
arr3d

array([[[   1,    2, 1729],
        [   4,    5,    6]],

       [[   2,    6,    7],
        [   3,    6,    7]]])

### Shallow Copy

In [47]:
arr_2 = np.copy(arr3d[:,:,1])
arr_2

array([[2, 5],
       [6, 6]])

In [48]:
arr_2[1,1] = 1234
arr_2

array([[   2,    5],
       [   6, 1234]])

In [49]:
arr3d

array([[[   1,    2, 1729],
        [   4,    5,    6]],

       [[   2,    6,    7],
        [   3,    6,    7]]])

### Indexing continued 

In [50]:
arr = np.random.randint(0,10,(5)) #arguments min, max, size as a tuple
arr

array([1, 6, 8, 2, 5])

In [51]:
my_indices = [1,2,3,4]
arr[my_indices]

array([6, 8, 2, 5])

## Numpy Operations

In [52]:
arr1 = np.random.rand(3,4) # size not as a tuple
arr2 = np.random.rand(3,4)
print(arr1)
print(arr2)

[[0.65223973 0.05348373 0.27258263 0.69524148]
 [0.0325733  0.09549388 0.33414554 0.26607447]
 [0.92324627 0.85810328 0.8977601  0.64173859]]
[[0.33570252 0.74933449 0.20017982 0.9942403 ]
 [0.09179656 0.8208057  0.54735329 0.21942041]
 [0.72712977 0.57234937 0.08849597 0.43876362]]


In [53]:
arr1 + arr2

array([[0.98794226, 0.80281822, 0.47276245, 1.68948178],
       [0.12436986, 0.91629958, 0.88149883, 0.48549488],
       [1.65037604, 1.43045265, 0.98625607, 1.08050221]])

In [54]:
arr1 - arr2

array([[ 0.31653721, -0.69585076,  0.0724028 , -0.29899882],
       [-0.05922326, -0.72531182, -0.21320775,  0.04665407],
       [ 0.1961165 ,  0.28575391,  0.80926412,  0.20297498]])

In [55]:
arr1 * arr2

array([[0.21895852, 0.0400772 , 0.05456554, 0.6912371 ],
       [0.00299012, 0.07838192, 0.18289566, 0.05838217],
       [0.67131985, 0.49113487, 0.07944815, 0.28157155]])

In [56]:
arr1 / arr2

array([[ 1.94290982,  0.07137497,  1.36168882,  0.69926906],
       [ 0.35484229,  0.11634164,  0.61047507,  1.21262411],
       [ 1.26971321,  1.49926482, 10.14464331,  1.46260667]])

In [57]:
arr1 % arr2

array([[0.31653721, 0.05348373, 0.0724028 , 0.69524148],
       [0.0325733 , 0.09549388, 0.33414554, 0.04665407],
       [0.1961165 , 0.28575391, 0.01280035, 0.20297498]])

In [58]:
np.exp(arr1) # Exponentiate each value of array

array([[1.91983593, 1.05493983, 1.31335198, 2.00419299],
       [1.03310962, 1.10020209, 1.39674641, 1.30483223],
       [2.51744946, 2.35868269, 2.4541    , 1.89978096]])

In [59]:
np.log(arr1) # log of each value

array([[-0.4273431 , -2.92837782, -1.29981348, -0.36349604],
       [-3.42426229, -2.34869314, -1.09617864, -1.32397903],
       [-0.07985926, -0.15303081, -0.1078524 , -0.44357423]])

In [60]:
np.sin(arr1)

array([[0.6069679 , 0.05345823, 0.26921961, 0.64057089],
       [0.03256754, 0.09534881, 0.32796209, 0.26294608],
       [0.79756408, 0.75660371, 0.7819326 , 0.59858906]])

In [61]:
np.cos(arr1)

array([[0.79472635, 0.99857009, 0.96307881, 0.76789904],
       [0.99946954, 0.99544392, 0.94469089, 0.96481053],
       [0.60323423, 0.65387371, 0.62336298, 0.80105627]])

In [62]:
np.sqrt(arr1)

array([[0.8076136 , 0.23126549, 0.52209446, 0.83381142],
       [0.18048075, 0.30902084, 0.57805323, 0.51582407],
       [0.96085705, 0.92633864, 0.94750203, 0.80108588]])

In [63]:
arr3 = np.zeros((3,4))

In [64]:
arr_inv = 1 / arr3

  """Entry point for launching an IPython kernel.


In [65]:
print(arr_inv)  #inf is infinity

[[inf inf inf inf]
 [inf inf inf inf]
 [inf inf inf inf]]


In [66]:
np.isinf(arr_inv)

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True]])

## Exercise on finding number of points outside n-dimentional sphere 

In [67]:
ndim = 2

In [68]:
npoints = 1000000

In [69]:
points = np.random.rand(npoints, ndim)
points

array([[0.58132103, 0.56595876],
       [0.46715511, 0.95498568],
       [0.02392364, 0.10110189],
       ...,
       [0.34395143, 0.11979308],
       [0.15127111, 0.65683923],
       [0.31189878, 0.5202473 ]])

In [70]:
dfo = np.zeros((npoints, 1))
dfo

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [71]:
%%time
outside_points = 0
for i in range(npoints):
    for j in range(ndim):
        dfo[i] += points[i,j] ** 2
    dfo[i] = np.sqrt(dfo[i])
    if dfo[i] > 1:
        outside_points += 1

print("Fraction of points outside: ", outside_points/npoints)
        

Fraction of points outside:  0.214723
Wall time: 11.6 s


In [72]:
%%time
sq_points = points * points
dfo1 = np.sqrt(np.sum(sq_points, axis = 1))
outside = np.sum(dfo1 > 1)


Wall time: 71.8 ms


In [73]:
print(outside/npoints)

0.214723


In [74]:
%%time
out = np.sum(np.sqrt(np.sum(points * points, axis = 1))>1)/npoints
out

Wall time: 62.8 ms


0.214723

## Broadcasting

In [75]:
arr1 = np.array(range(6))

In [76]:
arr1

array([0, 1, 2, 3, 4, 5])

In [77]:
arr1 = arr1.reshape((3,2))
arr1

array([[0, 1],
       [2, 3],
       [4, 5]])

In [78]:
arr2 = np.array(range(6)).reshape((3,2))
arr2

array([[0, 1],
       [2, 3],
       [4, 5]])

In [79]:
arr2[0].reshape((1,2))

array([[0, 1]])

In [80]:
arr1 + arr2[0].reshape((1,2))

array([[0, 2],
       [2, 4],
       [4, 6]])

In [81]:
arr1 + arr2[0]

array([[0, 2],
       [2, 4],
       [4, 6]])

In [82]:
(arr1 + arr2[0]).T # Transpose

array([[0, 2, 4],
       [2, 4, 6]])

## File Handling

In [83]:
# "G:\Work\DS\Data Files\planets_small.txt"
file = open("planets_small.txt", "r") 


In [84]:
file.readlines()

['\t\t\tMERCURY VENUS   EARTH   MARS    JUPITER SATURN \tURANUS  NEPTUNE PLUTO \n',
 'Mass\t\t0.330\t4.87\t5.97\t0.642\t1898\t568\t\t86.8\t102\t\t0.0146\n',
 'Diameter\t57.9\t108.2\t149.6\t227.9\t778.6\t1433.5\t2872.5\t4495.1\t5906.4\n',
 'DayLength\t4222.6\t2802.0\t24.0\t24.7\t9.9\t\t10.7\t17.2\t16.1\t153.3']

In [85]:
planets_small = np.loadtxt("planets_small.txt")

ValueError: could not convert string to float: 'MERCURY'

In [86]:
planets_small = np.loadtxt("planets_small.txt", skiprows = 1)

ValueError: could not convert string to float: 'Mass'

In [87]:
planets_small = np.loadtxt("planets_small.txt", skiprows = 1, usecols=(1,2,3,4,5,6,7,8,9))

In [88]:
planets_small

array([[3.3000e-01, 4.8700e+00, 5.9700e+00, 6.4200e-01, 1.8980e+03,
        5.6800e+02, 8.6800e+01, 1.0200e+02, 1.4600e-02],
       [5.7900e+01, 1.0820e+02, 1.4960e+02, 2.2790e+02, 7.7860e+02,
        1.4335e+03, 2.8725e+03, 4.4951e+03, 5.9064e+03],
       [4.2226e+03, 2.8020e+03, 2.4000e+01, 2.4700e+01, 9.9000e+00,
        1.0700e+01, 1.7200e+01, 1.6100e+01, 1.5330e+02]])

In [89]:
planets_small.shape

(3, 9)

In [90]:
planets_small = np.loadtxt("planets.txt", skiprows = 1,usecols=(1,2,3,4,5,6,7,8,9))

ValueError: could not convert string to float: 'Unknown'

In [91]:
planets = np.genfromtxt("planets.txt", skip_header = 1, usecols=[1,2,3,4,5,6,7,8,9]) # Generate file from text

In [92]:
planets

array([[ 3.30000e-01,  4.87000e+00,  5.97000e+00,  7.30000e-02,
         6.42000e-01,  1.89800e+03,  5.68000e+02,  8.68000e+01,
         1.02000e+02],
       [ 4.87900e+03,  1.21040e+04,  1.27560e+04,  3.47500e+03,
         6.79200e+03,  1.42984e+05,  1.20536e+05,  5.11180e+04,
         4.95280e+04],
       [ 5.42700e+03,  5.24300e+03,  5.51400e+03,  3.34000e+03,
         3.93300e+03,  1.32600e+03,  6.87000e+02,  1.27100e+03,
         1.63800e+03],
       [ 3.70000e+00,  8.90000e+00,  9.80000e+00,  1.60000e+00,
         3.70000e+00,  2.31000e+01,  9.00000e+00,  8.70000e+00,
         1.10000e+01],
       [ 4.30000e+00,  1.04000e+01,  1.12000e+01,  2.40000e+00,
         5.00000e+00,  5.95000e+01,  3.55000e+01,  2.13000e+01,
         2.35000e+01],
       [ 1.40760e+03, -5.83250e+03,  2.39000e+01,  6.55700e+02,
         2.46000e+01,  9.90000e+00,  1.07000e+01, -1.72000e+01,
         1.61000e+01],
       [ 4.22260e+03,  2.80200e+03,  2.40000e+01,  7.08700e+02,
         2.47000e+01,  9.90000

In [93]:
planets.shape

(20, 9)

In [94]:
np.isnan(planets)

array([[False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, 

In [95]:
planets_new = np.nan_to_num(planets, -1)

In [96]:
planets_new

array([[ 3.30000e-01,  4.87000e+00,  5.97000e+00,  7.30000e-02,
         6.42000e-01,  1.89800e+03,  5.68000e+02,  8.68000e+01,
         1.02000e+02],
       [ 4.87900e+03,  1.21040e+04,  1.27560e+04,  3.47500e+03,
         6.79200e+03,  1.42984e+05,  1.20536e+05,  5.11180e+04,
         4.95280e+04],
       [ 5.42700e+03,  5.24300e+03,  5.51400e+03,  3.34000e+03,
         3.93300e+03,  1.32600e+03,  6.87000e+02,  1.27100e+03,
         1.63800e+03],
       [ 3.70000e+00,  8.90000e+00,  9.80000e+00,  1.60000e+00,
         3.70000e+00,  2.31000e+01,  9.00000e+00,  8.70000e+00,
         1.10000e+01],
       [ 4.30000e+00,  1.04000e+01,  1.12000e+01,  2.40000e+00,
         5.00000e+00,  5.95000e+01,  3.55000e+01,  2.13000e+01,
         2.35000e+01],
       [ 1.40760e+03, -5.83250e+03,  2.39000e+01,  6.55700e+02,
         2.46000e+01,  9.90000e+00,  1.07000e+01, -1.72000e+01,
         1.61000e+01],
       [ 4.22260e+03,  2.80200e+03,  2.40000e+01,  7.08700e+02,
         2.47000e+01,  9.90000

In [97]:
np.isnan(planets_new)

array([[False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, 

In [98]:
np.savetxt('planets_new.txt',planets_new,delimiter=',') #saves in user readable format

In [99]:
np.save('planets_new2',planets_new) # saves in numpy readable format(binary file)

In [100]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [101]:
arr1 = np.random.rand(1000,10)
arr2 = np.random.rand(2000,5)
arr3 = np.random.rand(100,1011)

In [102]:
np.savez('many_arr', arr1, arr2, arr3)

In [103]:
arr = np.load("many_arr.npz")

In [104]:
print(type(arr))

<class 'numpy.lib.npyio.NpzFile'>


In [105]:
arr.files

['arr_0', 'arr_1', 'arr_2']

In [106]:
arr['arr_0'].shape

(1000, 10)

In [107]:
np.savez_compressed("many_array_compressed",arr1,arr2,arr3)

## Stats with NumPy

In [108]:
arr = np.random.rand(100000,)

In [109]:
arr.min

<function ndarray.min>

In [110]:
np.amin(arr) #minimun

8.589898348176916e-07

In [111]:
np.amax(arr) #maximum

0.9999831515890059

In [112]:
np.mean(arr)  #mean

0.4997394264573581

In [113]:
np.var(arr)  #variance

0.08330159399193664

In [114]:
np.std(arr)   #std. deviation

0.2886201552073878

In [115]:
np.median(arr) #median

0.5003580107060761

In [116]:
np.percentile(arr,75) #75th percentile

0.74943165053862

In [117]:
np.mode(arr)

AttributeError: module 'numpy' has no attribute 'mode'

In [118]:
from scipy import stats
stats.mode(arr)  #mode

ModeResult(mode=array([8.58989835e-07]), count=array([1]))

In [119]:
iqr = np.percentile(arr,75) - np.percentile(arr,25) #inter quartile range
iqr

0.49959179908012585

In [120]:
quartiles = np.percentile(arr, [25,50,75,100])
quartiles[2]-quartiles[0]

0.49959179908012585

In [121]:
(arr - np.mean(arr))/np.std(arr) #z_score

array([ 1.41022185, -1.69216492, -0.34405108, ..., -0.07865839,
        1.35570062, -0.37133996])

In [122]:
np.histogram(arr) #histogram

(array([10027, 10087,  9882,  9999,  9964, 10033, 10172,  9873, 10015,
         9948], dtype=int64),
 array([8.58989835e-07, 9.99990882e-02, 1.99997318e-01, 2.99995547e-01,
        3.99993776e-01, 4.99992005e-01, 5.99990235e-01, 6.99988464e-01,
        7.99986693e-01, 8.99984922e-01, 9.99983152e-01]))

In [123]:
np.histogram(arr, bins = 5)

(array([20114, 19881, 19997, 20045, 19963], dtype=int64),
 array([8.58989835e-07, 1.99997318e-01, 3.99993776e-01, 5.99990235e-01,
        7.99986693e-01, 9.99983152e-01]))

In [124]:
np.histogram(arr, bins = [0, 0.25,0.5,0.75,1])

(array([25013, 24947, 25112, 24928], dtype=int64),
 array([0.  , 0.25, 0.5 , 0.75, 1.  ]))

In [125]:
bins = [0, 0.25,0.5,0.75,1] 
np.digitize(arr,bins)      # mark the element that in which bin it falls

array([4, 1, 2, ..., 2, 4, 2], dtype=int64)

In [126]:
arr1 = np.random.randint(50,100,100) # weight
arr2 = np.random.randint(150,185,100) # height
arr3 = np.random.randint(17,22,100) # age

In [127]:
np.concatenate((arr1,arr2,arr3))

array([ 63,  77,  98,  60,  99,  51,  88,  91,  63,  94,  96,  81,  60,
        72,  69,  53,  57,  78,  69,  52,  98,  89,  85,  90,  53,  57,
        96,  51,  74,  61,  50,  67,  84,  56,  93,  58,  72,  78,  86,
        74,  70,  63,  74,  86,  77,  82,  94,  97,  65,  57,  85,  89,
        79,  53,  56,  50,  73,  65,  72,  83,  51,  52,  58,  90,  60,
        51,  57,  72,  64,  92,  53,  60,  59,  92,  71,  71,  86,  51,
        81,  62,  91,  53,  52,  65,  66,  88,  96,  57,  76,  60,  95,
        88,  68,  98,  61,  60,  61,  88,  78,  94, 168, 164, 178, 156,
       174, 152, 158, 152, 162, 167, 174, 171, 161, 170, 182, 156, 157,
       184, 178, 159, 173, 169, 176, 169, 164, 152, 153, 156, 157, 155,
       161, 181, 155, 167, 153, 176, 164, 163, 170, 162, 152, 179, 150,
       170, 162, 176, 171, 181, 177, 158, 161, 167, 151, 171, 168, 159,
       170, 171, 177, 165, 161, 170, 174, 175, 151, 169, 166, 176, 184,
       171, 161, 184, 162, 166, 153, 175, 169, 183, 172, 154, 16

In [128]:
np.concatenate((arr1,arr2,arr3)).shape

(300,)

In [129]:
arr2d = np.vstack((arr1,arr2,arr3))#vertical stack
arr2d

array([[ 63,  77,  98,  60,  99,  51,  88,  91,  63,  94,  96,  81,  60,
         72,  69,  53,  57,  78,  69,  52,  98,  89,  85,  90,  53,  57,
         96,  51,  74,  61,  50,  67,  84,  56,  93,  58,  72,  78,  86,
         74,  70,  63,  74,  86,  77,  82,  94,  97,  65,  57,  85,  89,
         79,  53,  56,  50,  73,  65,  72,  83,  51,  52,  58,  90,  60,
         51,  57,  72,  64,  92,  53,  60,  59,  92,  71,  71,  86,  51,
         81,  62,  91,  53,  52,  65,  66,  88,  96,  57,  76,  60,  95,
         88,  68,  98,  61,  60,  61,  88,  78,  94],
       [168, 164, 178, 156, 174, 152, 158, 152, 162, 167, 174, 171, 161,
        170, 182, 156, 157, 184, 178, 159, 173, 169, 176, 169, 164, 152,
        153, 156, 157, 155, 161, 181, 155, 167, 153, 176, 164, 163, 170,
        162, 152, 179, 150, 170, 162, 176, 171, 181, 177, 158, 161, 167,
        151, 171, 168, 159, 170, 171, 177, 165, 161, 170, 174, 175, 151,
        169, 166, 176, 184, 171, 161, 184, 162, 166, 153, 175, 169, 18

In [130]:
np.amin(arr2d,axis=1) #minimum across columns

array([ 50, 150,  17])

In [131]:
np.cumsum(arr2d) #cumulative sum

array([   63,   140,   238,   298,   397,   448,   536,   627,   690,
         784,   880,   961,  1021,  1093,  1162,  1215,  1272,  1350,
        1419,  1471,  1569,  1658,  1743,  1833,  1886,  1943,  2039,
        2090,  2164,  2225,  2275,  2342,  2426,  2482,  2575,  2633,
        2705,  2783,  2869,  2943,  3013,  3076,  3150,  3236,  3313,
        3395,  3489,  3586,  3651,  3708,  3793,  3882,  3961,  4014,
        4070,  4120,  4193,  4258,  4330,  4413,  4464,  4516,  4574,
        4664,  4724,  4775,  4832,  4904,  4968,  5060,  5113,  5173,
        5232,  5324,  5395,  5466,  5552,  5603,  5684,  5746,  5837,
        5890,  5942,  6007,  6073,  6161,  6257,  6314,  6390,  6450,
        6545,  6633,  6701,  6799,  6860,  6920,  6981,  7069,  7147,
        7241,  7409,  7573,  7751,  7907,  8081,  8233,  8391,  8543,
        8705,  8872,  9046,  9217,  9378,  9548,  9730,  9886, 10043,
       10227, 10405, 10564, 10737, 10906, 11082, 11251, 11415, 11567,
       11720, 11876,

# Case Study

In [132]:
#tsv = tab seperated value


In [133]:
# data = np.loadtxt("G:\Work\DS\Data Files\cric_data-200320-181217.tsv",skiprows = 1)
# data = data[:,[1,2,3]]
data = np.genfromtxt(fname="cric_data-200320-181217.tsv", delimiter="\t", skip_header=1,usecols=[1,2,3])
data.shape


(225, 3)

In [134]:
print("Mean")
print("Sachin"+"\t"+" Rahul"+"\t"+" India")
# np.mean(data,axis=0)
print(round(np.mean(data[:,0]),2),"\t",round(np.mean(data[:,1]),2),"\t",round(np.mean(data[:,2]),2))
np.mean(data,axis=0)

Mean
Sachin	 Rahul	 India
39.88 	 32.06 	 220.8


array([ 39.87555556,  32.06222222, 220.79555556])

In [135]:
print("Median")
print("Sachin"+"\t"+" Rahul"+"\t"+" India")
# np.median(data,axis=0)
print(round(np.median(data[:,0]),2),"\t",round(np.median(data[:,1]),2),"\t",round(np.median(data[:,2]),2))

Median
Sachin	 Rahul	 India
27.0 	 22.0 	 216.0


In [136]:
print("IQR")
print("Sachin"+"\t"+" Rahul"+"\t"+" India")
print(np.percentile(data[:,0],75)-np.percentile(data[:,0],25),"\t",np.percentile(data[:,1],75)-np.percentile(data[:,1],25),"\t",np.percentile(data[:,2],75)-np.percentile(data[:,2],25))

IQR
Sachin	 Rahul	 India
57.0 	 46.0 	 98.0


In [137]:
print("Histogram: Sachin")
print(np.histogram(data[:,0],bins= 10))

Histogram: Sachin
(array([99, 36, 28, 16, 11, 17,  8,  8,  1,  1], dtype=int64), array([  0. ,  18.6,  37.2,  55.8,  74.4,  93. , 111.6, 130.2, 148.8,
       167.4, 186. ]))


In [138]:
print("Mean of every 25 matches played by Sachin")
sachin = data[:,0]
sachin_new = sachin.reshape(9,25)
np.mean(sachin_new,axis=1)

Mean of every 25 matches played by Sachin


array([33.96, 49.4 , 38.48, 40.16, 39.36, 38.2 , 44.6 , 39.52, 35.2 ])

In [139]:
print("Mean of all matches where Sachin scored a century")
sachin = data[:,0]
print(np.mean(sachin[sachin>=100]))

Mean of all matches where Sachin scored a century
125.0


In [140]:
print("Mean of Sachin's score where Rahul has scored less than 10")
Rahul = data[:,1]

print(np.mean(sachin[Rahul<=10]))

Mean of Sachin's score where Rahul has scored less than 10
40.2112676056338


In [141]:
print("Mean of Sachin's score in every Quartile on Indian Team")
arr = np.percentile(data[:,2],[25,50,75,100])
q1 = data[data[:,2]<arr[0]]
print("Q1: ",np.mean(q1[:,0]))
q2 = data[data[:,2]<arr[1]]
print("Q2: ",np.mean(q2[:,0]))
q3 = data[data[:,2]<arr[2]]
print("Q3: ",np.mean(q3[:,0]))
q4 = data[data[:,2]<arr[3]]
print("Q4: ",np.mean(q4[:,0]))
arr

Mean of Sachin's score in every Quartile on Indian Team
Q1:  19.672727272727272
Q2:  28.18018018018018
Q3:  31.688622754491018
Q4:  39.799107142857146


array([175., 216., 273., 499.])

In [142]:
print("Player with highest Score:")
snr = data[:,0:2]
is_rahul_higher = np.argmax(snr, axis = 1)
np.where(is_rahul_higher==0,'Sachin','Rahul')


Player with highest Score:


array(['Sachin', 'Rahul', 'Rahul', 'Sachin', 'Sachin', 'Rahul', 'Rahul',
       'Sachin', 'Rahul', 'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Sachin',
       'Sachin', 'Rahul', 'Rahul', 'Sachin', 'Rahul', 'Rahul', 'Sachin',
       'Rahul', 'Sachin', 'Sachin', 'Rahul', 'Sachin', 'Sachin', 'Sachin',
       'Sachin', 'Sachin', 'Sachin', 'Sachin', 'Rahul', 'Rahul', 'Sachin',
       'Rahul', 'Rahul', 'Sachin', 'Rahul', 'Sachin', 'Sachin', 'Sachin',
       'Sachin', 'Rahul', 'Sachin', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Sachin', 'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Sachin', 'Sachin', 'Sachin', 'Sachin', 'Rahul', 'Sachin', 'Rahul',
       'Rahul', 'Sachin', 'Sachin', 'Rahul', 'Rahul', 'Sachin', 'Rahul',
       'Rahul', 'Sachin', 'Rahul', 'Rahul', 'Rahul', 'Rahul', 'Rahul',
       'Rahul', 'Sachin', 'Sachin', 'Sachin', 'Rahul', 'Sachin', 'Sachin',
       'Sachin', 'Sachin', 'Sachin', 'Sachin'

In [143]:
print("How many more runs did Sachin score on average after having scored x runs.")
x_arr = np.arange(0,101,5)
x_arr = x_arr.reshape(x_arr.shape[0],1)
print(x_arr.shape)

How many more runs did Sachin score on average after having scored x runs.
(21, 1)


In [144]:
indices = (sachin >= x_arr)
indices.shape

(21, 225)

In [145]:
sachin[indices[1,:]]

array([100.,  11.,   8.,  71., 104.,  18.,   8.,  86.,  12.,  85.,  18.,
         7.,  37.,  14.,  21.,  62., 138.,  38.,  46.,  65.,  39.,  48.,
       141.,  62.,  12.,  41.,  11., 186.,  11.,  27.,  27.,  51.,  18.,
        32., 146.,   5.,  45., 141.,  12.,  65.,  27.,   7.,  16.,  28.,
         6., 123., 120.,   7.,  81.,  54., 122.,  14., 100.,  15.,  57.,
        99.,  37.,  38.,  32.,  21.,  32.,  40.,   5.,   8.,   5.,  50.,
        30.,  37.,  89.,  98.,  83.,  93.,  52., 152.,   8.,  93.,  45.,
        26.,  16.,  47.,  89.,  53.,  16.,  81.,  14.,  78.,   6., 105.,
       122.,   9.,   8.,  28.,  35.,  69.,  13.,  97.,  93.,  36.,  39.,
        29.,  12.,  19.,  34., 100.,  44.,  82.,  79.,   6.,   9.,   8.,
        23.,  93.,  35.,  63.,  74.,   8., 117.,  39.,  49.,  64.,  43.,
        72.,   5.,  17.,  65.,  20., 141.,  28.,  44.,  27.,  60.,  68.,
       139.,  31.,  44.,  47.,   6.,  17.,  35.,  88., 114.,   7., 127.,
        45.,  33., 110., 146.,   7.,  25.,   9.,  1

In [146]:
for i in range(x_arr.shape[0]):
    print(x_arr[i,0],np.mean(sachin[indices[i,:]])-x_arr[i,0])

0 39.87555555555556
5 45.61363636363637
10 47.48026315789474
15 47.45255474452555
20 46.824
25 44.10084033613445
30 45.13461538461539
35 43.24742268041237
40 44.05882352941177
45 43.41558441558442
50 43.98529411764706
55 42.317460317460316
60 38.67213114754098
65 37.654545454545456
70 37.08163265306122
75 34.347826086956516
80 30.75
85 28.650000000000006
90 27.400000000000006
95 26.433333333333337
100 25.0


no. of matches it took to complete 1000 runs and next 1000runs

In [147]:
sachin

array([100.,  11.,   8.,  71., 104.,  18.,   8.,  86.,  12.,  85.,  18.,
         4.,   7.,  37.,  14.,   0.,   4.,   0.,  21.,   1.,  62.,   0.,
       138.,  38.,   2.,  46.,  65.,   0.,  39.,  48., 141.,  62.,  12.,
         1.,  41.,  11.,   3., 186.,  11.,  27.,  27.,  51.,  18.,  32.,
       146.,   5.,  45., 141.,  12.,  65.,  27.,   7.,  16.,   2.,  28.,
         6., 123., 120.,   7.,   3.,   0.,  81.,   2.,  54., 122.,   4.,
        14.,   0., 100.,  15.,   0.,  57.,  99.,  37.,  38.,  32.,  21.,
        32.,  40.,   0.,   5.,   8.,   5.,   0.,  50.,  30.,  37.,  89.,
         4.,  98.,  83.,  93.,   0.,  52., 152.,   1.,   8.,  93.,  45.,
        26.,   0.,   1.,   0.,  16.,  47.,  89.,   3.,   1.,  53.,  16.,
         0.,  81.,  14.,  78.,   6., 105., 122.,   9.,   8.,  28.,  35.,
        69.,  13.,  97.,  93.,   2.,  36.,  39.,   2.,  29.,  12.,  19.,
        34.,   2., 100.,  44.,  82.,   0.,  79.,   6.,   9.,   8.,  23.,
        93.,  35.,  63.,  74.,   8., 117.,  39.,  4

In [148]:
sachin_cum_score=sachin.cumsum()
#np.cumsum(sachin)

In [149]:
np.histogram(sachin_cum_score, bins=np.arange(0,10000,1000))

(array([29, 18, 26, 25, 26, 26, 23, 22, 30], dtype=int64),
 array([   0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]))