## Introduction

As discussed in the last section, numpy provides a whole range of optimized built-in functions for various mathmatical operations and manipulations. 


### Basic descriptions of data
Numpy provides a number of functions to describe datasets in arrays. 

In [330]:
from numpy import *
CSV_FILE = "/home/asimbanskota/t81_577_data_science/weekly_materials/week6/files/city.csv"
data = genfromtxt(CSV_FILE,delimiter=',', skip_header = 1)


In [62]:
# shape of the data
data.shape

(128, 5)

In [153]:
print(mean(data))
# Ignore nan values while computing mean
print(nanmean(data))

nan
65.19010416666667


In [118]:
print('Mean of column zero: {}'.format(mean(data[:,1])))
print('Std deviation of column zero: {}'.format(std(data[:,1])))
print('Max value in the second column: {}'.format(mean(data[:,1])))
print('Min value in the second column: {}'.format(mean(data[:,1])))

Mean of column zero: 38.8203125
Std deviation of column zero: 5.1802412976949
Max value in the second column: 38.8203125
Min value in the second column: 38.8203125


In [125]:
set_printoptions(precision=2, suppress=True) 
std(data, axis = 0)

array([36.95,  5.18, 15.41,   nan,   nan])

In [128]:
mean(data[:,1:3], axis = 1);

In [27]:
print('Unique values in the first 5 rows of the second columns: {}'.format(unique(data[0:5,1])))

Unique values in the first 5 rows of the second columns: [41. 42. 43. 46.]


In [61]:
unique, counts = unique(data[0:5,1], return_counts=True)
dict(zip(unique, counts))

{41.0: 1, 42.0: 2, 43.0: 1, 46.0: 1}

In [88]:
data_sub = data[:,1] > 40
print(data_sub.all())
print(data_sub.any())

False
True


In [39]:
lst = [2,3,4,4,0,1,1,0,0,0,1,1,1,1,2,2,3]
print(bincount(lst))
# four zeroes, 6 ones, 

[4 6 3 2 2]


In [157]:
bins = array([30, 35, 40, 45, 50]) # increasing monotonically
N,bins = histogram(data[:,1],bins)
print(N, bins)

[24 36 48 14] [30 35 40 45 50]


### Array stacking  and splitting functions

In [312]:
x = data[2:4,1:3]
y = data[4:6,1:3]

In [323]:
x

array([[ 46., 120.],
       [ 42.,  71.]])

In [324]:
y

array([[43., 89.],
       [36., 80.]])

In [315]:
concatenate((x,y))

array([[ 46., 120.],
       [ 42.,  71.],
       [ 43.,  89.],
       [ 36.,  80.]])

In [326]:
vstack((x,y))

array([[ 46., 120.],
       [ 42.,  71.],
       [ 43.,  89.],
       [ 36.,  80.]])

In [325]:
hstack((x,y))

array([[ 46., 120.,  43.,  89.],
       [ 42.,  71.,  36.,  80.]])

In [319]:
split(data[0:10,0:3],2)

[array([[  0.,  41.,  80.],
        [  1.,  42.,  97.],
        [  2.,  46., 120.],
        [  3.,  42.,  71.],
        [  4.,  43.,  89.]]),
 array([[ 5., 36., 80.],
        [ 6., 49., 97.],
        [ 7., 39., 78.],
        [ 8., 34., 77.],
        [ 9., 39., 75.]])]

## Selecting data based upon conditions

In [80]:
# mask will create a boolean array of True and False
mask = data[:,1] > 47

In [75]:
data[mask]

array([[  6.,  49.,  97.,  nan,  nan],
       [ 10.,  48., 103.,  nan,  nan],
       [ 33.,  49., 123.,  nan,  nan],
       [124.,  50., 104.,  nan,  nan]])

In [103]:
# multiple conditions
mask = (x1 > 42 )& (x2 < 77)

In [100]:
data[mask]

array([[ 23.,  43.,  75.,  nan,  nan],
       [ 36.,  43.,  75.,  nan,  nan],
       [ 55.,  43.,  76.,  nan,  nan],
       [106.,  44.,  72.,  nan,  nan],
       [111.,  43.,  72.,  nan,  nan]])

In [101]:
# using all methods for multiple conditions
x1 = data[:,1]
x2 = data[:,2]
data[all([x1 > 42, x2 < 77], axis=0)]

array([[ 23.,  43.,  75.,  nan,  nan],
       [ 36.,  43.,  75.,  nan,  nan],
       [ 55.,  43.,  76.,  nan,  nan],
       [106.,  44.,  72.,  nan,  nan],
       [111.,  43.,  72.,  nan,  nan]])

Where function can be used to create index with True value

In [85]:
indices = where(mask)
indices

(array([  6,  10,  33, 124]),)

In [136]:
#select([data[:,1] >42, data[:,2] <= 77], [data[:,1]*1, data[:,2]*2])
#Return an array drawn from elements in choicelist, depending on conditions.

array([  0.,   0.,  46., 142.,  43.,   0.,  49.,   0., 154., 150.,  48.,
       154.,   0.,   0.,   0.,   0.,   0.,  47.,   0.,   0.,  44.,   0.,
        44.,  43.,   0., 146., 154.,   0.,  46.,   0.,   0.,   0.,   0.,
        49.,  46.,   0.,  43.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0., 148.,  44.,  43.,   0.,   0.,   0.,   0.,   0.,   0.,  47.,
        43.,   0.,   0., 150.,   0.,  44.,   0.,   0.,   0.,   0.,   0.,
       144.,   0.,  47.,   0.,  43.,   0.,   0.,   0.,  44.,   0.,   0.,
         0.,  47., 150.,   0., 146.,   0.,  46.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 150.,   0.,
         0.,   0.,  44.,  44.,   0.,   0.,   0.,  44.,  45.,   0.,  43.,
         0.,  43.,   0., 154.,   0.,   0.,  43.,  44.,   0., 154.,   0.,
         0.,  45.,   0.,  50.,   0., 150.,   0.])

### Basic mathmatical functions

In [106]:
sum(data[:,1])


4969.0

In [164]:
print(fix ([1.2, 3.6]))
print(round_([1.2, 3.6]))
print(ceil ([1.2, 3.6]))
print(floor([1.2, 3.6]))

[1. 3.]
[1. 4.]
[2. 4.]
[1. 3.]


In [166]:
cumsum([1,2,3,4])

array([ 1,  3,  6, 10])

In [170]:
print(add.accumulate([1,2,3,4]))
print(multiply.accumulate([1,2,3,4]))

[ 1  3  6 10]
[ 1  2  6 24]


In [105]:
sort(data[2:7,1])

array([36., 42., 43., 46., 49.])

## Functions from module random

Return random floats from the “continuous uniform” distribution over the interval.[0.0, 1.0)

In [186]:
random(5)

array([0.93, 0.42, 0.95, 0.81, 0.09])

Return random integers between 5 and 10 as an array of size 2 by2

In [328]:
import numpy as np
np.random.randint(5,10, (2,2))

array([[6, 9],
       [5, 6]])

In [183]:
from numpy.random import *
permutation(5) 

array([4, 3, 0, 1, 2])

In [182]:
arr = np.arange(10)
shuffle(arr)
print(arr)

[4 1 3 5 6 2 7 9 8 0]


In [147]:
a = permutation(4) 
for position,value in ndenumerate(a):
    print (position,value) 

(0,) 1
(1,) 0
(2,) 3
(3,) 2


Draw samples from a binomial distribution with specified parameters, n trials and p probability of success 

In [276]:
#n= 5, p = 0.2
binomial(30,0.2,size = 5)

array([ 4,  5, 11,  4,  6])

Draw samples from normal distribution with specified mean and standard deviation

In [279]:
# mean = 2, std = 0.2
normal(2, .2, size = 5)

array([2.03, 1.93, 2.03, 2.06, 2.  ])

## Matrix manipulation  functions



In [300]:
N = np.random.randint(5, size=(2, 4))
print(M)
print(N)

[[1 1 4 2]
 [4 3 4 1]]
[[3 1 4 3]
 [1 2 4 1]]


In [291]:
# Transpose of a matrix
M.T

array([[1, 4],
       [1, 3],
       [4, 4],
       [2, 1]])

In [303]:
# inverse of a matrix
np.linalg.inv(np.random.randint(5, size=(2, 2)))

array([[ 0.36, -0.09],
       [-0.09,  0.27]])

Element-wise array-array operations: the default behaviour is element-wise operations:

In [305]:
M*N

array([[ 3,  1, 16,  6],
       [ 4,  6, 16,  1]])

In [306]:
M + N

array([[4, 2, 8, 5],
       [5, 5, 8, 2]])

Dot Product

In [308]:
dot(M,N.T)

array([[26, 21],
       [34, 27]])