#  강의안4 Introduction to Numpy

In [1]:
import numpy as np

## 1. Numpy 데이터 객체의 특성

### Numpy 데이터 객체는 built-in 리스트와 사칙연산이 다르게 진행된다.

In [2]:
a = [1,2,3]              # Python built-in list
b = [4,5,6]
a

[1, 2, 3]

In [3]:
type(a)

list

In [4]:
a_arr = np.array(a)    # NumPy ndarray
b_arr = np.array(b)
a_arr

array([1, 2, 3])

In [5]:
type(a_arr)

numpy.ndarray

In [6]:
a + b           # on built-in lists, + concatenates instead of adding element-wise

[1, 2, 3, 4, 5, 6]

In [7]:
a_arr + b_arr    # Numpy array는 사칙연산이 수행된다.

array([5, 7, 9])

### 동일한 연산에 대해서 Numpy는 빠르게 처리한다.

In [8]:
my_list = list(range(1000000))
%time my_list2 = [x *2 for x in my_list]    # 큰 데이터를 2배 곱하는 연산의 시간 측정

Wall time: 86.9 ms


In [9]:
my_arr = np.arange(1000000)
%time my_arr2 = my_arr * 2

Wall time: 1.02 ms


In [10]:
import sys

In [11]:
sys.getsizeof(my_list2)

8697464

In [12]:
sys.getsizeof(my_arr2)

4000104

## 2.  Numpy 데이터 객체의 생성

### Numpy array() 함수를 이용한  ndarray 생성

In [13]:
x1 = np.array([1, 4, 2, 5, 3])    # create a Numpy ndarray from a list
x1

array([1, 4, 2, 5, 3])

In [14]:
type(x1)

numpy.ndarray

In [15]:
x1.shape

(5,)

In [16]:
x1.dtype

dtype('int32')

In [17]:
x2 = np.array([3.14, 4, 2, 3])   # Numpy upcast if possible
x2

array([3.14, 4.  , 2.  , 3.  ])

In [18]:
x2.dtype

dtype('float64')

In [19]:
L = [[1,2,3,4], [5,6,7,8], [9,10,11,12]]
x3 = np.array(L)     # create a two-dimensional ndarray from a nested list
x3

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

In [20]:
type(x3)

numpy.ndarray

In [21]:
x3.shape

(3, 4)

In [22]:
x3.dtype

dtype('int32')

In [23]:
x4 = np.array((1, 2, 3, 4))
x4

array([1, 2, 3, 4])

In [24]:
type(x4)

numpy.ndarray

In [25]:
x5 = np.array(range(5))
x5

array([0, 1, 2, 3, 4])

### 다양한 Numpy 함수들을 이용한 ndarray 생성

In [26]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [27]:
np.arange(0, 20, 2)     # Create an array filled with a linear sequence starting at 0, stopping before 20, stepping by 2

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [28]:
np.zeros(10, dtype=int)    # create a length 10 array filled with 0s

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [29]:
np.ones((3,5), dtype=float)     # create 3*5 floating point array filled with 1s

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [30]:
np.full((3,5), 3.14)     # Create 3*5 array filled with 3.14

array([[3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14]])

In [31]:
np.linspace(0, 1)     # Create an array of evenly spaced values between 0 and 1 (num defaults to 50, endpoint included)

array([0.        , 0.02040816, 0.04081633, 0.06122449, 0.08163265,
       0.10204082, 0.12244898, 0.14285714, 0.16326531, 0.18367347,
       0.20408163, 0.2244898 , 0.24489796, 0.26530612, 0.28571429,
       0.30612245, 0.32653061, 0.34693878, 0.36734694, 0.3877551 ,
       0.40816327, 0.42857143, 0.44897959, 0.46938776, 0.48979592,
       0.51020408, 0.53061224, 0.55102041, 0.57142857, 0.59183673,
       0.6122449 , 0.63265306, 0.65306122, 0.67346939, 0.69387755,
       0.71428571, 0.73469388, 0.75510204, 0.7755102 , 0.79591837,
       0.81632653, 0.83673469, 0.85714286, 0.87755102, 0.89795918,
       0.91836735, 0.93877551, 0.95918367, 0.97959184, 1.        ])

### Numpy.random 모듈을 이용한 random number 생성

In [32]:
np.random.rand(5)          # 0과 1사이의 값을 uniform distribution에 기반하여 난수를 생성함

array([0.57902772, 0.83837153, 0.58999887, 0.47442819, 0.47650277])

In [33]:
np.random.rand(3, 5)

array([[0.83602312, 0.28250089, 0.66419645, 0.90301302, 0.11972268],
       [0.27557147, 0.31210702, 0.68370273, 0.18891928, 0.96199474],
       [0.13738643, 0.85195559, 0.70355715, 0.58120047, 0.82099765]])

In [34]:
np.random.randn(5)        # 표준정규분포를 기반으로 난수를 생성함

array([-0.87228317, -0.23599298,  0.87953207, -0.36580809, -0.11363592])

In [35]:
np.random.randn(3,5)

array([[ 0.55652648,  0.42856831, -1.58462753, -0.89425104,  0.81905353],
       [-0.37806773, -0.02382474, -0.07109298, -0.02927363,  0.26683465],
       [ 0.14190477, -0.35016768,  0.47301108,  0.11350103,  0.29076102]])

In [36]:
np.random.normal(50,20,(3,3))    # Create a 3*3 array of normally distributed random values (mean 50, std 20)

array([[77.86308295, 69.79129843, 30.1939419 ],
       [27.32365235, 45.64165805, 75.13676352],
       [30.69230107, 46.80191053, 38.56397093]])

In [37]:
np.random.randint(0, 10, (3,3))    # Create a 3*3 array of random integers in the interval [0, 10)

array([[8, 0, 5],
       [1, 2, 9],
       [9, 9, 3]])

## 3. Numpy Arrays 기본 작업 (manipulation)

### Attributes of arrays:  size, shape, memory consumption and data types 파악

In [38]:
np.random.seed(0)   # seed for reproducibility
x1 = np.random.randint(10, size=6)         # one-dimensional array
x2 = np.random.randint(10, size=(3,4))     # two-dimensional array
x3 = np.random.randint(10, size=(3,4,5))   # three-dimensional array

In [39]:
x1

array([5, 0, 3, 3, 7, 9])

In [40]:
x2

array([[3, 5, 2, 4],
       [7, 6, 8, 8],
       [1, 6, 7, 7]])

In [41]:
x3

array([[[8, 1, 5, 9, 8],
        [9, 4, 3, 0, 3],
        [5, 0, 2, 3, 8],
        [1, 3, 3, 3, 7]],

       [[0, 1, 9, 9, 0],
        [4, 7, 3, 2, 7],
        [2, 0, 0, 4, 5],
        [5, 6, 8, 4, 1]],

       [[4, 9, 8, 1, 1],
        [7, 9, 9, 3, 6],
        [7, 2, 0, 3, 5],
        [9, 4, 4, 6, 4]]])

In [42]:
print("x3 ndim: ", x3.ndim)     # number of dimensions of x3
print("x3 shape: ", x3.shape)   # length of x3 along each dimension
print("x3 size: ", x3.size)     # total number of elements in x3
print("x3 dtype: ", x3.dtype)   # data type of the elements of x3

x3 ndim:  3
x3 shape:  (3, 4, 5)
x3 size:  60
x3 dtype:  int32


In [43]:
x2.shape

(3, 4)

### array Indexing : Accessing Single elements

In [44]:
print(x1[0])        # single dimensional ndarray의 indexing
print(x1[-1])
print(x2[0,0])      # two dimensional ndarray의 indexing
print(x2[2,0])

5
9
3
1


In [45]:
x1[0] = 99      # Numpy element값의 변경
x1

array([99,  0,  3,  3,  7,  9])

### array slicing: accessing subarrays

In [46]:
x1[1:3]     # [start:stop): start is included, stop is excluded

array([0, 3])

In [47]:
x1[:3]

array([99,  0,  3])

In [48]:
x1[0:5:2]     #slicing을 [start:stop:step] 형식으로 지정한다.

array([99,  3,  7])

In [49]:
x2

array([[3, 5, 2, 4],
       [7, 6, 8, 8],
       [1, 6, 7, 7]])

In [50]:
x2[:2, :3]     # give one slice per dimension, separated by commas

array([[3, 5, 2],
       [7, 6, 8]])

In [51]:
x2[:, 0]      # first column of x2

array([3, 7, 1])

In [52]:
x2[0, :]     # first row of x2

array([3, 5, 2, 4])

In [53]:
x2[0]      # equivalent to x2[0,:]

array([3, 5, 2, 4])

#### slicing된 부분에 새로운 값을 할당하는 경우

In [54]:
x2[1:3, :2] = 0    # assign a new value to the sliced region (the scalar is broadcast)
x2

array([[3, 5, 2, 4],
       [0, 0, 8, 8],
       [0, 0, 7, 7]])

In [55]:
x2_sub = x2[:2, :2]    # binding a slice to a new name gives a view of the same memory, not a copy
x2_sub

array([[3, 5],
       [0, 0]])

In [56]:
x2_sub[0,0] = 99
x2_sub

array([[99,  5],
       [ 0,  0]])

In [57]:
x2

array([[99,  5,  2,  4],
       [ 0,  0,  8,  8],
       [ 0,  0,  7,  7]])

In [58]:
x2_sub_copy = x2[:2, :2].copy()   # copy() 메서드를 이용하면 별도의 변수로 지정이 된다.
x2_sub_copy

array([[99,  5],
       [ 0,  0]])

In [59]:
x2_sub_copy[0,0] = 42

In [60]:
x2_sub_copy

array([[42,  5],
       [ 0,  0]])

In [61]:
x2

array([[99,  5,  2,  4],
       [ 0,  0,  8,  8],
       [ 0,  0,  7,  7]])

built-in 리스트에서도 할당(`b = a`)을 하면 copy가 만들어지지 않고, 이름은 다르지만 동일한 객체(동일한 memory 주소)를 가리킨다.

In [62]:
a = [1, 2, 3]
b = a
b[0] = 99
b

[99, 2, 3]

In [63]:
a

[99, 2, 3]

In [64]:
id(a)

1961039093960

In [65]:
id(b)

1961039093960

In [66]:
b = a.copy()
b[0] = 44
b

[44, 2, 3]

In [67]:
a

[99, 2, 3]

In [68]:
id(b)

1961039148552

In [69]:
id(a)

1961039093960

### Boolean Selection

In [70]:
data = np.arange(10)
data

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [71]:
data[data > 5]

array([6, 7, 8, 9])

In [73]:
data > 5

array([False, False, False, False, False, False,  True,  True,  True,
        True])

### Reshaping of Arrays

In [74]:
grid = np.arange(1,10).reshape((3,3))
grid

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [75]:
x = np.array([1,2,3])
x

array([1, 2, 3])

In [77]:
y = x.reshape((3,1))
y

array([[1],
       [2],
       [3]])

In [78]:
x.shape

(3,)

In [79]:
x.ndim

1

In [80]:
y.shape

(3, 1)

In [81]:
y.ndim

2

In [82]:
x = np.array([[1,2,3]])
x

array([[1, 2, 3]])

In [83]:
x.shape

(1, 3)

In [84]:
x.ndim

2

In [85]:
x = np.array([1,2,3])
x

array([1, 2, 3])

In [86]:
x.shape

(3,)

In [87]:
x.ndim

1

In [88]:
y = x.reshape((1,3))
y

array([[1, 2, 3]])

In [89]:
y.shape

(1, 3)

In [90]:
y.ndim

2

### Array Concatenation and Splitting

In [91]:
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])
np.concatenate((x, y))  

array([1, 2, 3, 4, 5, 6])

In [92]:
z = np.array([99, 99, 99])
np.concatenate((x, y, z))

array([ 1,  2,  3,  4,  5,  6, 99, 99, 99])

In [93]:
np.vstack((x, y))

array([[1, 2, 3],
       [4, 5, 6]])

2차원 Numpy array의 결합

In [94]:
grid = np.array([[1,2,3],
                [4,5,6]])
grid

array([[1, 2, 3],
       [4, 5, 6]])

In [95]:
np.concatenate([grid, grid])     # concatenate along the first axis (down the rows)

array([[1, 2, 3],
       [4, 5, 6],
       [1, 2, 3],
       [4, 5, 6]])

In [96]:
np.concatenate([grid, grid], axis=1)    # concatenate along the second axis (across the columns)

array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

In [97]:
np.concatenate([grid, grid], axis = None)

array([1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6])

#### array 분할 (split)

In [98]:
x = np.array([1,2,3,99,99,99, 3,2,1])
np.split(x, 3)

[array([1, 2, 3]), array([99, 99, 99]), array([3, 2, 1])]

In [99]:
x = np.array([1,2,3,99,99,99,3,2,1])
np.split(x, [3,5])       # split before index 3 and before index 5

[array([1, 2, 3]), array([99, 99]), array([99,  3,  2,  1])]

In [100]:
x = np.arange(10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [102]:
# np.split(x, 3)    # left disabled: 10 elements cannot be split into 3 equal parts, so this raises ValueError

In [103]:
np.split(x, [3,5])

[array([0, 1, 2]), array([3, 4]), array([5, 6, 7, 8, 9])]

In [104]:
grid = np.arange(16).reshape((4,4))
grid

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [105]:
np.split(grid, 2)

[array([[0, 1, 2, 3],
        [4, 5, 6, 7]]),
 array([[ 8,  9, 10, 11],
        [12, 13, 14, 15]])]

In [106]:
upper, lower = np.vsplit(grid, 2)

In [107]:
upper

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [108]:
lower

array([[ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [109]:
left, right = np.hsplit(grid, 2)

In [110]:
left

array([[ 0,  1],
       [ 4,  5],
       [ 8,  9],
       [12, 13]])

In [111]:
np.split(grid, 2, axis=1)      # axis를 지정하면 row 또는 column을 지정할 수 있다.

[array([[ 0,  1],
        [ 4,  5],
        [ 8,  9],
        [12, 13]]),
 array([[ 2,  3],
        [ 6,  7],
        [10, 11],
        [14, 15]])]

## 4. Computation on Numpy Arrays

### 수식을 이용한 Numpy vectorized operation 

#### reciprocal (1/x) 값 구하기

In [112]:
np.random.seed(0)

def compute_reciprocals(values):
    """Return the reciprocal (1 / v) of every element of ``values``.

    Implemented on purpose as an explicit Python loop that handles one
    element per iteration, so that it can be timed against NumPy's
    vectorized ``1 / values``.

    Parameters
    ----------
    values : sequence of numbers
        Must support ``len()`` and integer indexing.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of ``len(values)`` entries (np.empty's default
        dtype) whose ``k``-th entry is ``1.0 / values[k]``.
    """
    count = len(values)
    # Uninitialized buffer; every slot is overwritten in the loop below.
    reciprocals = np.empty(count)
    for position in range(count):
        reciprocals[position] = 1.0 / values[position]
    return reciprocals

values = np.random.randint(1,10, size=5)
values

array([6, 1, 4, 4, 8])

In [113]:
compute_reciprocals(values)

array([0.16666667, 1.        , 0.25      , 0.25      , 0.125     ])

In [114]:
1 / values              #  element-wise arithmetic using NumPy vectorized operations

array([0.16666667, 1.        , 0.25      , 0.25      , 0.125     ])

### Numpy vectorization 방식은 매우 빠르다

In [115]:
big_array = np.random.randint(1,100, size=1000000)   
%time compute_reciprocals(big_array)      # 큰 파일을 이용하여 처리시간을 비교한다. (엄청 걸린다~~~)

Wall time: 2.94 s


array([0.1       , 0.01190476, 0.04545455, ..., 0.01428571, 0.01098901,
       0.01149425])

In [116]:
%time (1.0 / big_array)      # 빨리 종료하지요(!!!)

Wall time: 3.82 ms


array([0.1       , 0.01190476, 0.04545455, ..., 0.01428571, 0.01098901,
       0.01149425])

### Numpy array 수리 연산

In [117]:
x = np.arange(4)
x

array([0, 1, 2, 3])

In [118]:
x + 5    #  vectorized operation with arithmetic operators

array([5, 6, 7, 8])

In [119]:
np.add(x, 10)    # NumPy function equivalent of the + operator

array([10, 11, 12, 13])

In [120]:
y = [0,1,2,3]     # a plain list, not a NumPy array: arithmetic with a scalar raises an error
# y + 5

In [121]:
x - 5     # np.subtract()

array([-5, -4, -3, -2])

In [122]:
x * 5     # np.multiply()

array([ 0,  5, 10, 15])

In [123]:
x / 5     # np.divide()

array([0. , 0.2, 0.4, 0.6])

In [124]:
x ** 2     # np.power()

array([0, 1, 4, 9], dtype=int32)

In [125]:
x // 2     # np.floor_divide

array([0, 0, 1, 1], dtype=int32)

In [126]:
x % 2     # np.mod()

array([0, 1, 0, 1], dtype=int32)

In [127]:
-(0.5*x + 1) ** 2

array([-1.  , -2.25, -4.  , -6.25])

In [128]:
x1 = np.arange(5)
x1

array([0, 1, 2, 3, 4])

In [129]:
x2 = np.arange(1,6)
x2

array([1, 2, 3, 4, 5])

In [130]:
x1 + x2      # vectorized operation between two arrays

array([1, 3, 5, 7, 9])

In [131]:
x1 * x2

array([ 0,  2,  6, 12, 20])

In [132]:
x3 = np.arange(1,5)
x3

array([1, 2, 3, 4])

In [134]:
# x1 + x3      # element 숫자가 맞지 않으면 error가 발생한다

### Numpy Universal functions

In [135]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [136]:
np.sqrt(arr)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

In [137]:
np.square(arr)

array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81], dtype=int32)

In [138]:
arr ** 2

array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81], dtype=int32)

In [139]:
np.sum(arr)

45

In [140]:
np.mean(arr)

4.5

In [141]:
np.var(arr)

8.25

In [142]:
np.max(arr)

9

In [143]:
sum(arr)           # Python built-in 함수를 이용해도 동일한 결과를 얻는다.

45

#### Python built-in 함수를 사용하는 것 보다 더 빠르게 처리된다.

In [144]:
big_array = np.random.rand(1000000)

In [145]:
%time sum(big_array)     # with built-in Python sum()

Wall time: 116 ms


500222.47258859215

In [146]:
%time np.sum(big_array)    # with Numpy sum() universal function

Wall time: 998 µs


500222.4725885841

In [147]:
%time min(big_array)

Wall time: 81.4 ms


7.071203171893359e-07

In [148]:
%time np.min(big_array)

Wall time: 979 µs


7.071203171893359e-07

### Numpy universal function vs. method 사용방식

In [149]:
arr = np.arange(0,12).reshape(3,4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [150]:
arr.sum()     # method 방식을 이용함

66

In [151]:
np.sum(arr)    # universal function sum() 함수가 수행된다.

66

In [152]:
arr.mean()

5.5

In [153]:
np.mean(arr)

5.5

### axis를 지정하여 연산을 수행하는 경우

In [154]:
np.sum(arr, axis=0)    # down the rows

array([12, 15, 18, 21])

In [155]:
np.sum(arr, axis=1)   # across the columns

array([ 6, 22, 38])

In [156]:
np.sum(arr, axis=None)

66

In [157]:
arr.min(axis=0)       # find the minimum value within each column by specifying axis=0

array([0, 1, 2, 3])

In [158]:
np.min(arr, axis=0)

array([0, 1, 2, 3])

In [159]:
arr.min()          # axis가 지정되지 않으면 default값은 None으로 사용된다

0

In [160]:
np.min(arr, axis=1)   # find the minimum value within each row by specifying axis=1

array([0, 4, 8])

## 5. Computation on Arrays: Broadcasting

In [161]:
a = np.array([0, 1, 2])
b = np.array([5, 5, 5])
a + b

array([5, 6, 7])

In [162]:
a + 5

array([5, 6, 7])

In [163]:
arr = np.ones((3,3))
arr

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [164]:
arr + a

array([[1., 2., 3.],
       [1., 2., 3.],
       [1., 2., 3.]])

In [165]:
a = np.arange(3)
b = np.arange(3).reshape((3,1))
print(a)
print(b)
print(a.shape)
print(b.shape)

[0 1 2]
[[0]
 [1]
 [2]]
(3,)
(3, 1)


In [166]:
a + b

array([[0, 1, 2],
       [1, 2, 3],
       [2, 3, 4]])

In [167]:
c = np.array([0,1,2,3])
c

array([0, 1, 2, 3])

In [168]:
# a + c    # R과 다른 방식으로 처리됨

ValueError: operands could not be broadcast together with shapes (3,) (4,) 

In [169]:
b + c

array([[0, 1, 2, 3],
       [1, 2, 3, 4],
       [2, 3, 4, 5]])

In [1]:
import numpy as np

In [3]:
a_arr= np.arange(6)
a_arr

array([0, 1, 2, 3, 4, 5])

In [4]:
b_arr = np.arange(6,10)
b_arr

array([6, 7, 8, 9])

In [5]:
a_arr + b_arr

ValueError: operands could not be broadcast together with shapes (6,) (4,) 