# 헷갈리는 문법 정리하기

## map, apply, applymap

In [2]:
import pandas as pd

# Toy frame for the map / apply / applymap notes: three integer columns.
# The first key used to be 'a ' (with a trailing space), which made
# df1['a'] raise KeyError; the column is now named plain 'a'.
data = {
    'a': [1, 3, 7, 4],
    'b': [17, 86, 52, 68],
    'c': [134, 874, 592, 246],
}

df1 = pd.DataFrame(data)
df1

Unnamed: 0,a,b,c
0,1,17,134
1,3,86,874
2,7,52,592
3,4,68,246


### 1. map 함수
* 반드시 Series 타입에서만 사용
* Series = value + index

In [None]:
def f1(x):
    """Return x formatted as a decimal string, zero-padded to at least 3 characters."""
    return '%03d' % x

## * np.sum: axis / keepdims

In [1]:
import numpy as np

# 4x5 grid holding 0..19 in row-major order. np.arange already returns an
# ndarray, so the former np.array(...) wrapper only made a needless copy.
a = np.arange(20).reshape(4, 5)
a

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [2]:
# axis=0 adds down the rows, leaving one total per column; the summed axis is dropped.
np.sum(a, axis= 0) # result shape = (5,)

array([30, 34, 38, 42, 46])

In [3]:
# keepdims=True retains the summed axis with length 1, so the result stays 2-D.
np.sum(a, axis=0, keepdims= True) # result shape = (1, 5)

array([[30, 34, 38, 42, 46]])

## * array / asarray

In [18]:
# 3x4 array filled with 1.0; it is the source array for the asarray/array comparison.
arr = np.ones(shape=(3, 4))
arr

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [19]:
arrB = np.asarray(arr) # no copy: arr is already an ndarray, so arrB refers to the same array
arrC= np.array(arr) # np.array copies by default, so arrC is an independent copy

In [20]:
# Overwrite the whole second row in place; the scalar pi is broadcast across the row.
arr[1]= np.pi
arr

array([[1.        , 1.        , 1.        , 1.        ],
       [3.14159265, 3.14159265, 3.14159265, 3.14159265],
       [1.        , 1.        , 1.        , 1.        ]])

In [21]:
arrB # reflects the in-place change to arr, because np.asarray did not copy

array([[1.        , 1.        , 1.        , 1.        ],
       [3.14159265, 3.14159265, 3.14159265, 3.14159265],
       [1.        , 1.        , 1.        , 1.        ]])

In [22]:
arrC # unchanged: np.array made a separate copy before arr was modified

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

## * append / extend
* append :  목록 끝에 '단일 요소'로 추가한다. => 목록 자체의 길이는 1씩 증가
            개체가 숫자, 문자, 다른 목록 또는 기타 무엇이든 관계없이 끝에 추가된다.
* extend :  iterable 에서 '요소 목록'을 추가하여 목록을 확장한다. => 목록 길이는 iterable 인수의 요소 개수만큼 증가

### append

In [27]:
# append() stores its argument as ONE element, so the list [4, 5] ends up nested.
x = list(range(1, 4))
x.append([4, 5])
x

[1, 2, 3, [4, 5]]

In [28]:
# A second append: the two-string list is again stored as a single nested element.
x.append(['very', 'good'])
x

[1, 2, 3, [4, 5], ['very', 'good']]

### extend

In [29]:
# extend() walks the iterable and adds each item separately, so the list stays flat.
x = list(range(1, 4))
x.extend([4, 5])
x

[1, 2, 3, 4, 5]

In [30]:
# A second extend: both strings are added as separate top-level elements.
x.extend(['very', 'good'])
x

[1, 2, 3, 4, 5, 'very', 'good']

## * multiply / matmul / dot

In [42]:
# Two 2x3 integer matrices: a holds 1..6 and b holds 7..12, filled row by row.
a = np.arange(1, 7).reshape(2, 3)
b = np.arange(7, 13).reshape(2, 3)
# Show both operands plus b transposed (3x2), the right-hand operand of the
# matrix products computed later.
print(a, '\n')
print(b, '\n')
print(b.T)

[[1 2 3]
 [4 5 6]] 

[[ 7  8  9]
 [10 11 12]] 

[[ 7 10]
 [ 8 11]
 [ 9 12]]


### multiply: 원소별(element-wise) 곱. 두 배열의 shape이 같거나 브로드캐스팅이 가능할 때 사용할 수 있다.

In [43]:
# Element-wise product: each entry of a is multiplied by the entry of b at the same position.
np.multiply(a,b)

array([[ 7, 16, 27],
       [40, 55, 72]])

### matmul: 행렬 곱. 스칼라 인수는 허용하지 않는다 (1차원 이상의 배열만 가능)

In [45]:
# Matrix product of a (2x3) with b.T (3x2), giving a 2x2 result.
np.matmul(a,b.T)

array([[ 50,  68],
       [122, 167]])

### dot: 스칼라로 곱해도 된다.

In [46]:
# For these 2-D operands np.dot computes the same 2x2 matrix product as np.matmul.
np.dot(a,b.T)

array([[ 50,  68],
       [122, 167]])

### tensorflow.keras로 구현(3D이어야 한다.)

## * stack(), unstack()

In [80]:
import numpy as np
import pandas as pd

In [102]:
# Two-level row index: customer id (outer level) and year (inner level).
# Note that the years are strings, not integers.
mul_index = pd.MultiIndex.from_tuples(
    [
        ('cust_1', '2015'),
        ('cust_1', '2016'),
        ('cust_2', '2015'),
        ('cust_2', '2016'),
    ]
)

# 4x4 integer table holding 0..15 row by row: one row per (customer, year),
# one column per product.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=mul_index,
    columns=['prd_1', 'prd_2', 'prd_3', 'prd_4'],
    dtype='int',
)

print(mul_index)
print(data)

MultiIndex([('cust_1', '2015'),
            ('cust_1', '2016'),
            ('cust_2', '2015'),
            ('cust_2', '2016')],
           )
             prd_1  prd_2  prd_3  prd_4
cust_1 2015      0      1      2      3
       2016      4      5      6      7
cust_2 2015      8      9     10     11
       2016     12     13     14     15


In [88]:
# stack(): pivot the column labels (prd_1..prd_4) into a new innermost index level.
data_stacked = data.stack()
data_stacked

cust_1  2015  prd_1     0
              prd_2     1
              prd_3     2
              prd_4     3
        2016  prd_1     4
              prd_2     5
              prd_3     6
              prd_4     7
cust_2  2015  prd_1     8
              prd_2     9
              prd_3    10
              prd_4    11
        2016  prd_1    12
              prd_2    13
              prd_3    14
              prd_4    15
dtype: int32

In [93]:
# DataFrame.stack() -> returns Series
data_stacked.index

MultiIndex([('cust_1', '2015', 'prd_1'),
            ('cust_1', '2015', 'prd_2'),
            ('cust_1', '2015', 'prd_3'),
            ('cust_1', '2015', 'prd_4'),
            ('cust_1', '2016', 'prd_1'),
            ('cust_1', '2016', 'prd_2'),
            ('cust_1', '2016', 'prd_3'),
            ('cust_1', '2016', 'prd_4'),
            ('cust_2', '2015', 'prd_1'),
            ('cust_2', '2015', 'prd_2'),
            ('cust_2', '2015', 'prd_3'),
            ('cust_2', '2015', 'prd_4'),
            ('cust_2', '2016', 'prd_1'),
            ('cust_2', '2016', 'prd_2'),
            ('cust_2', '2016', 'prd_3'),
            ('cust_2', '2016', 'prd_4')],
           )

In [92]:
# Fetch one value with a single full-key .loc lookup (customer, year, product)
# instead of three chained [] lookups, each of which first builds an
# intermediate Series.
data_stacked.loc[('cust_1', '2015', 'prd_1')]

0

#### stack(dropna=True, False)
결측값이 있는 데이터셋을 stack()할 때 결측값을 제거할지(dropna=True), 아니면 결측값을 NaN으로 유지할지(dropna=False)를 지정한다.


In [103]:
data

Unnamed: 0,Unnamed: 1,prd_1,prd_2,prd_3,prd_4
cust_1,2015,0,1,2,3
cust_1,2016,4,5,6,7
cust_2,2015,8,9,10,11
cust_2,2016,12,13,14,15


In [104]:
# prd_4 is an integer column and NaN is a float, so convert the column to
# float explicitly first. Recent pandas versions warn about (and plan to
# reject) assignments that silently upcast a column's dtype.
data['prd_4'] = data['prd_4'].astype(float)
# 'cust_2' selects both of its year rows, so prd_4 becomes NaN for 2015 and 2016.
data.loc['cust_2', 'prd_4'] = np.nan
data

Unnamed: 0,Unnamed: 1,prd_1,prd_2,prd_3,prd_4
cust_1,2015,0,1,2,3.0
cust_1,2016,4,5,6,7.0
cust_2,2015,8,9,10,
cust_2,2016,12,13,14,


In [105]:
# dropna=False keeps the entries whose value is NaN in the stacked result.
# NOTE(review): newer pandas releases are reworking stack(); confirm that the
# `dropna` argument is still accepted by the pandas version in use.
data.stack(dropna=False)

cust_1  2015  prd_1     0.0
              prd_2     1.0
              prd_3     2.0
              prd_4     3.0
        2016  prd_1     4.0
              prd_2     5.0
              prd_3     6.0
              prd_4     7.0
cust_2  2015  prd_1     8.0
              prd_2     9.0
              prd_3    10.0
              prd_4     NaN
        2016  prd_1    12.0
              prd_2    13.0
              prd_3    14.0
              prd_4     NaN
dtype: float64

In [106]:
# dropna=True drops the NaN entries, so the two cust_2 / prd_4 rows are absent.
# NOTE(review): newer pandas releases are reworking stack(); confirm that the
# `dropna` argument is still accepted by the pandas version in use.
data.stack(dropna=True)

cust_1  2015  prd_1     0.0
              prd_2     1.0
              prd_3     2.0
              prd_4     3.0
        2016  prd_1     4.0
              prd_2     5.0
              prd_3     6.0
              prd_4     7.0
cust_2  2015  prd_1     8.0
              prd_2     9.0
              prd_3    10.0
        2016  prd_1    12.0
              prd_2    13.0
              prd_3    14.0
dtype: float64

#### * unstack(level=-1,0,1  fill_value=None)

In [107]:
data_stacked

cust_1  2015  prd_1     0
              prd_2     1
              prd_3     2
              prd_4     3
        2016  prd_1     4
              prd_2     5
              prd_3     6
              prd_4     7
cust_2  2015  prd_1     8
              prd_2     9
              prd_3    10
              prd_4    11
        2016  prd_1    12
              prd_2    13
              prd_3    14
              prd_4    15
dtype: int32

In [111]:
# level=-1 moves the innermost index level (prd_*) back to the columns, undoing stack().
data_stacked.unstack(level=-1)

Unnamed: 0,Unnamed: 1,prd_1,prd_2,prd_3,prd_4
cust_1,2015,0,1,2,3
cust_1,2016,4,5,6,7
cust_2,2015,8,9,10,11
cust_2,2016,12,13,14,15


In [113]:
# level=0 moves the outermost index level (cust_*) to the columns.
data_stacked.unstack(level=0)

Unnamed: 0,Unnamed: 1,cust_1,cust_2
2015,prd_1,0,8
2015,prd_2,1,9
2015,prd_3,2,10
2015,prd_4,3,11
2016,prd_1,4,12
2016,prd_2,5,13
2016,prd_3,6,14
2016,prd_4,7,15


In [114]:
# level=1 moves the middle index level (the year) to the columns.
data_stacked.unstack(level=1)

Unnamed: 0,Unnamed: 1,2015,2016
cust_1,prd_1,0,4
cust_1,prd_2,1,5
cust_1,prd_3,2,6
cust_1,prd_4,3,7
cust_2,prd_1,8,12
cust_2,prd_2,9,13
cust_2,prd_3,10,14
cust_2,prd_4,11,15


In [115]:
# Keep the wide (unstacked) frame in a variable so it can be reshaped further.
data_stacked_unstacked = data_stacked.unstack(level=-1)
data_stacked_unstacked

Unnamed: 0,Unnamed: 1,prd_1,prd_2,prd_3,prd_4
cust_1,2015,0,1,2,3
cust_1,2016,4,5,6,7
cust_2,2015,8,9,10,11
cust_2,2016,12,13,14,15


In [117]:
# reset_index() turns both index levels into ordinary columns; the levels are
# unnamed, so pandas labels the new columns level_0 and level_1.
data_stacked_unstacked_df = data_stacked_unstacked.reset_index()
data_stacked_unstacked_df

Unnamed: 0,level_0,level_1,prd_1,prd_2,prd_3,prd_4
0,cust_1,2015,0,1,2,3
1,cust_1,2016,4,5,6,7
2,cust_2,2015,8,9,10,11
3,cust_2,2016,12,13,14,15


In [125]:
# Give the auto-generated columns meaningful names. rename() returns a new
# DataFrame; here the result is only displayed, not assigned back.
data_stacked_unstacked_df.rename(columns ={'level_0':'custID','level_1':'year'})

Unnamed: 0,custID,year,prd_1,prd_2,prd_3,prd_4
0,cust_1,2015,0,1,2,3
1,cust_1,2016,4,5,6,7
2,cust_2,2015,8,9,10,11
3,cust_2,2016,12,13,14,15


## * loc / iloc

In [3]:
# 3x3 demo frame: columns A, B, C hold 1..9 row by row, with the default 0..2 index.
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"])
df

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


In [8]:
# Select one row: iloc is position-based and loc is label-based. With the
# default 0..n-1 index, position 0 and label 0 refer to the same row.
display(df.iloc[0])
display(df.loc[0])

# Select one column: by label through loc, or with plain [] indexing.
display(df.loc[:, 'A'])
display(df['A'])

# Select a single cell: pass the row and column labels in ONE loc call
# rather than chaining loc[0]['B'], which builds an intermediate row Series.
display(df.loc[0, 'B'])


A    1
B    2
C    3
Name: 0, dtype: int64

A    1
B    2
C    3
Name: 0, dtype: int64

0    1
1    4
2    7
Name: A, dtype: int64

0    1
1    4
2    7
Name: A, dtype: int64

2

In [9]:
df

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


In [11]:
# loc 사용해 새로운 column 추가하기
for idx in range(len(df)):
    df.loc[idx,'D'] = df.loc[idx,'A']*10 + df.loc[idx,'B'] + df.loc[idx,'C']
df

Unnamed: 0,A,B,C,D
0,1,2,3,15.0
1,4,5,6,51.0
2,7,8,9,87.0


In [13]:
# iloc 사용해 새로운 column 추가하기
for idx in range(len(df)):
    df.loc[idx,'E'] = df.iloc[idx,0]*10 + df.iloc[idx,1] + df.iloc[idx,2]
df

Unnamed: 0,A,B,C,D,E
0,1,2,3,15.0,15.0
1,4,5,6,51.0,51.0
2,7,8,9,87.0,87.0


In [15]:
# Add column F from a Series. The values are strings ('5', '6', '7'), not integers.
# NOTE(review): Series assignment aligns on index labels; this relies on df and
# the new Series both having the default 0..2 index — confirm if df's index changes.
df.loc[:, 'F'] = pd.Series(['5', '6', '7'])
df

Unnamed: 0,A,B,C,D,E,F
0,1,2,3,15.0,15.0,5
1,4,5,6,51.0,51.0,6
2,7,8,9,87.0,87.0,7


## * merge 다시하기
https://nittaku.tistory.com/121

In [75]:
import pandas as pd
# Right table: one row per distinct key (a, b, c).
df2 = pd.DataFrame(
    {
        "data2": range(3),
        "key": list('abc'),
    }
)
# Left table: seven rows whose keys repeat (b x3, a x3, c x1).
df1 = pd.DataFrame(
    {
        'data1': range(7),
        'key': list('bbacaab'),
    }
)
print(df1, '\n')
print(df2)

   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b 

   data2 key
0      0   a
1      1   b
2      2   c


In [77]:
# Join on the shared 'key' column. No `how` is given, so merge uses its default inner join.
pd.merge(df1, df2, on = 'key')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0
6,3,c,2


In [78]:
# how='outer' keeps keys found in either frame. Every key here exists in both
# frames, so this returns the same rows as the inner join.
pd.merge(df1, df2, on='key', how='outer')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0
6,3,c,2


* item()
* value
* map
* counter
* pd.Series
* vstack / hstack
* iloc / loc