<a href="https://colab.research.google.com/github/kgpark88/bigdata/blob/main/code/Pandas_Operation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 연산(Operation)

In [1]:
import pandas as pd
# Sample frame used throughout the notebook: a group key, a numeric column,
# and a two-letter code column. Rows 2 and 3 are exact duplicates.
df_one = pd.DataFrame(
    dict(
        k1=list('AABBCC'),
        col1=[100, 200, 300, 300, 400, 500],
        col2=['NY', 'CA', 'WA', 'WA', 'AK', 'NV'],
    )
)


In [2]:
df_one

Unnamed: 0,k1,col1,col2
0,A,100,NY
1,A,200,CA
2,B,300,WA
3,B,300,WA
4,C,400,AK
5,C,500,NV


### Unique Value

In [3]:
df_one['col2'].unique()

array(['NY', 'CA', 'WA', 'AK', 'NV'], dtype=object)

In [4]:
df_one['col2'].nunique()

5

In [5]:
df_one['col2'].value_counts()

WA    2
NY    1
CA    1
AK    1
NV    1
Name: col2, dtype: int64

In [6]:
df_one

Unnamed: 0,k1,col1,col2
0,A,100,NY
1,A,200,CA
2,B,300,WA
3,B,300,WA
4,C,400,AK
5,C,500,NV


In [7]:
df_one.drop_duplicates()

Unnamed: 0,k1,col1,col2
0,A,100,NY
1,A,200,CA
2,B,300,WA
4,C,400,AK
5,C,500,NV


### Column 생성

In [8]:
df_one

Unnamed: 0,k1,col1,col2
0,A,100,NY
1,A,200,CA
2,B,300,WA
3,B,300,WA
4,C,400,AK
5,C,500,NV


In [9]:
# Vectorized arithmetic: every value of 'col1' is multiplied by 10 and
# stored as a new column.
df_one['New Col'] = df_one['col1'].mul(10)

In [10]:
df_one

Unnamed: 0,k1,col1,col2,New Col
0,A,100,NY,1000
1,A,200,CA,2000
2,B,300,WA,3000
3,B,300,WA,3000
4,C,400,AK,4000
5,C,500,NV,5000


### Custom function 사용

In [11]:
def grab_first_letter(state: str) -> str:
    """Return the first character of ``state``.

    Args:
        state: A string, e.g. a two-letter code such as ``'NY'``.

    Returns:
        The first character of ``state``, or an empty string when
        ``state`` is empty.
    """
    # Slicing (instead of indexing with [0]) avoids an IndexError on ''.
    return state[:1]

In [12]:
grab_first_letter('NY')

'N'

In [13]:
df_one['col2'].apply(grab_first_letter)

0    N
1    C
2    W
3    W
4    A
5    N
Name: col2, dtype: object

In [14]:
# Run the custom function on each value of 'col2' and keep the result
# as a new column.
df_one['first letter'] = df_one['col2'].apply(grab_first_letter)

In [15]:
df_one

Unnamed: 0,k1,col1,col2,New Col,first letter
0,A,100,NY,1000,N
1,A,200,CA,2000,C
2,B,300,WA,3000,W
3,B,300,WA,3000,W
4,C,400,AK,4000,A
5,C,500,NV,5000,N


In [16]:
def complex_letter(state: str) -> str:
    """Label ``state`` as "Washington" when it begins with "W".

    Only the first character is inspected, so any value starting with
    "W" is labeled "Washington".

    Args:
        state: A string, e.g. a two-letter code such as ``'WA'``.

    Returns:
        ``"Washington"`` if ``state`` starts with ``"W"``, otherwise
        ``'Error'`` (including for an empty string).
    """
    # startswith() is safe on an empty string, unlike state[0].
    if state.startswith("W"):
        return "Washington"
    return 'Error'

In [17]:
# Run complex_letter on each value of 'col2' and keep the resulting
# labels as a new column.
df_one['State Check'] = df_one['col2'].apply(complex_letter)

In [18]:
df_one

Unnamed: 0,k1,col1,col2,New Col,first letter,State Check
0,A,100,NY,1000,N,Error
1,A,200,CA,2000,C,Error
2,B,300,WA,3000,W,Washington
3,B,300,WA,3000,W,Washington
4,C,400,AK,4000,A,Error
5,C,500,NV,5000,N,Error


### 맵핑(Mapping)

In [19]:
df_one['k1']

0    A
1    A
2    B
3    B
4    C
5    C
Name: k1, dtype: object

In [20]:
df_one['k1'].map({'A':1,'B':2,'C':3})

0    1
1    1
2    2
3    2
4    3
5    3
Name: k1, dtype: int64

### 최댓값, 최솟값 인덱스 위치 구하기

In [21]:
df_one

Unnamed: 0,k1,col1,col2,New Col,first letter,State Check
0,A,100,NY,1000,N,Error
1,A,200,CA,2000,C,Error
2,B,300,WA,3000,W,Washington
3,B,300,WA,3000,W,Washington
4,C,400,AK,4000,A,Error
5,C,500,NV,5000,N,Error


In [22]:
df_one['col1'].max()

500

In [23]:
df_one['col1'].min()

100

In [24]:
df_one['col1'].idxmin()

0

In [25]:
df_one['col1'].idxmax()

5

### column, index 이름 구하기

In [26]:
df_one.columns

Index(['k1', 'col1', 'col2', 'New Col', 'first letter', 'State Check'], dtype='object')

In [27]:
df_one.index

RangeIndex(start=0, stop=6, step=1)

In [28]:
# Replace all six column labels positionally with C1 .. C6.
df_one.columns = [f'C{position}' for position in range(1, 7)]

In [29]:
df_one

Unnamed: 0,C1,C2,C3,C4,C5,C6
0,A,100,NY,1000,N,Error
1,A,200,CA,2000,C,Error
2,B,300,WA,3000,W,Washington
3,B,300,WA,3000,W,Washington
4,C,400,AK,4000,A,Error
5,C,500,NV,5000,N,Error


### DataFrame 정렬하기

In [30]:
df_one

Unnamed: 0,C1,C2,C3,C4,C5,C6
0,A,100,NY,1000,N,Error
1,A,200,CA,2000,C,Error
2,B,300,WA,3000,W,Washington
3,B,300,WA,3000,W,Washington
4,C,400,AK,4000,A,Error
5,C,500,NV,5000,N,Error


In [31]:
df_one.sort_values('C3')

Unnamed: 0,C1,C2,C3,C4,C5,C6
4,C,400,AK,4000,A,Error
1,A,200,CA,2000,C,Error
5,C,500,NV,5000,N,Error
0,A,100,NY,1000,N,Error
2,B,300,WA,3000,W,Washington
3,B,300,WA,3000,W,Washington


# DataFrame 연결하기(Concatenating)

In [32]:
# Two frames with the same five-row default index: one with columns
# 'A' and 'B', the other with a single 'pred' column.
features = pd.DataFrame(
    dict(
        A=[100, 200, 300, 400, 500],
        B=[12, 13, 14, 15, 16],
    )
)
predictions = pd.DataFrame(dict(pred=[0, 1, 1, 0, 1]))

In [33]:
features

Unnamed: 0,A,B
0,100,12
1,200,13
2,300,14
3,400,15
4,500,16


In [34]:
predictions

Unnamed: 0,pred
0,0
1,1
2,1
3,0
4,1


In [35]:
# Mind the axis parameter: the default axis=0 stacks the frames row-wise,
# and columns missing from a frame are filled with NaN.
pd.concat([features, predictions], axis=0)

Unnamed: 0,A,B,pred
0,100.0,12.0,
1,200.0,13.0,
2,300.0,14.0,
3,400.0,15.0,
4,500.0,16.0,
0,,,0.0
1,,,1.0
2,,,1.0
3,,,0.0
4,,,1.0


In [36]:
# Column-wise concatenation places the frames side by side, with rows
# aligned on the index.
pd.concat([features, predictions], axis='columns')

Unnamed: 0,A,B,pred
0,100,12,0
1,200,13,1
2,300,14,1
3,400,15,0
4,500,16,1


## Dummy Variable 생성

In [37]:
df_one

Unnamed: 0,C1,C2,C3,C4,C5,C6
0,A,100,NY,1000,N,Error
1,A,200,CA,2000,C,Error
2,B,300,WA,3000,W,Washington
3,B,300,WA,3000,W,Washington
4,C,400,AK,4000,A,Error
5,C,500,NV,5000,N,Error


In [38]:
df_one['C1']

0    A
1    A
2    B
3    B
4    C
5    C
Name: C1, dtype: object

In [39]:
# One-hot encode 'C1'. dtype=int keeps the 0/1 indicator columns;
# pandas >= 2.0 would otherwise return booleans (True/False).
pd.get_dummies(df_one['C1'], dtype=int)

Unnamed: 0,A,B,C
0,1,0,0
1,1,0,0
2,0,1,0
3,0,1,0
4,0,0,1
5,0,0,1
