In [1]:
import pandas as pd
import numpy as np

In [6]:
# Build a 5x2 frame of uniform random values in [0, 1) with columns "a" and "b".
# No seed is set in the visible cells, so the values differ between runs.
random_values = np.random.rand(5, 2)
df = pd.DataFrame(random_values, columns=["a", "b"])
df

Unnamed: 0,a,b
0,0.247765,0.155888
1,0.281655,0.276437
2,0.294538,0.729204
3,0.138539,0.989076
4,0.998932,0.878285


In [7]:
# Element-wise "less than" on column "a": yields one boolean per row.
df['a'].lt(0.5)

0     True
1     True
2     True
3     True
4    False
Name: a, dtype: bool

In [8]:
# Keep only the rows where the boolean mask on column "a" is True.
df.loc[df['a'] < 0.5]

Unnamed: 0,a,b
0,0.247765,0.155888
1,0.281655,0.276437
2,0.294538,0.729204
3,0.138539,0.989076


In [10]:
# Combine two row masks with element-wise AND, then index with the result.
a_is_small = df['a'] < 0.5
b_is_large = df['b'] > 0.3
df[a_is_small & b_is_large]

Unnamed: 0,a,b
2,0.294538,0.729204
3,0.138539,0.989076


In [11]:
# Same two-condition row filter, written as a query expression string.
row_condition = 'a < 0.5 and b > 0.3'
df.query(row_condition)

Unnamed: 0,a,b
2,0.294538,0.729204
3,0.138539,0.989076


In [15]:
# Single-column frame holding the integers 0..4 under the name "num".
df = pd.DataFrame({'num': np.arange(5)})
df

Unnamed: 0,num
0,0
1,1
2,2
3,3
4,4


In [17]:
def square(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

# Call square() on each element of the "num" column.
num_column = df['num']
num_column.apply(square)

0     0
1     1
2     4
3     9
4    16
Name: num, dtype: int64

In [20]:
# Same squaring as a lambda, stored as a new "square" column.
df['square'] = df['num'].apply(lambda value: value ** 2)
df

Unnamed: 0,num,square
0,0,0
1,1,1
2,2,4
3,3,9
4,4,16


In [21]:
# Sample phone numbers written in mixed notations: ASCII digits, Korean digit
# syllables, and "-" or "." as separators.
raw_phones = [
    "010-1234-1235",
    "공일공-일이삼사-1235",
    "010.1234.일이삼오",
    "공1공-1234.1이3오",
]
# Build the frame in one constructor call rather than enlarging it one row at a
# time with .loc, which reallocates the frame on every assignment.
df = pd.DataFrame({"phone": raw_phones})
# Empty placeholder column for the cleaned-up numbers.
df["preprocess_phone"] = ''

In [22]:
# Display the frame: raw phone strings next to the still-empty preprocess_phone column.
df

Unnamed: 0,phone,preprocess_phone
0,010-1234-1235,
1,공일공-일이삼사-1235,
2,010.1234.일이삼오,
3,공1공-1234.1이3오,


In [32]:
# Single-character substitution table: Korean digit syllables map to ASCII
# digits, and the "-" / "." separators map to the empty string (removed).
# Built once here instead of being rebuilt on every call.
_PHONE_CHAR_TABLE = str.maketrans({
    "공" : "0",
    "일" : "1",
    "이" : "2",
    "삼" : "3",
    "사" : "4",
    "오" : "5",
    "육" : "6",
    "칠" : "7",
    "팔" : "8",
    "구" : "9",
    "-" : "",
    "." : "",
})


def get_preprocess_phone(phone):
    """Normalize a phone-number string to plain ASCII digits.

    Korean digit syllables are replaced by the corresponding ASCII digit and
    the "-" and "." separators are dropped. All other characters are kept
    unchanged.

    Args:
        phone: The phone number as a ``str``.

    Returns:
        A new ``str`` with the substitutions applied.
    """
    # Every key is one character and no replacement value is itself a key, so
    # one translate() pass gives the same result as replacing each key in turn.
    return phone.translate(_PHONE_CHAR_TABLE)

In [34]:
# apply() calls get_preprocess_phone on every value of "phone"; the resulting
# Series is stored as the "preprocess_phone" column.
cleaned_phones = df['phone'].apply(get_preprocess_phone)
df['preprocess_phone'] = cleaned_phones
df

Unnamed: 0,phone,preprocess_phone
0,010-1234-1235,01012341235
1,공일공-일이삼사-1235,01012341235
2,010.1234.일이삼오,01012341235
3,공1공-1234.1이3오,01012341235


In [47]:
# Six rows: the keys a, b, c repeated twice, plus two identical 0..5 value columns.
df = pd.DataFrame({
    "key": list("abcabc"),
    "data1": range(6),
    "data2": range(6),
})
df

Unnamed: 0,key,data1,data2
0,a,0,0
1,b,1,1
2,c,2,2
3,a,3,3
4,b,4,4
5,c,5,5


In [49]:
# Split the rows by "key", then add up every remaining column within each group.
grouped_by_key = df.groupby("key")
grouped_by_key.sum()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3,3
b,5,5
c,7,7


In [50]:
# Per-key average of every remaining column.
grouped_by_key = df.groupby("key")
grouped_by_key.mean()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.5,1.5
b,2.5,2.5
c,3.5,3.5


In [52]:
# Group on two columns at once; the result is indexed by (key, data1) pairs.
group_columns = ["key", "data1"]
df.groupby(group_columns).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key,data1,Unnamed: 2_level_1
a,0,0
a,3,3
b,1,1
b,4,4
c,2,2
c,5,5


In [53]:
# Display the full, ungrouped frame again.
df

Unnamed: 0,key,data1,data2
0,a,0,0
1,b,1,1
2,c,2,2
3,a,3,3
4,b,4,4
5,c,5,5


In [56]:
# Compute several statistics per key in one call; each data column gets one
# result sub-column per statistic.
# The reducers are given as string names: passing np.median or the builtin max
# as callables raises a FutureWarning on pandas >= 2.1, and the resulting
# column labels ("min", "median", "max") are the same either way.
df.groupby('key').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,0,1.5,3,0,1.5,3
b,1,2.5,4,1,2.5,4
c,2,3.5,5,2,3.5,5


In [60]:
# A dict selects a different statistic per column: minimum of data1, sum of data2.
# "sum" is passed by name because handing np.sum to aggregate raises a
# FutureWarning on pandas >= 2.1.
df.groupby('key').aggregate({'data1': 'min', 'data2': 'sum'})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,3
b,1,5
c,2,7


In [61]:
def filter_by_mean(x, threshold=3):
    """Tell whether a group's mean ``data2`` value is above ``threshold``.

    Written as a predicate for ``groupby(...).filter``, which calls it once
    per group and forwards extra keyword arguments to it.

    Args:
        x: Object supporting ``x['data2'].mean()``, e.g. one group's DataFrame.
        threshold: Value the mean must strictly exceed. Defaults to 3, the
            cut-off that was previously hard-coded.

    Returns:
        The boolean result of ``mean > threshold``.
    """
    return x['data2'].mean() > threshold

In [62]:
# Show the per-key means (the statistic that filter_by_mean tests for data2).
per_key = df.groupby('key')
per_key.mean()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.5,1.5
b,2.5,2.5
c,3.5,3.5


In [65]:
# filter() keeps the original rows of every group for which the predicate is True.
per_key = df.groupby('key')
per_key.filter(filter_by_mean)

Unnamed: 0,key,data1,data2
2,c,2,2
5,c,5,5


In [67]:
# Per-key range (max - min) of each data column.
# The numeric columns are selected explicitly so the lambda never receives the
# string "key" column: subtracting strings is a TypeError, and whether apply()
# passes the grouping column to the function differs between pandas versions.
df.groupby('key')[['data1', 'data2']].apply(lambda x: x.max() - x.min())

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3,3
b,3,3
c,3,3


In [69]:
# Two parallel label lists produce a two-level row index: (A, 1), (A, 2), (B, 1), (B, 2).
row_labels = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
df = pd.DataFrame(
    np.random.randn(4, 2),
    index=row_labels,
    columns=['data1', 'data2'],
)

In [70]:
# Display the frame built with the two-level row index.
df

Unnamed: 0,Unnamed: 1,data1,data2
A,1,0.651973,-1.544354
A,2,0.866896,0.10443
B,1,0.578769,0.907317
B,2,-0.759708,0.550579


In [73]:
# Two parallel label lists produce two-level column labels:
# (a, 1), (a, 2), (b, 1), (b, 2).
column_labels = [['a', 'a', 'b', 'b'], ['1', '2', '1', '2']]
df = pd.DataFrame(np.random.randn(4, 4), columns=column_labels)
df

Unnamed: 0_level_0,a,a,b,b
Unnamed: 0_level_1,1,2,1,2
0,0.257907,0.837836,0.182025,0.994192
1,-0.313646,1.879702,0.059095,0.75477
2,0.353064,-0.692317,0.314631,-0.475662
3,-0.567823,0.18996,1.311492,-0.455026


In [75]:
# Two-step selection on the two-level columns: the outer label 'a' gives the
# sub-frame with columns '1' and '2', then '1' picks a single Series.
outer_a = df['a']
outer_a['1']

0    0.257907
1   -0.313646
2    0.353064
3   -0.567823
Name: 1, dtype: float64