In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame(np.random.rand(5,2), columns=["A", "B"])
#  5,2 짜리 행렬을 만드는데 컬럼명은 "A"와 "B"로 한다.

print(df)
print()

df["A"] < 0.5
# df의 컬럼 "A" 값 중 0.5 보다 작은 값만 True

          A         B
0  0.810898  0.686218
1  0.809572  0.490291
2  0.220597  0.665687
3  0.723891  0.618141
4  0.946046  0.755658



0    False
1    False
2     True
3    False
4    False
Name: A, dtype: bool

In [17]:
print(df[(df['A'] < 0.5) & (df['B'] > 0.3)])
print(df.query("A < 0.5 and B > 0.3"))

          A         B
2  0.220597  0.665687
          A         B
2  0.220597  0.665687


In [18]:
# 문자열 조건 검색

data = {
    "Animal" : ["Dog", "Cat", "Cat", "Pig", "Cat"],
    "name" : ["Happy", "Sam", "Tom", "Mini", "Rocky"]
}

print(data)

{'Animal': ['Dog', 'Cat', 'Cat', 'Pig', 'Cat'], 'name': ['Happy', 'Sam', 'Tom', 'Mini', 'Rocky']}


In [19]:
df = pd.DataFrame(data)
print(df)

  Animal   name
0    Dog  Happy
1    Cat    Sam
2    Cat    Tom
3    Pig   Mini
4    Cat  Rocky


In [20]:
# 2가지 방법

# 1) contanins()
print(df["Animal"].str.contains("Cat"))

print()
# 2) match()
print(df.Animal.str.match("Cat"))

0    False
1     True
2     True
3    False
4     True
Name: Animal, dtype: bool

0    False
1     True
2     True
3    False
4     True
Name: Animal, dtype: bool


In [22]:
condition = df["Animal"].str.contains("Cat")

df[condition]
# 조건으로 사용 가능

Unnamed: 0,Animal,name
1,Cat,Sam
2,Cat,Tom
4,Cat,Rocky


In [23]:
# 대소문자를 무시하고 문자열 조건 검색

df

Unnamed: 0,Animal,name
0,Dog,Happy
1,Cat,Sam
2,Cat,Tom
3,Pig,Mini
4,Cat,Rocky


In [24]:
df['Animal']

0    Dog
1    Cat
2    Cat
3    Pig
4    Cat
Name: Animal, dtype: object

In [26]:
# 대소문자를 무시하고 문자열 조건 검색

df['Animal'].str.contains("cat", case=False)

0    False
1     True
2     True
3    False
4     True
Name: Animal, dtype: bool

In [6]:
# 함수로 데이터 처리하기
# apply : 함수 받아서 적용함

df = pd.DataFrame(np.arange(5), columns=["Num"])

def squere(x):
    return x**2

# apply(함수명) >> 함수 사용하기
print(df["Num"].apply(squere))

df["Square"] = df.Num.apply(lambda x : x ** 2)

print(df)

0     0
1     1
2     4
3     9
4    16
Name: Num, dtype: int64
   Num  Square
0    0       0
1    1       1
2    2       4
3    3       9
4    4      16


In [9]:
df = pd.DataFrame(columns=["phone"])
df.loc[0] = "010-1234-1235"
df.loc[1] = "공일공-일이삼사-1235"
df.loc[2] = "010.1234.일이삼오"
df.loc[3] = "공1공-1234.1이3오"
df["preprocess_phone"] = ""

print(df)

           phone preprocess_phone
0  010-1234-1235                 
1  공일공-일이삼사-1235                 
2  010.1234.일이삼오                 
3  공1공-1234.1이3오                 


In [10]:
def get_preprocess_phone(phone):
    mapping_dict = {
        "공" : "0",
        "일" : "1",
        "이" : "2",
        "삼" : "3",
        "사" : "4",
        "오" : "5",
        "-" : "",
        "." : ""
    }
    for key, value in mapping_dict.items():
        phone = phone.replace(key, value)
    return phone

df['preprocess_phon'] = df['phone'].apply(get_preprocess_phone)
print(df)

           phone preprocess_phone preprocess_phon
0  010-1234-1235                      01012341235
1  공일공-일이삼사-1235                      01012341235
2  010.1234.일이삼오                      01012341235
3  공1공-1234.1이3오                      01012341235


In [11]:
df = pd.DataFrame(columns=["Sex"])
df.loc[0] = 'Male'
df.loc[1] = 'Male'
df.loc[2] = 'Female'
df.loc[3] = 'Female'
df.loc[4] = 'Male'

In [12]:
df

Unnamed: 0,Sex
0,Male
1,Male
2,Female
3,Female
4,Male


In [14]:
print(df)

df.Sex.replace({"Male" : 0, "Female" : 1})
df.Sex.replace({"Male" : 0, "Female" : 1}, inplace=True)

print(df)

      Sex
0       0
1       0
2  Female
3  Female
4       0
   Sex
0    0
1    0
2    1
3    1
4    0


In [28]:
# 그룹으로 묶기
# 조건부로 집계하고 싶은 경우

pd.DataFrame({
            'key' : ['A', 'B', 'C', 'A', 'B', 'C'], 
            'data' : range(6)
              })

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [29]:
df = pd.DataFrame({
            'key' : ['A', 'B', 'C', 'A', 'B', 'C'], 
            'data' : range(6)
              })

df.head(3)

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2


In [30]:
df.groupby('key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000028C0FC68F50>

In [32]:
df.groupby('key').sum()
# 'key' 별로 묶어서 각 key 별로 더해줘

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [33]:
df.groupby(['key', 'data']).sum()

key,data
A,0
A,3
B,1
B,4
C,2
C,5


In [36]:
# aggregate
# groupby 통해서 집계를 한번에 계산하고자 할 때

data = {
    'Group' : ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
    'data1' : [20,30,40,20,30,40,30,50],
    'data2' : range(1,9)
}

pd.DataFrame(data)

Unnamed: 0,Group,data1,data2
0,A,20,1
1,B,30,2
2,A,40,3
3,B,20,4
4,A,30,5
5,B,40,6
6,A,30,7
7,B,50,8


In [37]:
df = pd.DataFrame(data)

df.head(3)

Unnamed: 0,Group,data1,data2
0,A,20,1
1,B,30,2
2,A,40,3


In [41]:
df.groupby('Group').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
Group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,20,30.0,40,1,4.0,7
B,20,35.0,50,2,5.0,8


In [43]:
df.groupby('Group').aggregate({'data1' : 'min', 'data2' : 'sum'})

Unnamed: 0_level_0,data1,data2
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
A,20,16
B,20,20
