# DataFrame 조작하는 방법
- fillna : NaN을 원하는 값으로 변경

In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Small all-integer frame reused by the fillna / astype examples.
sample_df = pd.DataFrame(
    dict(
        A=[1, 3, 4, 3, 2],
        B=[2, 3, 1, 4, 2],
        C=[1, 5, 3, 4, 1],
    )
)
sample_df

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,3
3,3,4,4
4,2,2,1


In [4]:
# Overwrite the cell at row position 2, column position 2 (column 'C') with NaN
# so the frame contains a missing value; column 'C' becomes float as a result.
sample_df.iloc[2,2] = np.nan
sample_df

Unnamed: 0,A,B,C
0,1,2,1.0
1,3,3,5.0
2,4,1,
3,3,4,4.0
4,2,2,1.0


In [5]:
# Count how often each value occurs in every column. Values that never appear
# in a column come back as NaN after alignment, so replace those with 0.0.
# pd.Series.value_counts is used because the top-level pd.value_counts
# function is deprecated in pandas >= 2.1.
sample_df.apply(pd.Series.value_counts).fillna(0.0)

Unnamed: 0,A,B,C
1.0,1.0,1.0,2.0
2.0,1.0,2.0,0.0
3.0,2.0,1.0,0.0
4.0,1.0,1.0,1.0
5.0,0.0,0.0,1.0


- astype : 자료형을 바꾸는 기능

In [6]:
# Same per-column frequency table as before, but with the NaN gaps filled by 0
# and the counts cast back to integers (NaN had forced them to float).
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts.
sample_df.apply(pd.Series.value_counts).fillna(0).astype(int)

Unnamed: 0,A,B,C
1.0,1,1,2
2.0,1,2,0
3.0,2,1,0
4.0,1,1,1
5.0,0,0,1


In [7]:
import seaborn as sns
# Load seaborn's 'titanic' example dataset; it is used below as a DataFrame.
titanic = sns.load_dataset('titanic')

In [14]:
# Passengers with no recorded age get the mean age of the passengers whose
# age is known (Series.mean skips NaN by default).
titanic['age'] = titanic['age'].fillna(titanic['age'].mean())
# Verify that no missing ages remain after the fill.
print('타이타닉 df의 "age" 칼럼 결측치의 수 :', pd.isnull(titanic['age']).sum())
display(titanic['age'].head(20))

# Build an 'age_gender_cat' column from sex and age:
#   1. the value starts with the sex string, 'male' or 'female';
#   2. the age, truncated to an integer, is appended as text.
# Example: a 27-year-old man becomes 'male27'.

# astype(int) drops the fractional part (e.g. the imputed mean 29.699118 -> 29);
# astype(str) is the vectorised equivalent of apply(str).
titanic['age_gender_cat'] = titanic['sex'] + titanic['age'].astype(int).astype(str)
display(titanic['age_gender_cat'].head(20))

타이타닉 df의 "age" 칼럼 결측치의 수 : 0


0     22.000000
1     38.000000
2     26.000000
3     35.000000
4     35.000000
5     29.699118
6     54.000000
7      2.000000
8     27.000000
9     14.000000
10     4.000000
11    58.000000
12    20.000000
13    39.000000
14    14.000000
15    55.000000
16     2.000000
17    29.699118
18    31.000000
19    29.699118
Name: age, dtype: float64

0       male22
1     female38
2     female26
3     female35
4       male35
5       male29
6       male54
7        male2
8     female27
9     female14
10     female4
11    female58
12      male20
13      male39
14    female14
15    female55
16       male2
17      male29
18    female31
19    female29
Name: age_gender_cat, dtype: object

## DataFrame index 조작하는 방법
- set_index : 기존 행 인덱스를 제거하고 데이터 열 중 하나를 인덱스로 설정
- reset_index : 기존 행 인덱스를 제거하고 그 인덱스를 데이터 열로 추가

In [15]:
# Fix the seed so the random values are reproducible.
np.random.seed(100)
# 3x5 uniform samples rounded to 2 decimals; each row becomes one column.
_values = np.round(np.random.rand(3, 5), 2)
# Build the frame column by column. Stacking the letters and the numbers into
# one array (np.vstack) would coerce every value to a string, leaving
# col02..col04 as text instead of floats.
index_df = pd.DataFrame({
    'col01': list('ABCDE'),
    'col02': _values[0],
    'col03': _values[1],
    'col04': _values[2],
})
index_df

Unnamed: 0,col01,col02,col03,col04
0,A,0.54,0.12,0.89
1,B,0.28,0.67,0.21
2,C,0.42,0.83,0.19
3,D,0.84,0.14,0.11
4,E,0.0,0.58,0.22


In [16]:
# set_index returns a new frame in which 'col01' is removed from the columns
# and used as the row index; the result is bound to a new name.
index_df2 = index_df.set_index('col01')
index_df2

Unnamed: 0_level_0,col02,col03,col04
col01,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.54,0.12,0.89
B,0.28,0.67,0.21
C,0.42,0.83,0.19
D,0.84,0.14,0.11
E,0.0,0.58,0.22


In [17]:
# Calling set_index again replaces the current index: the previous 'col01'
# index is discarded and 'col02' becomes the row index.
index_df3 = index_df2.set_index('col02')
index_df3

Unnamed: 0_level_0,col03,col04
col02,Unnamed: 1_level_1,Unnamed: 2_level_1
0.54,0.12,0.89
0.28,0.67,0.21
0.42,0.83,0.19
0.84,0.14,0.11
0.0,0.58,0.22


In [18]:
# drop=True throws the current index ('col01') away instead of restoring it
# as a column, leaving a default 0..n-1 integer index.
index_df2.reset_index(drop=True)

Unnamed: 0,col02,col03,col04
0,0.54,0.12,0.89
1,0.28,0.67,0.21
2,0.42,0.83,0.19
3,0.84,0.14,0.11
4,0.0,0.58,0.22


In [None]:
# 5명의 학생의 국어, 영어, 수학 점수를 나타내는 데이터프레임을 만든다.
# 1. 
# 학생 이름을 나타내는 열을 포함시키지 않고 
# 데이터프레임 df_score1 을 생성한 후, 
# df_score1.index 속성에 학생 이름을 나타내는 열을 지정하여 인덱스를 지정한다. 
# reset_index 명령으로 이 인덱스 열을
# 일반 데이터 열로 바꾸어 데이터프레임 df_score2를 만든다.

# 2.
# 학생 이름을 나타내는 열을 일반 데이터 열로 포함하는 데이터프레임 df_score2에
# set_index 명령을 적용하여 다시 학생 이름을 나타내는 열을 인덱스로 변경한다.


In [20]:
# Scores of five students in Korean, English and math (no name column yet).
df_score1 = pd.DataFrame(
    dict(
        kor=[90, 50, 21, 45, 70],
        eng=[80, 42, 42, 65, 45],
        math=[45, 85, 45, 25, 95],
    )
)
df_score1

Unnamed: 0,kor,eng,math
0,90,80,45
1,50,42,85
2,21,42,45
3,45,65,25
4,70,45,95


In [23]:
# Assign the student names directly to the index attribute, then label the
# index 'name' so a later reset_index() yields a column with that name.
df_score1.index = ['Curry', 'Thompson', 'Iguadala', 'Durant', 'Green']
df_score1.index.name = 'name'
df_score1

Unnamed: 0_level_0,kor,eng,math
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Curry,90,80,45
Thompson,50,42,85
Iguadala,21,42,45
Durant,45,65,25
Green,70,45,95


In [25]:
# reset_index moves the 'name' index back into an ordinary column and
# restores a default integer index.
df_score2 = df_score1.reset_index()
df_score2

Unnamed: 0,name,kor,eng,math
0,Curry,90,80,45
1,Thompson,50,42,85
2,Iguadala,21,42,45
3,Durant,45,65,25
4,Green,70,45,95


In [29]:
# set_index('name') turns the 'name' column back into the row index.
df_score3 = df_score2.set_index('name')
display(df_score3)

Unnamed: 0_level_0,kor,eng,math
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Curry,90,80,45
Thompson,50,42,85
Iguadala,21,42,45
Durant,45,65,25
Green,70,45,95


- np.random.randint : 정수 난수 1개 생성
- np.random.rand : 0 이상 1 미만의 균일 분포에서 난수 생성
- np.random.randn : 가우시안 표준 정규분포에서 난수 생성

In [92]:
np.random.randint(6) # one random integer from 0 to 5 (the upper bound 6 is exclusive)

0

In [31]:
np.random.randint(1, 20) # one random integer from 1 to 19 (the upper bound 20 is exclusive)

13

In [33]:
np.random.rand(6) # six samples drawn uniformly from [0, 1)

array([0.81168315, 0.17194101, 0.81622475, 0.27407375, 0.43170418,
       0.94002982])

In [34]:
# The arguments are the output dimensions: a 3x2 array of uniform samples from [0, 1).
np.random.rand(3,2)

array([[0.81764938, 0.33611195],
       [0.17541045, 0.37283205],
       [0.00568851, 0.25242635]])

In [35]:
# Six samples from the standard normal distribution (mean 0, variance 1).
np.random.randn(6)

array([ 1.61898166,  1.54160517, -0.25187914, -0.84243574,  0.18451869,
        0.9370822 ])

In [36]:
# 3x2 array of standard-normal samples.
np.random.randn(3,2)

array([[ 0.73100034,  1.36155613],
       [-0.32623806,  0.05567601],
       [ 0.22239961, -1.443217  ]])

## DataFrame merge

In [54]:
# Student records keyed by student number ('학번').
data1 = {
    '학번': [1, 2, 3, 4],
    '이름': ['섭섭해', '김한준', '김선림', '최호진'],
    '학년': [2, 4, 1, 3],
}

# Major / grade records using the same '학번' key; the numbers only partly
# overlap with data1 (3 is missing here, 5 is extra).
data2 = {
    '학번': [1, 2, 4, 5],
    '학과': ['CS', 'MATH', 'MATH', 'CS'],
    '학점': [2.4, 4.5, 3.4, 4.2],
}

In [55]:
# Build one DataFrame per dict; the dict keys become the column names.
stu_df   = pd.DataFrame(data1)
major_df = pd.DataFrame(data2)
display(stu_df)
display(major_df)

Unnamed: 0,학번,이름,학년
0,1,섭섭해,2
1,2,김한준,4
2,3,김선림,1
3,4,최호진,3


Unnamed: 0,학번,학과,학점
0,1,CS,2.4
1,2,MATH,4.5
2,4,MATH,3.4
3,5,CS,4.2


In [56]:
# Merge on the shared '학번' column. With no `how` given, merge performs an
# inner join, so only keys present in both frames are kept.
pd.merge(stu_df, major_df, on='학번')

Unnamed: 0,학번,이름,학년,학과,학점
0,1,섭섭해,2,CS,2.4
1,2,김한준,4,MATH,4.5
2,4,최호진,3,MATH,3.4


In [57]:
# Explicit inner join: only '학번' values found in both frames (1, 2, 4) remain.
pd.merge(stu_df, major_df, on='학번', how='inner')

Unnamed: 0,학번,이름,학년,학과,학점
0,1,섭섭해,2,CS,2.4
1,2,김한준,4,MATH,4.5
2,4,최호진,3,MATH,3.4


In [58]:
# Left join: every row of stu_df is kept; '학번' 3 has no match in major_df,
# so its '학과' and '학점' are NaN.
pd.merge(stu_df, major_df, on='학번', how='left')

Unnamed: 0,학번,이름,학년,학과,학점
0,1,섭섭해,2,CS,2.4
1,2,김한준,4,MATH,4.5
2,3,김선림,1,,
3,4,최호진,3,MATH,3.4


In [59]:
# Right join: every row of major_df is kept; '학번' 5 has no student row,
# so its '이름' and '학년' are NaN.
pd.merge(stu_df, major_df, on='학번', how='right')

Unnamed: 0,학번,이름,학년,학과,학점
0,1,섭섭해,2.0,CS,2.4
1,2,김한준,4.0,MATH,4.5
2,4,최호진,3.0,MATH,3.4
3,5,,,CS,4.2


In [60]:
# Outer join: union of the keys of both frames; whichever side has no match
# is filled with NaN.
pd.merge(stu_df, major_df, on='학번', how='outer')

Unnamed: 0,학번,이름,학년,학과,학점
0,1,섭섭해,2.0,CS,2.4
1,2,김한준,4.0,MATH,4.5
2,3,김선림,1.0,,
3,4,최호진,3.0,MATH,3.4
4,5,,,CS,4.2


In [61]:
# Student records keyed by '학번'.
data1 = {
    '학번': [1, 2, 3, 4],
    '이름': ['섭섭해', '김한준', '김선림', '최호진'],
    '학년': [2, 4, 1, 3],
}

# Same values as before, but the key column is now called '과목코드',
# so the two dicts no longer share a column name.
data2 = {
    '과목코드': [1, 2, 4, 5],
    '학과': ['CS', 'MATH', 'MATH', 'CS'],
    '학점': [2.4, 4.5, 3.4, 4.2],
}

In [62]:
# Rebuild both frames; the key column is '학번' in stu_df but '과목코드' in major_df.
stu_df   = pd.DataFrame(data1)
major_df = pd.DataFrame(data2)
display(stu_df)
display(major_df)

Unnamed: 0,학번,이름,학년
0,1,섭섭해,2
1,2,김한준,4
2,3,김선림,1
3,4,최호진,3


Unnamed: 0,과목코드,학과,학점
0,1,CS,2.4
1,2,MATH,4.5
2,4,MATH,3.4
3,5,CS,4.2


In [63]:
# The key columns have different names, so each side's key is named with
# left_on / right_on. Both key columns are kept in the result.
pd.merge(stu_df, major_df, left_on='학번', right_on='과목코드', how='inner')

Unnamed: 0,학번,이름,학년,과목코드,학과,학점
0,1,섭섭해,2,1,CS,2.4
1,2,김한준,4,2,MATH,4.5
2,4,최호진,3,4,MATH,3.4


In [70]:
# Petal lengths; each species appears twice.
iris_df01 = pd.DataFrame(
    {
        '품종': ['setosa', 'setosa', 'virginica', 'virginica'],
        '꽃잎길이': [1.4, 1.3, 1.5, 1.3],
    }
)
iris_df01

Unnamed: 0,품종,꽃잎길이
0,setosa,1.4
1,setosa,1.3
2,virginica,1.5
3,virginica,1.3


In [71]:
# Petal widths; 'virginica' is duplicated and 'versicolor' has no partner
# in iris_df01.
iris_df02 = pd.DataFrame(
    {
        '품종': ['setosa', 'virginica', 'virginica', 'versicolor'],
        '꽃잎너비': [0.4, 0.3, 0.5, 0.3],
    }
)
iris_df02

Unnamed: 0,품종,꽃잎너비
0,setosa,0.4
1,virginica,0.3
2,virginica,0.5
3,versicolor,0.3


In [72]:
# No `on` given: merge joins on the column name the frames share ('품종').
# Duplicate keys pair up in every combination, so the two virginica rows on
# each side yield four result rows.
pd.merge(left=iris_df01, right=iris_df02)

Unnamed: 0,품종,꽃잎길이,꽃잎너비
0,setosa,1.4,0.4
1,setosa,1.3,0.4
2,virginica,1.5,0.3
3,virginica,1.5,0.5
4,virginica,1.3,0.3
5,virginica,1.3,0.5


In [73]:
# Left frame: now carries a '꽃잎너비' column (same name as in the right
# frame) plus a bloom-period column stored as text.
iris_df01 = pd.DataFrame(
    {
        '품종': ['setosa', 'setosa', 'virginica', 'virginica'],
        '꽃잎너비': [1.4, 1.3, 1.5, 1.3],
        '개화시기': ['202012', '202010', '202009', '202010'],
    }
)

# Right frame: species and petal width.
iris_df02 = pd.DataFrame(
    {
        '품종': ['setosa', 'virginica', 'virginica', 'versicolor'],
        '꽃잎너비': [0.4, 0.3, 0.5, 0.3],
    }
)

In [75]:
# Both frames have a non-key column named '꽃잎너비'. Joining only on '품종'
# keeps both, disambiguated by the default suffixes _x (left) and _y (right).
pd.merge(iris_df01, iris_df02, on='품종')

Unnamed: 0,품종,꽃잎너비_x,개화시기,꽃잎너비_y
0,setosa,1.4,202012,0.4
1,setosa,1.3,202010,0.4
2,virginica,1.5,202009,0.3
3,virginica,1.5,202009,0.5
4,virginica,1.3,202010,0.3
5,virginica,1.3,202010,0.5


- column index가 아닌 index를 기준으로 병합하기

In [77]:
# Population by city and year, with city/year held as ordinary columns.
pop_df01 = pd.DataFrame(
    {
        'city': ['seoul', 'seoul', 'seoul', 'busan', 'busan'],
        'year': [2010, 2005, 2020, 2018, 2015],
        'pop': [1234523, 2323112, 3423543, 4535232, 5543214],
    }
)
pop_df01

Unnamed: 0,city,year,pop
0,seoul,2010,1234523
1,seoul,2005,2323112
2,seoul,2020,3423543
3,busan,2018,4535232
4,busan,2015,5543214


In [81]:
# 6x2 frame holding 0..11 whose rows are labelled by a two-level
# (city, year) index built from two parallel lists.
pop_df02 = pd.DataFrame(
    np.arange(12).reshape(6, 2),
    index=[
        ['busan', 'busan', 'seoul', 'seoul', 'seoul', 'seoul'],
        [2010, 2005, 2020, 2018, 2015, 2010],
    ],
    columns=['col01', 'col02'],
)
pop_df02

Unnamed: 0,Unnamed: 1,col01,col02
busan,2010,0,1
busan,2005,2,3
seoul,2020,4,5
seoul,2018,6,7
seoul,2015,8,9
seoul,2010,10,11


In [83]:
# Match the ['city', 'year'] columns of pop_df01 against the two-level row
# index of pop_df02. Only (seoul, 2010) and (seoul, 2020) exist on both
# sides, so the default inner join keeps just those two rows.
pd.merge(pop_df01, pop_df02, left_on=['city','year'], right_index=True)

Unnamed: 0,city,year,pop,col01,col02
0,seoul,2010,1234523,10,11
2,seoul,2020,3423543,4,5


- Merging on a single-level index instead of a MultiIndex

In [85]:
# Student names and years.
data1 = {
    "이름": ["이지안", "박동훈", "이순신", "강감찬"],
    "학년": [2, 4, 1, 3],
}

# Majors and grades.
data2 = {
    "학과": ["CS", "MATH", "MATH", "CS"],
    "학점": [3.4, 2.9, 4.5, 1.2],
}

# Explicit row labels act as the join key; the two label sets only partly
# overlap (3 exists only in df01, 5 only in df02).
df01 = pd.DataFrame(data1, index=[1, 2, 3, 4])
df02 = pd.DataFrame(data2, index=[1, 2, 4, 5])
# Render both frames (display is supplied by the notebook environment).
display(df01)
display(df02)

Unnamed: 0,이름,학년
1,이지안,2
2,박동훈,4
3,이순신,1
4,강감찬,3


Unnamed: 0,학과,학점
1,CS,3.4
2,MATH,2.9
4,MATH,4.5
5,CS,1.2


In [86]:
# Join on the row index of both frames; with the default inner join only the
# labels present in both (1, 2, 4) remain.
merge_df = pd.merge(df01, df02, right_index=True, left_index=True)
merge_df

Unnamed: 0,이름,학년,학과,학점
1,이지안,2,CS,3.4
2,박동훈,4,MATH,2.9
4,강감찬,3,MATH,4.5


In [89]:
# join
df01.join(df02, how='inner')

Unnamed: 0,이름,학년,학과,학점
1,이지안,2,CS,3.4
2,박동훈,4,MATH,2.9
4,강감찬,3,MATH,4.5


In [87]:
# iloc is position-based: position 2 is the third row, whose index label is 4.
merge_df.iloc[2]

이름     강감찬
학년       3
학과    MATH
학점     4.5
Name: 4, dtype: object

In [88]:
# loc is label-based: this selects the row whose index label is 2.
merge_df.loc[2]

이름     박동훈
학년       4
학과    MATH
학점     2.9
Name: 2, dtype: object