## <strong> 8. Pandas 객체 생성 및 조작 </strong>

`Pandas` 라이브러리 설치

In [1]:
!pip install pandas



In [2]:
import pandas as pd
import numpy as np

In [3]:
# Show the help documentation for pandas (`?` is IPython help syntax, not plain Python)
pd?

In [4]:
# Check the installed pandas version
pd.__version__

'1.1.5'

### Pandas 객체: <strong> Series </strong>

In [5]:
# [+] Build a Series from a plain Python list.
# No index is passed, so the values get the default integer labels 0..3.
ser = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
ser

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [6]:
# pandas object attribute: .values
# Exposes the Series data as a NumPy array (see the array(...) output).
vals = ser.values
vals

array([0.25, 0.5 , 0.75, 1.  ])

In [7]:
# pandas object attribute: .index
# The default index prints as a RangeIndex; list() expands it to the actual labels.
ind = ser.index
print(ind)
print(list(ind))

RangeIndex(start=0, stop=4, step=1)
[0, 1, 2, 3]


In [8]:
# [+] Label-based indexing: attach explicit string labels to the values,
# then look one value up by its label.
ser = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=['a', 'b', 'c', 'd'],
)
print(ser)
print(ser['a'])

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.25


### <strong> Dictionary와 Series 객체 </strong>

In [32]:
# Build a Series from a dictionary: the keys become the index labels
# and the values become the data.
# Population per US state.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}

population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [33]:
# [+] Label-based slicing: from 'California' through 'New York'.
# The end label is included in the result (see the output).
population['California':'New York']

California    38332521
Texas         26448193
New York      19651127
dtype: int64

### Pandas 객체: <strong> DataFrame </strong>

In [34]:
# Area per US state, keyed by state name; the dictionary is then
# converted into a Series so it shares the same index labels as `population`.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}

area = pd.Series(data=area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [35]:
# [+] Combine the 'population' and 'area' Series into a single DataFrame.
# The dictionary ({} is Python's dict literal) maps each column name
# to the Series that supplies that column.
states = pd.DataFrame(
    data={
        'population': population,
        'area': area,
    }
)
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [36]:
# Row index and column labels of the DataFrame
print(states.index)
print(states.columns)

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')


In [37]:
# Underlying data of the DataFrame as a 2-D NumPy array (one row per state)
states.values

array([[38332521,   423967],
       [26448193,   695662],
       [19651127,   141297],
       [19552860,   170312],
       [12882135,   149995]], dtype=int64)

In [38]:
# [+] Access one column of the DataFrame as a Series (dictionary-style key)
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [39]:
# The same 'area' column via attribute-style access
states.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [40]:
# Attribute-style access to the 'population' column
states.population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64

In [41]:
states['population']
# Prefer states['population'] over states.population: bracket access works
# for any column name, while attribute access breaks when a column name
# collides with an existing DataFrame attribute or method.

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64

#### NumPy 배열로부터 DataFrame 객체 생성
+ 예제, 오늘의 운세: 금전운(```money_fortune```), 연애운(```love_fortune```)

In [42]:
# [+] Create a 12x2 NumPy array of random floats drawn uniformly from [0, 1).
# np.random.rand(12, 2) is a convenience wrapper around this same call.
arr = np.random.random_sample((12, 2))
arr

array([[9.95867797e-01, 4.56296810e-01],
       [5.16812204e-01, 4.99864526e-01],
       [5.85256252e-01, 6.97793181e-01],
       [3.48481858e-01, 5.02843642e-01],
       [5.60639517e-01, 9.61485466e-01],
       [3.97473050e-01, 7.78171131e-01],
       [1.78665782e-01, 7.59231493e-01],
       [8.70406502e-01, 8.00844794e-04],
       [4.62367852e-03, 1.21161840e-01],
       [6.45100431e-01, 1.81423532e-01],
       [2.42970739e-01, 2.84220783e-01],
       [6.98350705e-01, 5.68294107e-01]])

In [43]:
# Build a DataFrame from the NumPy array: one row per month,
# one column per kind of fortune.
df = pd.DataFrame(
    data=arr,
    index=[
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
    ],
    columns=['money_fortune', 'love_fortune'],
)

df

Unnamed: 0,money_fortune,love_fortune
Jan,0.995868,0.456297
Feb,0.516812,0.499865
Mar,0.585256,0.697793
Apr,0.348482,0.502844
May,0.56064,0.961485
Jun,0.397473,0.778171
Jul,0.178666,0.759231
Aug,0.870407,0.000801
Sep,0.004624,0.121162
Oct,0.6451,0.181424
Nov,0.242971,0.284221
Dec,0.698351,0.568294


### <strong> Series 객체 조작 </strong>

#### Dictionary 스타일 조작

In [44]:
# Create the Series used for the dictionary-style operations in this cell.
ser = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

print(ser)
print('a' in ser)   # [+] `in` with a key: True, 'a' is one of the index labels
print(0.25 in ser)  # [+] `in` with a value: False, `in` only tests index labels

print(ser.index)    # [+] the index object
print(ser.keys())   # [+] keys(): the same labels as .index
ser['a'] = 0.125    # [+] overwrite the value stored under an existing label
ser['e'] = 1.25     # [+] assigning to a new label appends an entry
print(ser)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
True
False
Index(['a', 'b', 'c', 'd'], dtype='object')
Index(['a', 'b', 'c', 'd'], dtype='object')
a    0.125
b    0.500
c    0.750
d    1.000
e    1.250
dtype: float64


#### 배열 스타일 조작

In [45]:
print(ser['a':'c'])   # slicing by label (end label 'c' is included)
print(ser[(ser > 0.3) & (ser < 0.8)])   # boolean mask built from logical operations
# Fancy indexing: select several labels at once with a list
ind = ['a', 'e'] 
print(ser[ind])

a    0.125
b    0.500
c    0.750
dtype: float64
b    0.50
c    0.75
dtype: float64
a    0.125
e    1.250
dtype: float64


#### Pandas 객체 인덱싱
+ 정수 기반 인덱싱(암묵적, implicit)
+ 레이블 기반 인덱싱(명시적, explicit)

In [46]:
# Select 'a', 'b', 'c' with slicing
print(ser[0:3])   # integer-based (implicit) slicing: stop position 3 is excluded
print(ser['a':'c'])     # label-based (explicit) slicing: end label 'c' is included

a    0.125
b    0.500
c    0.750
dtype: float64
a    0.125
b    0.500
c    0.750
dtype: float64


### <strong> DataFrame 객체 조작 </strong>

In [47]:
# Access a specific column (Series): both forms select the same 'area' column.
# Only the value of the last expression in the cell is displayed.
states['area']
states.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [48]:
# [+] Add a new column (density = population / area).
# The division is element-wise, giving one density value per state.
states['density']=states['population']/states['area']
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


#### 인덱서: ```loc```, ```iloc```

In [26]:
# Create a Series whose explicit integer labels (1, 3, 5) differ from
# the implicit positions (0, 1, 2), so loc and iloc can be contrasted.
ser = pd.Series(data=['a', 'b', 'c'], index=[1, 3, 5])
ser

1    a
3    b
5    c
dtype: object

In [27]:
# loc indexer: refers to the explicit index labels
print(ser.loc[1])  # label 1 -> 'a'
print(ser.loc[1:3])  # labels 1 through 3, end label included

a
1    a
3    b
dtype: object


In [28]:
# iloc indexer: refers to the implicit integer positions
print(ser.iloc[1])  # position 1 -> 'b'
print(ser.iloc[1:3])  # positions 1 and 2, stop position excluded

b
3    b
5    c
dtype: object


In [29]:
# A DataFrame can be treated like a 2-D array
print(states.values, '\n')        # underlying values as a NumPy array
print(states.T, '\n')             # transpose: rows and columns swapped
print(states.iloc[:3, :2], '\n')  # integer-based slicing: first 3 rows, first 2 columns
print(states.loc[: 'Illinois', : 'population'])  # label-based slicing: up to and including these labels

[[3.83325210e+07 4.23967000e+05 9.04139261e+01]
 [2.64481930e+07 6.95662000e+05 3.80187404e+01]
 [1.96511270e+07 1.41297000e+05 1.39076746e+02]
 [1.95528600e+07 1.70312000e+05 1.14806121e+02]
 [1.28821350e+07 1.49995000e+05 8.58837628e+01]] 

              California         Texas      New York       Florida  \
population  3.833252e+07  2.644819e+07  1.965113e+07  1.955286e+07   
area        4.239670e+05  6.956620e+05  1.412970e+05  1.703120e+05   
density     9.041393e+01  3.801874e+01  1.390767e+02  1.148061e+02   

                Illinois  
population  1.288214e+07  
area        1.499950e+05  
density     8.588376e+01   

            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297 

            population
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135


In [30]:
# Masking + fancy indexing: keep the rows whose density exceeds 100 and,
# of those rows, only the 'population' and 'density' columns.
# The column is selected with brackets rather than attribute access, which
# stays correct even if a column name clashes with a DataFrame attribute.
states.loc[states['density'] > 100, ['population', 'density']]

Unnamed: 0,population,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [31]:
# Modify a value in place: row position 0, column position 2
# (California's density in the table shown below).
states.iloc[0, 2] = 90
states

Unnamed: 0,population,area,density
California,38332521,423967,90.0
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763
