## <strong> 6. Pandas 객체 생성 및 조작 </strong>

```Pandas``` 라이브러리 설치

In [1]:
!pip install pandas



In [2]:
import pandas as pd
import numpy as np

In [3]:
# 헬프 문서 출력
pd?

[1;31mType:[0m        module
[1;31mString form:[0m <module 'pandas' from 'c:\\Users\\jeong\\anaconda3\\Lib\\site-packages\\pandas\\__init__.py'>
[1;31mFile:[0m        c:\users\jeong\anaconda3\lib\site-packages\pandas\__init__.py
[1;31mDocstring:[0m  
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point 

In [4]:
# 라이브러리 버전 확인
pd.__version__

'2.2.2'

### Pandas 객체: <strong> Series </strong>

In [5]:
# [+] List 객체에서 Series 객체를 생성
ser = pd.Series([0.25, 0.5, 0.75, 1.0])
ser

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [6]:
# Pandas 객체 속성: .values
vals = ser.values
vals

array([0.25, 0.5 , 0.75, 1.  ])

In [8]:
type(ser.values)

numpy.ndarray

In [9]:
# Pandas 객체 속성: .index
ind = ser.index
print(ind)
print(list(ind))

RangeIndex(start=0, stop=4, step=1)
[0, 1, 2, 3]


In [5]:
# [+] 레이블 기반 인덱싱
ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(ser)
print(ser['a'])
ser['d']

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.25


1.0

### <strong> Dictionary와 Series 객체 </strong>

In [6]:
# Dictionary 객체에서 Series 객체를 생성
population_dict = {
    'California': 38332521,   # 미국 연방주 별 인구
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135
}

population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [15]:
population_dict['Texas' : 'Florida']

KeyError: slice('Texas', 'Florida', None)

In [12]:
population['Florida']
#population[3]

19552860

In [None]:
# [+] 레이블 기반 인덱싱: 'California'에서 'New York'까지
population['California' : 'New York'] #stop 값도 포함함.
#population[: 2] #stop 값을 포함하지 않음

California    38332521
Texas         26448193
New York      19651127
dtype: int64

### Pandas 객체:<strong> DataFrame </strong>

In [19]:
# 미국 연방주 별 면적
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995
}

area = pd.Series(area_dict)
area 

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [24]:
# [+] 'population'과 'area' Series 객체를 포함하는 DataFrame 객체 생성
states = pd.DataFrame({'population' : population, 'area' : area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [26]:
#states.index
states.columns

Index(['population', 'area'], dtype='object')

In [27]:
# DataFrame 객체의 인덱스와 컬럼
print(states.index)
print(states.columns)

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')


In [28]:
# [+] DataFrame으로부터 Series 객체 접근하기
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

#### **NumPy 배열로부터 DataFrame 객체 생성**
+ 예제, 오늘의 운세: 금전운(```money_fortune```), 연애운(```love_fortune```)

In [79]:
# [+] 실수 난수로 이루어진 12X2 크기의 NumPy 배열 생성
arr = np.random.rand(12, 2)
arr

array([[0.49286596, 0.56485825],
       [0.79122496, 0.69437703],
       [0.55837028, 0.66622999],
       [0.39066035, 0.48553511],
       [0.12963117, 0.26380896],
       [0.24253529, 0.52059146],
       [0.99448422, 0.19708195],
       [0.87165541, 0.75304895],
       [0.51266493, 0.37267628],
       [0.50354924, 0.91203702],
       [0.55337349, 0.38806934],
       [0.39400927, 0.52008905]])

In [80]:
# NumPy 배열로부터 DataFrame 객체 생성
df = pd.DataFrame(
    arr, #데이터 객체체
    columns=['money_fortune', 'love_fortune'], #컬럼 이름 지정정
    index=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', #인덱스 이름 지정정
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
)

df

Unnamed: 0,money_fortune,love_fortune
Jan,0.492866,0.564858
Feb,0.791225,0.694377
Mar,0.55837,0.66623
Apr,0.39066,0.485535
May,0.129631,0.263809
Jun,0.242535,0.520591
Jul,0.994484,0.197082
Aug,0.871655,0.753049
Sep,0.512665,0.372676
Oct,0.503549,0.912037


### <strong> Series 객체 조작 </strong>

#### Dictionary 스타일 조작

In [94]:
# Series 객체 생성
ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

print(ser)    
print('a' in ser)   # [+] Key + in 키워드 (가능)
print(0.25 in ser)  # [+] Value + in 키워드 (불가)
print(ser.index)    # [+] 인덱스
print(ser.keys)   # [+] 키 집합
ser['e'] = 1.25     # [+] 값 추가
ser['a'] = 0.125    # [+] 값 수정
print(ser)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
True
False
Index(['a', 'b', 'c', 'd'], dtype='object')
<bound method Series.keys of a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64>
a    0.125
b    0.500
c    0.750
d    1.000
e    1.250
dtype: float64


#### 배열 스타일 조작

In [95]:
print(ser['a':'c'])   # 슬라이싱
print(ser[(ser > 0.3) & (ser < 0.8)])   # 논리 연산
# 팬시 인덱싱
ind = ['a', 'e'] 
print(ser[ind])

a    0.125
b    0.500
c    0.750
dtype: float64
b    0.50
c    0.75
dtype: float64
a    0.125
e    1.250
dtype: float64


#### Pandas 객체 인덱싱
+ 정수 기반 인덱싱(암묵적, implicit)
+ 레이블 기반 인덱싱(명시적, explicit)

In [96]:
# 슬라이싱을 이용한 'a', 'b', 'c' 선택
print(ser[0:3])   # 정수 기반 인덱싱
print(ser['a':'c'])     # 레이블 기반 인덱싱

a    0.125
b    0.500
c    0.750
dtype: float64
a    0.125
b    0.500
c    0.750
dtype: float64


### <strong> DataFrame 객체 조작 </strong>

In [None]:
# 특정 Series 객체 접근
states['area'] #딕셔너리 스타일
states.area #객체.속성 접근 스타일

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [None]:
# [+] Series 객체 추가 (밀도 = 인구 / 면적)
#states['density'] = population / area
states['density'] = states['population'] / states['area']
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


#### 인덱서: ```loc```, ```iloc```

In [106]:
# Series 객체 생성
ser = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
ser

1    a
3    b
5    c
dtype: object

In [None]:
# loc 인덱서
print(ser.loc[1])
print(ser.loc[1:3])

In [None]:
# iloc 인덱서
print(ser.iloc[1])
print(ser.iloc[1:3])

In [None]:
# DataFrame 객체 = 2차원 배열
print(states.values, '\n')        # 값들을 얻어오기
print(states.T, '\n')             # 전치행렬
print(states.iloc[:3, :2], '\n')  # 정수 기반 슬라이싱
print(states.loc[: 'Illinois', : 'population'])  # 레이블 기반 슬라이싱

In [None]:
# 마스킹 + 팬시 인덱싱
states.loc[states.density > 100, ['population', 'density']]

In [None]:
# 값 수정
states.iloc[0, 2] = 90
states