## <strong> 8. Pandas 객체 생성 및 조작 </strong>

```Pandas``` 라이브러리 설치

In [1]:
!pip install pandas



In [2]:
import pandas as pd
import numpy as np

In [3]:
# 헬프 문서 출력
pd?

[0;31mType:[0m        module
[0;31mString form:[0m <module 'pandas' from '/opt/anaconda3/lib/python3.11/site-packages/pandas/__init__.py'>
[0;31mFile:[0m        /opt/anaconda3/lib/python3.11/site-packages/pandas/__init__.py
[0;31mDocstring:[0m  
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point data.

In [4]:
# 라이브러리 버전 확인
pd.__version__

'2.1.4'

### Pandas 객체: <strong> Series </strong>

In [5]:
# [+] List 객체에서 Series 객체를 생성
ser = pd.Series([0.25,0.5,0.75,1.0])
ser

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [6]:
# Pandas 객체 속성: .values
vals = ser.values
vals

array([0.25, 0.5 , 0.75, 1.  ])

In [7]:
# Pandas 객체 속성: .index
ind = ser.index
print(ind)
print(list(ind))

RangeIndex(start=0, stop=4, step=1)
[0, 1, 2, 3]


In [8]:
# [+] 레이블 기반 인덱싱
ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(ser)
print(ser['a'])

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.25


### <strong> Dictionary와 Series 객체 </strong>

In [9]:
# Dictionary 객체에서 Series 객체를 생성
population_dict = {
    'California': 38332521,   # 미국 연방주 별 인구
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135
}

population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [10]:
# [+] 레이블 기반 인덱싱: 'California'에서 'New York'까지
population['California':"New York"]

California    38332521
Texas         26448193
New York      19651127
dtype: int64

### Pandas 객체:<strong> DataFrame </strong>

In [11]:
# 미국 연방주 별 면적
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995
}

area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [12]:
# [+] 'population'과 'area' Series 객체를 포함하는 DataFrame 객체 생성
states = pd.DataFrame({"population": "population", 'area':area})
states

Unnamed: 0,population,area
California,population,423967
Texas,population,695662
New York,population,141297
Florida,population,170312
Illinois,population,149995


In [13]:
# DataFrame 객체의 인덱스와 컬럼
print(states.index)
print(states.columns)
states.values

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')


array([['population', 423967],
       ['population', 695662],
       ['population', 141297],
       ['population', 170312],
       ['population', 149995]], dtype=object)

In [14]:
# [+] DataFrame으로부터 Series 객체 접근하기
states['area']
# states.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

#### NumPy 배열로부터 DataFrame 객체 생성
+ 예제, 오늘의 운세: 금전운(```money_fortune```), 연애운(```love_fortune```)

In [15]:
# [+] 실수 난수로 이루어진 12X2 크기의 NumPy 배열 생성
arr = np.random.rand(12, 2)
arr

array([[0.24663302, 0.31780131],
       [0.46593501, 0.44163225],
       [0.14420699, 0.41770884],
       [0.68659895, 0.53829007],
       [0.08774287, 0.16305146],
       [0.17557122, 0.32387285],
       [0.67150313, 0.58470417],
       [0.78464211, 0.65036438],
       [0.16262022, 0.10721968],
       [0.43402941, 0.72212493],
       [0.99495632, 0.29395441],
       [0.79989413, 0.92804294]])

In [16]:
# NumPy 배열로부터 DataFrame 객체 생성
df = pd.DataFrame(
    arr, 
    columns=['money_fortune', 'love_fortune'], 
    index=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
)
df

Unnamed: 0,money_fortune,love_fortune
Jan,0.246633,0.317801
Feb,0.465935,0.441632
Mar,0.144207,0.417709
Apr,0.686599,0.53829
May,0.087743,0.163051
Jun,0.175571,0.323873
Jul,0.671503,0.584704
Aug,0.784642,0.650364
Sep,0.16262,0.10722
Oct,0.434029,0.722125


### <strong> Series 객체 조작 </strong>

#### Dictionary 스타일 조작

In [17]:
# Series 객체 생성
ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

print(ser)    
print('a' in ser)   # [+] Key + in 키워드
print(0.25 in ser)  # [+] Value + in 키워드
print(ser.index)    # [+] 인덱스
print(ser.keys())   # [+] 키 집합
ser['e'] = 1.25     # [+] 값 추가
ser['a'] = 0.125    # [+] 값 수정
print(ser)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
True
False
Index(['a', 'b', 'c', 'd'], dtype='object')
Index(['a', 'b', 'c', 'd'], dtype='object')
a    0.125
b    0.500
c    0.750
d    1.000
e    1.250
dtype: float64


#### 배열 스타일 조작

In [18]:
print(ser['a':'c'])   # 슬라이싱
print(ser[(ser > 0.3) & (ser < 0.8)])   # 논리 연산
# 팬시 인덱싱
ind = ['a', 'e'] 
print(ser[ind])

a    0.125
b    0.500
c    0.750
dtype: float64
b    0.50
c    0.75
dtype: float64
a    0.125
e    1.250
dtype: float64


#### Pandas 객체 인덱싱
+ 정수 기반 인덱싱(암묵적, implicit)
+ 레이블 기반 인덱싱(명시적, explicit)

In [19]:
# 슬라이싱을 이용한 'a', 'b', 'c' 선택
print(ser[0:3])   # 정수 기반 인덱싱
print(ser['a':'c'])     # 레이블 기반 인덱싱

a    0.125
b    0.500
c    0.750
dtype: float64
a    0.125
b    0.500
c    0.750
dtype: float64


### <strong> DataFrame 객체 조작 </strong>

In [20]:
# 특정 Series 객체 접근
states['area']
# states.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [24]:
states['population'] = pd.to_numeric(states['population'], errors='coerce')

In [25]:
# [+] Series 객체 추가 (밀도 = 인구 / 면적)
states['density'] = states['population'] / states['area']

states

Unnamed: 0,population,area,density
California,,423967,
Texas,,695662,
New York,,141297,
Florida,,170312,
Illinois,,149995,


#### 인덱서: ```loc```, ```iloc```

In [26]:
# Series 객체 생성
ser = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
ser

1    a
3    b
5    c
dtype: object

In [27]:
# loc 인덱서
print(ser.loc[1])
print(ser.loc[1:3])

a
1    a
3    b
dtype: object


In [28]:
# iloc 인덱서
print(ser.iloc[1])
print(ser.iloc[1:3])

b
3    b
5    c
dtype: object


In [29]:
# DataFrame 객체 = 2차원 배열
print(states.values, '\n')        # 값들을 얻어오기
print(states.T, '\n')             # 전치행렬
print(states.iloc[:3, :2], '\n')  # 정수 기반 슬라이싱
print(states.loc[: 'Illinois', : 'population'])  # 레이블 기반 슬라이싱

[[    nan 423967.     nan]
 [    nan 695662.     nan]
 [    nan 141297.     nan]
 [    nan 170312.     nan]
 [    nan 149995.     nan]] 

            California     Texas  New York   Florida  Illinois
population         NaN       NaN       NaN       NaN       NaN
area          423967.0  695662.0  141297.0  170312.0  149995.0
density            NaN       NaN       NaN       NaN       NaN 

            population    area
California         NaN  423967
Texas              NaN  695662
New York           NaN  141297 

            population
California         NaN
Texas              NaN
New York           NaN
Florida            NaN
Illinois           NaN


In [30]:
# 마스킹 + 팬시 인덱싱
states.loc[states.density > 100, ['population', 'density']]

Unnamed: 0,population,density


In [31]:
# 값 수정
states.iloc[0, 2] = 90
states

Unnamed: 0,population,area,density
California,,423967,90.0
Texas,,695662,
New York,,141297,
Florida,,170312,
Illinois,,149995,
