In [1]:
import pandas as pd

# Pandas
- 높은 수준의 구조형 데이터 클래스와 데이터 분석 도구를 제공
- 표와 같은 스프레드시트 구조로 데이터를 다룰 수 있는 기능
- 색인을 사용하여 손쉽게 값에 접근하고 정렬하는 기능
- 누락된 데이터를 유연하게 처리할 수 있는 기능
- 데이터를 합치고 관계 연산을 수행하는 기능

<div class="row">
  <div class="column">
    <img src="https://storage.googleapis.com/lds-media/images/series-and-dataframe.original.png" alt="pandas Series and DataFrame" style="width:100%">
  </div>
  <div class="column">
    <img src="https://bookdata.readthedocs.io/en/latest/_images/base_01_pandas_5_0.png" alt="pandas illustration" style="width:100%">
  </div>
</div>

### Series 생성

In [6]:
# Build a Series from a list of values, attaching an explicit label to each element.
series = pd.Series(data=[3, 5, 7, 9], index=list('abcd'))

In [7]:
series  # echo the Series: index labels on the left, values on the right

a    3
b    5
c    7
d    9
dtype: int64

In [10]:
# The data as a NumPy array, without the index labels.
# `.to_numpy()` is the accessor recommended by pandas over the older `.values`.
series.to_numpy()

array([3, 5, 7, 9])

In [11]:
series.index  # the row labels, returned as a pandas Index object

Index(['a', 'b', 'c', 'd'], dtype='object')

### dictionary로부터 Series 생성

In [12]:
# A dict maps directly onto a Series: keys become the index, values become the data.
data = dict(a=3, b=5, c=7, d=9)
series = pd.Series(data)

In [13]:
series  # same contents as the list-based Series built earlier

a    3
b    5
c    7
d    9
dtype: int64

### indexing, slicing and operation

In [14]:
series.loc['a']  # look up a single value by its index label

3

In [15]:
# Take the first two elements by position.
# `.iloc` makes the positional intent explicit; a bare integer slice through `[]`
# on a label-indexed Series leaves it unclear whether position or label is meant.
series.iloc[:2]

a    3
b    5
dtype: int64

In [16]:
# Broadcasting: the scalar is applied to every element (equivalent to `series * 2`).
series.mul(2)

a     6
b    10
c    14
d    18
dtype: int64

## DataFrame

### DataFrame 생성

In [23]:
# Column-oriented input: each key is a column name, each list holds that column's values.
data = dict(
    state=['Ohio'] * 3 + ['Nevada'] * 2,
    year=[2002, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)

In [24]:
# `columns` fixes the column order (year, state, pop); `index` labels the five rows.
frame = pd.DataFrame(
    data,
    index=list('abcde'),
    columns=['year', 'state', 'pop'],
)

In [33]:
frame  # echo the DataFrame: rows labelled a-e, columns year / state / pop

Unnamed: 0,year,state,pop
a,2002,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,3.6
d,2001,Nevada,2.4
e,2002,Nevada,2.9


### column 추가

In [102]:
# Only rows 'b', 'c' and 'e' are given a value. Column assignment aligns on the
# index, so rows with no matching label ('a', 'd') end up as NaN.
val = pd.Series(data=[-1.2, 1.5, -1.7], index=list('bce'))
frame['debt'] = val

In [103]:
frame  # 'debt' is empty (NaN) for rows 'a' and 'd', which had no entry in `val`

Unnamed: 0,year,state,pop,debt
a,2002,Ohio,1.5,
b,2001,Ohio,1.7,-1.2
c,2002,Ohio,3.6,1.5
d,2001,Nevada,2.4,
e,2002,Nevada,2.9,-1.7


### missing value(NA) 처리

In [104]:
# Drop every row that contains at least one missing value (defaults spelled out).
# The result is only displayed, not assigned, so `frame` still holds all five rows.
frame.dropna(axis=0, how='any')

Unnamed: 0,year,state,pop,debt
b,2001,Ohio,1.7,-1.2
c,2002,Ohio,3.6,1.5
e,2002,Nevada,2.9,-1.7


In [107]:
# Replace every missing value with 0.5; the result is displayed, not assigned.
frame.fillna(value=0.5)

Unnamed: 0,year,state,pop,debt
a,2002,Ohio,1.5,0.5
b,2001,Ohio,1.7,-1.2
c,2002,Ohio,3.6,1.5
d,2001,Nevada,2.4,0.5
e,2002,Nevada,2.9,-1.7


### columns or rows 제거

In [52]:
# Remove the row labelled 'a'; `index=` says "row" without needing the axis number.
frame.drop(index='a')

Unnamed: 0,year,state,pop,debt
b,2001,Ohio,1.7,-1.2
c,2002,Ohio,3.6,1.5
d,2001,Nevada,2.4,
e,2002,Nevada,2.9,-1.7


In [53]:
# Remove the 'debt' column; the result is displayed only, `frame` is not reassigned.
frame.drop(columns='debt')

Unnamed: 0,year,state,pop
a,2002,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,3.6
d,2001,Nevada,2.4
e,2002,Nevada,2.9


In [54]:
# Persist the removal by rebinding `frame` to the returned DataFrame.
# errors='ignore' keeps this cell idempotent: running it a second time, when
# 'debt' is already gone, is a no-op instead of raising KeyError.
frame = frame.drop(columns='debt', errors='ignore')

### indexing, slicing and operation

### (1) access data columns

In [77]:
frame["year"] # select one column by name; the row labels a-e are kept as the index

a    2002
b    2001
c    2002
d    2001
e    2002
Name: year, dtype: int64

In [78]:
print(type(frame["year"]))  # a single column label yields a Series

<class 'pandas.core.series.Series'>


In [79]:
frame[['year', 'state']]  # a list of column labels selects several columns at once

Unnamed: 0,year,state
a,2002,Ohio
b,2001,Ohio
c,2002,Ohio
d,2001,Nevada
e,2002,Nevada


In [80]:
print(type(frame[['year', 'state']]))  # a list of labels yields a DataFrame, not a Series

<class 'pandas.core.frame.DataFrame'>


### (2) access data rows

In [81]:
frame.iloc[0] # first row by integer position, returned as a Series indexed by column name

year     2002
state    Ohio
pop       1.5
Name: a, dtype: object

In [82]:
frame.iloc[:3]  # first three rows by position (the end position is excluded)

Unnamed: 0,year,state,pop
a,2002,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,3.6


In [99]:
frame.loc['a':'c']  # label-based slice: unlike iloc, the end label 'c' is included

Unnamed: 0,year,state,pop
a,2002,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,3.6


### (3) slice data with column and index

In [67]:
frame.iloc[:2, :3]  # positional slice: first two rows, first three columns

Unnamed: 0,year,state,pop
a,2002,Ohio,1.5
b,2001,Ohio,1.7


In [98]:
# Select rows 'a' through 'c' (inclusive) and the two columns in one .loc call.
# A single indexer avoids the intermediate frame that chained indexing creates.
frame.loc['a':'c', ['state', 'year']]

Unnamed: 0,state,year
a,Ohio,2002
b,Ohio,2001
c,Ohio,2002
