In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Pandas
1. pandas는 dict를 확장한 개념이다
> key & value pair
2. 2차원 테이블 형태를 주로 다룬다
3. 주로 컬럼 위주로 처리가 일어난다
4. pandas 데이터에는 항상 컬럼명과 인덱스(행 이름)가 붙는다
5. 데이터타입 종류
    - pd.DataFrame
    - pd.Series

In [5]:
# 1. Dictionary ~~ Pandas
d = {'이름':'홍길동','나이':22,'지역':'서울','부서':'인사부'}
d['지역']

'서울'

In [4]:
# 1-1List ~~ Numpy
l = ['홍길동',22,'서울','인사부']
l[2]

'서울'

In [7]:
# 4. 열, 행 표시
df = pd.DataFrame({'이름':['홍길동','장길산'],'나이':[22,33],'지역':['서울','부산'],'부서':['인사부','개발부']})
df

Unnamed: 0,나이,부서,이름,지역
0,22,인사부,홍길동,서울
1,33,개발부,장길산,부산


#### 열, 행 표시
Pandas는 [열 이름][행 번호]으로 찾으니까 열의 순서는 그닥 중요하지 않음  
Numpy에서는 [행 번호, 열 번호] 즉, Index로 값을 찾기 때문에 순서가 중요함

In [11]:
df['이름'][0]

'홍길동'

In [12]:
df['나이'][1]

33

In [16]:
df2=pd.DataFrame([['홍길동',22,'서울','인사부'],['장길산',33,'부산','개발부']], 
             columns=['이름','나이','지역','부서'], index=[10011,10025])
df2

Unnamed: 0,이름,나이,지역,부서
10011,홍길동,22,서울,인사부
10025,장길산,33,부산,개발부


In [17]:
df2['부서'][10025]

'개발부'

In [22]:
df3 = pd.DataFrame(np.arange(12).reshape(4,3), columns=['a','b','c'])
df3

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [24]:
df3['c'] # 컬럼이 2인 열을 가져옴

0     2
1     5
2     8
3    11
Name: c, dtype: int32

In [25]:
df3.values

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

- Numpy 식으로 인덱싱/슬라이싱 => d.iloc[행번호,열번호]

In [26]:
d = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]], columns=['col1','col2','col3'])
d

Unnamed: 0,col1,col2,col3
0,1,2,3
1,4,5,6
2,7,8,9


In [27]:
d.iloc[1,2], d.iloc[2,1]

(6, 8)

In [28]:
d.iloc[0]

col1    1
col2    2
col3    3
Name: 0, dtype: int64

- 컬럼 하나를 뽑아내면 Series가 된다
- 컬럼명은 Series의 name이 된다

In [31]:
d = pd.DataFrame([['hong',22,'seoul'], ['jang',33,'daejeon'], ['dooley',11,'etc']],
                columns=['name','age','address'])
d

Unnamed: 0,name,age,address
0,hong,22,seoul
1,jang,33,daejeon
2,dooley,11,etc


In [30]:
d.shape

(3, 3)

In [37]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
0    3 non-null object
1    3 non-null int64
2    3 non-null object
dtypes: int64(1), object(2)
memory usage: 152.0+ bytes


In [34]:
d.describe()

Unnamed: 0,age
count,3.0
mean,22.0
std,11.0
min,11.0
25%,16.5
50%,22.0
75%,27.5
max,33.0


In [41]:
d=pd.DataFrame([['hong',22,'Seoul'],['Don',33,'Busan'],['Dully',5,'Univers']])
d

Unnamed: 0,0,1,2
0,hong,22,Seoul
1,Don,33,Busan
2,Dully,5,Univers


In [43]:
d.index=list('abc')
d

Unnamed: 0,0,1,2
a,hong,22,Seoul
b,Don,33,Busan
c,Dully,5,Univers


In [38]:
d=pd.DataFrame([[0,None,2],[3,4,5]])
d

Unnamed: 0,0,1,2
0,0,,2
1,3,4.0,5


In [39]:
d[2][1]=None
d

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2
0,0,,2.0
1,3,4.0,


In [40]:
d.isna()

Unnamed: 0,0,1,2
0,False,True,False
1,False,False,True


#### set_index(), reset_index()

In [45]:
data = pd.DataFrame([['hong',22,'seoul','hong@naver.com'],
                  ['jang',33,'incheon','jang@gmail.com'],
                  ['dooley',11,'etc','dooley@andromeda.uni']],
                columns=['name','age','address','email'])
data

Unnamed: 0,name,age,address,email
0,hong,22,seoul,hong@naver.com
1,jang,33,incheon,jang@gmail.com
2,dooley,11,etc,dooley@andromeda.uni


In [46]:
data2 = data.set_index('name')
data2

Unnamed: 0_level_0,age,address,email
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hong,22,seoul,hong@naver.com
jang,33,incheon,jang@gmail.com
dooley,11,etc,dooley@andromeda.uni


In [47]:
data2.reset_index()

Unnamed: 0,name,age,address,email
0,hong,22,seoul,hong@naver.com
1,jang,33,incheon,jang@gmail.com
2,dooley,11,etc,dooley@andromeda.uni


In [48]:
a = pd.DataFrame(np.arange(9).reshape(3,3))
b = pd.DataFrame(np.ones([3,3]))
display(a,b)

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


Unnamed: 0,0,1,2
0,1.0,1.0,1.0
1,1.0,1.0,1.0
2,1.0,1.0,1.0


In [49]:
a.index=[0,1,99]
a

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
99,6,7,8


In [50]:
a+b

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,5.0,6.0
2,,,
99,,,
