# Pandas
- 분석하려는 데이터는 Series 또는 DataFrame(표형식의 테이블)
- Series -> numpy의 1차원 배열과 비슷(각 데이터의 의미를 표시하는 인덱스가 존재)
- Series : index + value

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import warnings
warnings.filterwarnings(action='ignore')

print('numpy version : ', np.__version__)
print('pandas version : ', np.__version__)

numpy version :  1.19.2
pandas version :  1.19.2


In [4]:
def seriesInfo(ary) : 
    print('data \n', ary)
    print('index : ', type(ary.index), ary.index)
    print('value : ', type(ary.values), ary.values)

In [5]:
ary = pd.Series([1,2,3,4,5, 'jskim'], dtype=np.object)
seriesInfo(ary)

data 
 0        1
1        2
2        3
3        4
4        5
5    jskim
dtype: object
index :  <class 'pandas.core.indexes.range.RangeIndex'> RangeIndex(start=0, stop=6, step=1)
value :  <class 'numpy.ndarray'> [1 2 3 4 5 'jskim']


In [6]:
ary = pd.Series({'a':1, 'b':2, 'c':3})
seriesInfo(ary)

data 
 a    1
b    2
c    3
dtype: int64
index :  <class 'pandas.core.indexes.base.Index'> Index(['a', 'b', 'c'], dtype='object')
value :  <class 'numpy.ndarray'> [1 2 3]


- 인덱스의 라벨은 정수, 문자, 날짜, 시간

In [7]:
ary = pd.Series([1,2,3,4,5],
               index = ['강남', '서초', '방배', '동작', '신도림'])
seriesInfo(ary)

data 
 강남     1
서초     2
방배     3
동작     4
신도림    5
dtype: int64
index :  <class 'pandas.core.indexes.base.Index'> Index(['강남', '서초', '방배', '동작', '신도림'], dtype='object')
value :  <class 'numpy.ndarray'> [1 2 3 4 5]


- name, index.name 속성이 존재한다.
- name -> 시리즈 데이터에 이름을 붙여주는 역할
- index.name -> 시리즈 인덱스에 이름을 붙여주는 역할

In [9]:
ary.name = '순서'
ary.index.name = '역 구분'
seriesInfo(ary)

data 
 역 구분
강남     1
서초     2
방배     3
동작     4
신도림    5
Name: 순서, dtype: int64
index :  <class 'pandas.core.indexes.base.Index'> Index(['강남', '서초', '방배', '동작', '신도림'], dtype='object', name='역 구분')
value :  <class 'numpy.ndarray'> [1 2 3 4 5]


In [11]:
# tuple 타입을 이용한 시리즈 객체 생성
tup_data = ('김지수', '2021-08-31', '남', False)
ary = pd.Series(tup_data, dtype=np.object, index = ['이름', '생년월일','성별','결혼여부'])
seriesInfo(ary)

data 
 이름             김지수
생년월일    2021-08-31
성별               남
결혼여부         False
dtype: object
index :  <class 'pandas.core.indexes.base.Index'> Index(['이름', '생년월일', '성별', '결혼여부'], dtype='object')
value :  <class 'numpy.ndarray'> ['김지수' '2021-08-31' '남' False]


**series에서 원소를 선택하기 위해서는?**
- 정수형 위치 인덱스를 활용하는 방법
- 인덱스의 이름을 활용하는 방법

**여러 원소를 선택하기**
- 리스트를 활용하면 된다.

In [29]:
print('정수 인덱스 : ', ary[0], type(ary[0]))
print('이름 인덱스 : ', ary['이름'], type(ary['이름']))

print()
print('정수 인덱스 : \n')
seriesInfo(ary[[0,1]])

print()
print('인덱스 이름 : \n')
seriesInfo(ary[['이름','성별']])

print()
print('범위 지정 : \n')
seriesInfo(ary[0:3])

정수 인덱스 :  김지수 <class 'str'>
이름 인덱스 :  김지수 <class 'str'>

정수 인덱스 : 

data 
 이름             김지수
생년월일    2021-08-31
dtype: object
index :  <class 'pandas.core.indexes.base.Index'> Index(['이름', '생년월일'], dtype='object')
value :  <class 'numpy.ndarray'> ['김지수' '2021-08-31']

인덱스 이름 : 

data 
 이름    김지수
성별      남
dtype: object
index :  <class 'pandas.core.indexes.base.Index'> Index(['이름', '성별'], dtype='object')
value :  <class 'numpy.ndarray'> ['김지수' '남']

범위 지정 : 

data 
 이름             김지수
생년월일    2021-08-31
성별               남
dtype: object
index :  <class 'pandas.core.indexes.base.Index'> Index(['이름', '생년월일', '성별'], dtype='object')
value :  <class 'numpy.ndarray'> ['김지수' '2021-08-31' '남']


In [17]:
for idx, value in ary.items() :
    print('idx : {}, value : {}'.format(idx, value))
print()

for idx in ary.keys() : 
    print('idx : {}'.format(idx))
print()

for value in ary.values : 
    print('value : {}'.format(value))

idx : 이름, value : 김지수
idx : 생년월일, value : 2021-08-31
idx : 성별, value : 남
idx : 결혼여부, value : False

idx : 이름
idx : 생년월일
idx : 성별
idx : 결혼여부

value : 김지수
value : 2021-08-31
value : 남
value : False


In [30]:
ary.index

Index(['이름', '생년월일', '성별', '결혼여부'], dtype='object')

In [31]:
ary.values

array(['김지수', '2021-08-31', '남', False], dtype=object)

In [32]:
ary = pd.Series(range(10, 21))
seriesInfo(ary)

data 
 0     10
1     11
2     12
3     13
4     14
5     15
6     16
7     17
8     18
9     19
10    20
dtype: int64
index :  <class 'pandas.core.indexes.range.RangeIndex'> RangeIndex(start=0, stop=11, step=1)
value :  <class 'numpy.ndarray'> [10 11 12 13 14 15 16 17 18 19 20]


In [33]:
# 벡터화 연산도 가능하다
seriesInfo(ary*10)

data 
 0     100
1     110
2     120
3     130
4     140
5     150
6     160
7     170
8     180
9     190
10    200
dtype: int64
index :  <class 'pandas.core.indexes.range.RangeIndex'> RangeIndex(start=0, stop=11, step=1)
value :  <class 'numpy.ndarray'> [100 110 120 130 140 150 160 170 180 190 200]


In [34]:
# 2의 배수인 것만 출력한다면?
# boolean indexing : ary%2 == 0
ary[ary%2 == 0]

0     10
2     12
4     14
6     16
8     18
10    20
dtype: int64

In [4]:
from datetime import date, datetime, timedelta
from dateutil.parser import parse

day = datetime(2021, 8, 31)
print(day+timedelta(days=1))

2021-09-01 00:00:00


- 평균이 50이고 편차 5인 정규분포 데이터를 10일간 만들고 싶다면?

In [42]:
factory01 = pd.Series( [ x for x in np.random.normal(50, 5, (10, )) ],
                      index = [day+timedelta(days=d) for d in range(10)] )
# seriesInfo(factory01)
factory02 = pd.Series( [x for x in np.random.normal(70, 8, (10,))],
                        index = [day+timedelta(days=d) for d in range(10)])

In [43]:
factory01 + factory02

2021-08-31    128.928845
2021-09-01    100.277366
2021-09-02    116.817434
2021-09-03    109.683372
2021-09-04    124.986227
2021-09-05    106.910891
2021-09-06    131.598711
2021-09-07    106.656173
2021-09-08    107.205913
2021-09-09    127.156566
dtype: float64

In [44]:
factory01.index

DatetimeIndex(['2021-08-31', '2021-09-01', '2021-09-02', '2021-09-03',
               '2021-09-04', '2021-09-05', '2021-09-06', '2021-09-07',
               '2021-09-08', '2021-09-09'],
              dtype='datetime64[ns]', freq=None)

In [45]:
factory02.index

DatetimeIndex(['2021-08-31', '2021-09-01', '2021-09-02', '2021-09-03',
               '2021-09-04', '2021-09-05', '2021-09-06', '2021-09-07',
               '2021-09-08', '2021-09-09'],
              dtype='datetime64[ns]', freq=None)

In [48]:
factory01 = pd.Series( [ x for x in np.random.normal(50, 5, (10,))],
                     index = [ day + timedelta(days = d) for d in range(10)])
# seriesInfo(factory01)
temp_day = datetime(2020, 8, 31)
factory02 = pd.Series( [ x for x in np.random.normal(70, 8, (10,))],
                     index = [ day + timedelta(days = d+1) for d in range(10)])

In [49]:
factory01 + factory02

2021-08-31           NaN
2021-09-01    126.760791
2021-09-02    113.088301
2021-09-03    113.807900
2021-09-04    112.195322
2021-09-05    120.482576
2021-09-06    129.070008
2021-09-07    119.157795
2021-09-08    110.016016
2021-09-09    133.267766
2021-09-10           NaN
dtype: float64

- 데이터 갱신, 추가, 삭제

In [51]:
price_series = pd.Series([4000, 3000, 3500, 2000],
                        index = ['a','b', 'c', 'd'])
seriesInfo(price_series)

data 
 a    4000
b    3000
c    3500
d    2000
dtype: int64
index :  <class 'pandas.core.indexes.base.Index'> Index(['a', 'b', 'c', 'd'], dtype='object')
value :  <class 'numpy.ndarray'> [4000 3000 3500 2000]


In [52]:
# 문자인덱스를 이용한 데이터 갱신
price_series['a'] = 5000
seriesInfo(price_series)

data 
 a    5000
b    3000
c    3500
d    2000
dtype: int64
index :  <class 'pandas.core.indexes.base.Index'> Index(['a', 'b', 'c', 'd'], dtype='object')
value :  <class 'numpy.ndarray'> [5000 3000 3500 2000]


In [53]:
# 인덱스를 이용한 데이터 갱신
price_series[0] = 6000
seriesInfo(price_series)

data 
 a    6000
b    3000
c    3500
d    2000
dtype: int64
index :  <class 'pandas.core.indexes.base.Index'> Index(['a', 'b', 'c', 'd'], dtype='object')
value :  <class 'numpy.ndarray'> [6000 3000 3500 2000]


In [54]:
# 문자인덱스를 이용한 데이터 추가
price_series['e'] = 7000
seriesInfo(price_series)

data 
 a    6000
b    3000
c    3500
d    2000
e    7000
dtype: int64
index :  <class 'pandas.core.indexes.base.Index'> Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
value :  <class 'numpy.ndarray'> [6000 3000 3500 2000 7000]


In [55]:
# 삭제
del price_series['e']
seriesInfo(price_series)

data 
 a    6000
b    3000
c    3500
d    2000
dtype: int64
index :  <class 'pandas.core.indexes.base.Index'> Index(['a', 'b', 'c', 'd'], dtype='object')
value :  <class 'numpy.ndarray'> [6000 3000 3500 2000]


In [56]:
price_series['e'] = np.NaN
seriesInfo(price_series)

data 
 a    6000.0
b    3000.0
c    3500.0
d    2000.0
e       NaN
dtype: float64
index :  <class 'pandas.core.indexes.base.Index'> Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
value :  <class 'numpy.ndarray'> [6000. 3000. 3500. 2000.   nan]


In [59]:
# null 찾기
print('isnull : ')
print(pd.isnull(price_series))
print()
# null만 찾아 표시
print(price_series[price_series.isnull()])
print()
# null이 아닌 값 찾고 표시
print('notnull : ')
print(pd.notnull(price_series))
print()
print(price_series[price_series.notnull()])

isnull : 
a    False
b    False
c    False
d    False
e     True
dtype: bool

e   NaN
dtype: float64

notnull : 
a     True
b     True
c     True
d     True
e    False
dtype: bool

a    6000.0
b    3000.0
c    3500.0
d    2000.0
dtype: float64


## DataFrame

- pd.Dataframe()

In [60]:
def frmInfo(df) :
    display('DataFrame : ', df)
    print('shape   : ', df.shape)
    print('size    : ', df.size)
    print('ndim    : ', df.ndim)
    print('index   : ', df.index, type(df.index))
    print('columns : ', df.columns, type(df.columns))
    print('values  : \n', df.values, type(df.values))

In [61]:
# 딕셔너리를 이용한 생성
dict_data = {
    'col01' : [1,2,3],
    'col02' : [4,5,6],
    'col03' : [7,8,9],
    'col04' : [10,11,12],
    'col05' : [13,14,15]
}
temp_df = pd.DataFrame(dict_data)
frmInfo(temp_df)

'DataFrame : '

Unnamed: 0,col01,col02,col03,col04,col05
0,1,4,7,10,13
1,2,5,8,11,14
2,3,6,9,12,15


shape   :  (3, 5)
size    :  15
ndim    :  2
index   :  RangeIndex(start=0, stop=3, step=1) <class 'pandas.core.indexes.range.RangeIndex'>
columns :  Index(['col01', 'col02', 'col03', 'col04', 'col05'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values  : 
 [[ 1  4  7 10 13]
 [ 2  5  8 11 14]
 [ 3  6  9 12 15]] <class 'numpy.ndarray'>


In [64]:
# 리스트를 이용한 생성
print('리스트를 이용한 생성 : ')
temp_df = pd.DataFrame([[20,'M',False], [30, 'F', True]],
                      index = ['김지수', '제시'],
                      columns = ['나이', '성별', '결혼여부'])
frmInfo(temp_df)

리스트를 이용한 생성 : 


'DataFrame : '

Unnamed: 0,나이,성별,결혼여부
김지수,20,M,False
제시,30,F,True


shape   :  (2, 3)
size    :  6
ndim    :  2
index   :  Index(['김지수', '제시'], dtype='object') <class 'pandas.core.indexes.base.Index'>
columns :  Index(['나이', '성별', '결혼여부'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values  : 
 [[20 'M' False]
 [30 'F' True]] <class 'numpy.ndarray'>


In [65]:
print('rename 함수를 이용한 열 이름 변경 : ')
print('원본에 반영하려면 inplace : ')

temp_df.rename(columns = {'나이' : 'age', '성별' : 'sex', '결혼여부' : 'marriage'}, inplace = True)
temp_df.rename(index = {'김지수' : 'jesse kim', '제시' : 'jessie'}, inplace = True)

print()
frmInfo(temp_df)

rename 함수를 이용한 열 이름 변경 : 
원본에 반영하려면 inplace : 



'DataFrame : '

Unnamed: 0,age,sex,marriage
jesse kim,20,M,False
jessie,30,F,True


shape   :  (2, 3)
size    :  6
ndim    :  2
index   :  Index(['jesse kim', 'jessie'], dtype='object') <class 'pandas.core.indexes.base.Index'>
columns :  Index(['age', 'sex', 'marriage'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values  : 
 [[20 'M' False]
 [30 'F' True]] <class 'numpy.ndarray'>


In [67]:
data = {
    "2021" : [9910293, 8384050, 2938485, 1203948],
    "2018" : [8910293, 7384050, 5938485, 3203948],
    "2016" : [7910293, 5384050, 7938485, 6203948],
    "2014" : [5910293, 3384050, 4938485, 4203948],
    "지역" : ['수도권' , '경상권' , '수도권' , '경상권'],
    "증가율" : [0.2343 , 0.0434 , 0.0944 , 0.0034]
}

columns = ['지역', '2014', '2016', '2018', '2021', '증가율']

pop_frm = pd.DataFrame(data,
                      columns = columns,
                      index = ['서울', '부산', '경기', '대구'])
frmInfo(pop_frm)

'DataFrame : '

Unnamed: 0,지역,2014,2016,2018,2021,증가율
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


shape   :  (4, 6)
size    :  24
ndim    :  2
index   :  Index(['서울', '부산', '경기', '대구'], dtype='object') <class 'pandas.core.indexes.base.Index'>
columns :  Index(['지역', '2014', '2016', '2018', '2021', '증가율'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values  : 
 [['수도권' 5910293 7910293 8910293 9910293 0.2343]
 ['경상권' 3384050 5384050 7384050 8384050 0.0434]
 ['수도권' 4938485 7938485 5938485 2938485 0.0944]
 ['경상권' 4203948 6203948 3203948 1203948 0.0034]] <class 'numpy.ndarray'>


In [68]:
pop_frm.index.name = 'city'
pop_frm.columns.name = 'feature'
frmInfo(pop_frm)
print()
print('index name : ', pop_frm.index, pop_frm.index.name)
print('columns name : ', pop_frm.columns, pop_frm.columns.name)

'DataFrame : '

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


shape   :  (4, 6)
size    :  24
ndim    :  2
index   :  Index(['서울', '부산', '경기', '대구'], dtype='object', name='city') <class 'pandas.core.indexes.base.Index'>
columns :  Index(['지역', '2014', '2016', '2018', '2021', '증가율'], dtype='object', name='feature') <class 'pandas.core.indexes.base.Index'>
values  : 
 [['수도권' 5910293 7910293 8910293 9910293 0.2343]
 ['경상권' 3384050 5384050 7384050 8384050 0.0434]
 ['수도권' 4938485 7938485 5938485 2938485 0.0944]
 ['경상권' 4203948 6203948 3203948 1203948 0.0034]] <class 'numpy.ndarray'>

index name :  Index(['서울', '부산', '경기', '대구'], dtype='object', name='city') city
columns name :  Index(['지역', '2014', '2016', '2018', '2021', '증가율'], dtype='object', name='feature') feature


- 열 데이터의 갱신, 추가, 삭제

In [69]:
# 새로운 피처 추가
pop_frm['2014-2016 증가율'] = ((pop_frm['2016'] - pop_frm['2014']) / pop_frm['2014']*100).round(2)
frmInfo(pop_frm)

'DataFrame : '

feature,지역,2014,2016,2018,2021,증가율,2014-2016 증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343,33.84
부산,경상권,3384050,5384050,7384050,8384050,0.0434,59.1
경기,수도권,4938485,7938485,5938485,2938485,0.0944,60.75
대구,경상권,4203948,6203948,3203948,1203948,0.0034,47.57


shape   :  (4, 7)
size    :  28
ndim    :  2
index   :  Index(['서울', '부산', '경기', '대구'], dtype='object', name='city') <class 'pandas.core.indexes.base.Index'>
columns :  Index(['지역', '2014', '2016', '2018', '2021', '증가율', '2014-2016 증가율'], dtype='object', name='feature') <class 'pandas.core.indexes.base.Index'>
values  : 
 [['수도권' 5910293 7910293 8910293 9910293 0.2343 33.84]
 ['경상권' 3384050 5384050 7384050 8384050 0.0434 59.1]
 ['수도권' 4938485 7938485 5938485 2938485 0.0944 60.75]
 ['경상권' 4203948 6203948 3203948 1203948 0.0034 47.57]] <class 'numpy.ndarray'>


In [70]:
# 피처삭제
del pop_frm['2014-2016 증가율']
frmInfo(pop_frm)

'DataFrame : '

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


shape   :  (4, 6)
size    :  24
ndim    :  2
index   :  Index(['서울', '부산', '경기', '대구'], dtype='object', name='city') <class 'pandas.core.indexes.base.Index'>
columns :  Index(['지역', '2014', '2016', '2018', '2021', '증가율'], dtype='object', name='feature') <class 'pandas.core.indexes.base.Index'>
values  : 
 [['수도권' 5910293 7910293 8910293 9910293 0.2343]
 ['경상권' 3384050 5384050 7384050 8384050 0.0434]
 ['수도권' 4938485 7938485 5938485 2938485 0.0944]
 ['경상권' 4203948 6203948 3203948 1203948 0.0034]] <class 'numpy.ndarray'>


In [71]:
frmInfo(pop_frm[['지역', '증가율']])

'DataFrame : '

feature,지역,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1
서울,수도권,0.2343
부산,경상권,0.0434
경기,수도권,0.0944
대구,경상권,0.0034


shape   :  (4, 2)
size    :  8
ndim    :  2
index   :  Index(['서울', '부산', '경기', '대구'], dtype='object', name='city') <class 'pandas.core.indexes.base.Index'>
columns :  Index(['지역', '증가율'], dtype='object', name='feature') <class 'pandas.core.indexes.base.Index'>
values  : 
 [['수도권' 0.2343]
 ['경상권' 0.0434]
 ['수도권' 0.0944]
 ['경상권' 0.0034]] <class 'numpy.ndarray'>


- row indexing(항상 슬라이싱해야 한다)
- 배열, 라벨 인덱싱이 가능하다.

In [72]:
frmInfo(pop_frm)

'DataFrame : '

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


shape   :  (4, 6)
size    :  24
ndim    :  2
index   :  Index(['서울', '부산', '경기', '대구'], dtype='object', name='city') <class 'pandas.core.indexes.base.Index'>
columns :  Index(['지역', '2014', '2016', '2018', '2021', '증가율'], dtype='object', name='feature') <class 'pandas.core.indexes.base.Index'>
values  : 
 [['수도권' 5910293 7910293 8910293 9910293 0.2343]
 ['경상권' 3384050 5384050 7384050 8384050 0.0434]
 ['수도권' 4938485 7938485 5938485 2938485 0.0944]
 ['경상권' 4203948 6203948 3203948 1203948 0.0034]] <class 'numpy.ndarray'>


In [73]:
display(pop_frm[ : 1])
display(pop_frm[ : '서울'])

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343


feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343


In [75]:
display(pop_frm[0:3])
display(pop_frm['서울':'경기'])

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944


feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944


- 개별 인덱싱 (특정행, 특정열)

In [76]:
# frmInfo(pop_frm)
pop_frm['2021'][['서울', '대구']]

city
서울    9910293
대구    1203948
Name: 2021, dtype: int64

**아래 url을 이용한 데이터프레임 생성**
- https://www.kobis.or.kr/kobisopenapi/homepg/apiservice/searchServiceInfo.do?serviceId=searchDailyBoxOffice

In [77]:
import json
import urllib

movie_url     = 'http://kobis.or.kr/kobisopenapi/webservice/rest/boxoffice/searchDailyBoxOfficeList.json?key=f5eef3421c602c6cb7ea224104795888&targetDt=20120101'
response_page = urllib.request.urlopen(movie_url)
print('page - ')
print(page)

json_page = json.loads(response_page.read())
print('json - ' , type(json_page))

print()
# print(json_page)

data = json_page['boxOfficeResult']['dailyBoxOfficeList']
# print('type - '   , type(data))
# print('data - \n' , data)
# rank, movieNm , salesAmt 담는 데이터 프레임을 만든다면?

rank_list  = []
title_list = []
sales_list = []
for tmp_dict in data : 
    rank_list.append(tmp_dict['rank'])
    title_list.append(tmp_dict['movieNm'])
    sales_list.append(tmp_dict['salesAmt'])

movie_data = {
    'rank'  : rank_list  , 
    'title' : title_list ,
    'amt'   : sales_list
}

movie_frm = pd.DataFrame(movie_data)
frmInfo(movie_frm)

# movie_frm = pd.DataFrame(data)
# frmInfo(movie_frm[['rank' , 'movieNm' , 'salesAmt']])


page - 
<http.client.HTTPResponse object at 0x0000021370C74D90>
json -  <class 'dict'>



'DataFrame : '

Unnamed: 0,rank,title,amt
0,1,미션임파서블:고스트프로토콜,2776060500
1,2,마이 웨이,1189058500
2,3,셜록홈즈 : 그림자 게임,1176022500
3,4,퍼펙트 게임,644532000
4,5,프렌즈: 몬스터섬의비밀,436753500
5,6,라이온 킹,507115500
6,7,오싹한 연애,344871000
7,8,극장판 포켓몬스터 베스트 위시「비크티니와 백의 영웅 레시라무」,167809500
8,9,앨빈과 슈퍼밴드3,137030000
9,10,극장판 포켓몬스터 베스트 위시 「비크티니와 흑의 영웅 제크로무」,125535500


shape   :  (10, 3)
size    :  30
ndim    :  2
index   :  RangeIndex(start=0, stop=10, step=1) <class 'pandas.core.indexes.range.RangeIndex'>
columns :  Index(['rank', 'title', 'amt'], dtype='object') <class 'pandas.core.indexes.base.Index'>
values  : 
 [['1' '미션임파서블:고스트프로토콜' '2776060500']
 ['2' '마이 웨이' '1189058500']
 ['3' '셜록홈즈 : 그림자 게임' '1176022500']
 ['4' '퍼펙트 게임' '644532000']
 ['5' '프렌즈: 몬스터섬의비밀 ' '436753500']
 ['6' '라이온 킹' '507115500']
 ['7' '오싹한 연애' '344871000']
 ['8' '극장판 포켓몬스터 베스트 위시「비크티니와 백의 영웅 레시라무」' '167809500']
 ['9' '앨빈과 슈퍼밴드3' '137030000']
 ['10' '극장판 포켓몬스터 베스트 위시 「비크티니와 흑의 영웅 제크로무」' '125535500']] <class 'numpy.ndarray'>


#### booklist_json.json 파일로부터 데이터를 읽어서 들여서 데이터프레임을 만들어 보세요[실습]

- json.loads() : json으로 된 문자열 파일을 읽을 때
- json.load() : json 파일을 읽을 때

In [None]:
with open('./data/booklist_json.json', 'r', encoding='utf-8') as file :
    books = json.load(file)

print('books type : ', type(books))
print(books)
print()
book_frm = pd.DataFrame(books)
display(book_frm)

#### 다음 조건을 만족하는 임의의 데이터프레임을 만들어 보자[실습]

- 열의 갯수와 행의 갯수가 각각 5개 이상이여야 한다.
- 열에는 정수, 문자열, 실수, 날짜 데이터가 각각 1개 이상 포함되어야 한다.

In [2]:
intValue = np.random.randint(1, 100, 10)
print(type(intValue), intValue)

<class 'numpy.ndarray'> [85 73 30 25 34 20 59  9 34 60]


In [3]:
floatValue = np.random.randn(10)
print(type(floatValue), floatValue)

<class 'numpy.ndarray'> [-1.01192288  0.33789516 -1.31667387 -0.49539034  0.81772226  1.05715894
 -0.03002844 -1.67294352  0.53757403 -0.00300527]


In [5]:
strDay = datetime(1021, 9, 1)
years = [strDay + timedelta(day) for day in range(0,10)]
years

[datetime.datetime(1021, 9, 1, 0, 0),
 datetime.datetime(1021, 9, 2, 0, 0),
 datetime.datetime(1021, 9, 3, 0, 0),
 datetime.datetime(1021, 9, 4, 0, 0),
 datetime.datetime(1021, 9, 5, 0, 0),
 datetime.datetime(1021, 9, 6, 0, 0),
 datetime.datetime(1021, 9, 7, 0, 0),
 datetime.datetime(1021, 9, 8, 0, 0),
 datetime.datetime(1021, 9, 9, 0, 0),
 datetime.datetime(1021, 9, 10, 0, 0)]

In [9]:
strValue = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
strValue

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

In [10]:
data = {
    '정수' : intValue,
    '실수' : floatValue,
    '날짜' : years,
    '문자' : strValue
}
test_frm = pd.DataFrame(data)
display(test_frm)

Unnamed: 0,정수,실수,날짜,문자
0,85,-1.011923,1021-09-01 00:00:00,a
1,73,0.337895,1021-09-02 00:00:00,b
2,30,-1.316674,1021-09-03 00:00:00,c
3,25,-0.49539,1021-09-04 00:00:00,d
4,34,0.817722,1021-09-05 00:00:00,e
5,20,1.057159,1021-09-06 00:00:00,f
6,59,-0.030028,1021-09-07 00:00:00,g
7,9,-1.672944,1021-09-08 00:00:00,h
8,34,0.537574,1021-09-09 00:00:00,i
9,60,-0.003005,1021-09-10 00:00:00,j
