In [4]:
import pandas as pd
import numpy as np

In [4]:
# series 는 생성자
# 리스트로 시리즈 생성
sr = pd.Series([1,2,3,4,5])
sr

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [5]:
# 튜플로 시리즈 생성
sr = pd.Series((1,2,3,4,5))
sr

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
# 딕셔너리로 시리즈 생성
sr = pd.Series({'a':1,'b':2,'c':3,'d':4,'e':5})
sr

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [4]:
# numpy 배열로 시리즈 생성
import numpy as np
sr = pd.Series(np.array([1,2,3,4,5]))
sr

0    1
1    2
2    3
3    4
4    5
dtype: int32

In [11]:
# 기존 시리즈로 시리즈 생성
sr_01 = pd.Series([1,2,3,4,5])
sr_02 = pd.Series(sr_01)
sr_02

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [14]:
sr = pd.Series(data=[1,2,3,4,5], index=['a','b','c','d','e'])
sr

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [21]:
# 시리즈 이름 지정
sr_01 = pd.Series(data=[1,2,3,4,5], index=['a','b','c','d','e'], name='Group_01')
sr_02 = pd.Series(data=[6,7,8,9,10], index=['a','b','c','d','e'], name='Group_02')

df = pd.concat([sr_01,sr_02], axis=1)
df

Unnamed: 0,Group_01,Group_02
a,1,6
b,2,7
c,3,8
d,4,9
e,5,10


In [5]:
arr = np.array([1,2,3,4,5])
sr = pd.Series(arr, copy=False)

sr[0] = 999
sr[1] = 999

print(arr, '\n')
print(sr)

[999 999   3   4   5] 

0    999
1    999
2      3
3      4
4      5
dtype: int32


In [6]:
lst = [1,2,3,4,5]
sr = pd.Series(lst, copy=False)

sr[0] = 999
sr[1] = 999

print(lst, '\n')
print(sr)

[1, 2, 3, 4, 5] 

0    999
1    999
2      3
3      4
4      5
dtype: int64


# 함수

In [65]:
# 시리즈의 매개변수
sr = pd.Series([1,2,3,4,5], index=['a','b','c','d','e'])
sr2 = pd.Series([np.nan,2,3,4,5], index=['a','b','c','d','e'])
sr.index
sr.values
sr.shape
sr.ndim
sr.dtype
sr.nbytes # 메모리 용량
sr2.hasnans  # nan  not a number
sr.empty

False

In [17]:
sr = pd.Series([10,20,30,40], index=['A','B','C','D'])
sr

A    10
B    20
C    30
D    40
dtype: int64

In [20]:
sr[0]
sr.iloc[0]

10

In [24]:
print(sr[[0,2,3]])
print(sr[0:2])
print(sr.iloc[2:])

A    10
C    30
D    40
dtype: int64
A    10
B    20
dtype: int64
C    30
D    40
dtype: int64


In [25]:
sr.loc['A']

10

In [26]:
sr.loc['A':'C']

A    10
B    20
C    30
dtype: int64

In [28]:
sr[sr > 20] # 조건도 가능함

C    30
D    40
dtype: int64

In [34]:
# 데이터 추가
data = [1,2,3,4,5]
idx = ['A', 'B','C','D','E']

sr = pd.Series(data, index=idx)
sr.loc['F'] = 6 # 추가
sr.loc['B'] = 6 # 변경
sr


A    1
B    6
C    3
D    4
E    5
F    6
dtype: int64

In [37]:
data = [1,2,3,4,5]
idx = ['A', 'B','C','D','E']

sr = pd.Series(data, index=idx)
sr.drop(['B', 'D'])


A    1
C    3
E    5
dtype: int64

In [19]:
# 구분자가 쉼표가 아니라 공백일 경우
df = pd.read_csv('for_test.csv', encoding='cp949', sep=' ')

In [20]:
df = pd.read_csv('for_test.csv', encoding='cp949')
df

Unnamed: 0,연도,동해,남해,서해,전체
0,1996,17.4629,17.2288,14.436,15.9067
1,1997,17.4116,17.4092,14.8248,16.1526
2,1998,17.5944,18.011,15.2512,16.6044
3,1999,18.1495,18.3175,14.8979,16.6284
4,2000,17.9288,18.1766,15.0504,16.6178


In [21]:
# 라벨 중 원하는 것을 인덱스로 넣기

df = pd.read_csv('for_test.csv', encoding='cp949', index_col='연도')
df

Unnamed: 0_level_0,동해,남해,서해,전체
연도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996,17.4629,17.2288,14.436,15.9067
1997,17.4116,17.4092,14.8248,16.1526
1998,17.5944,18.011,15.2512,16.6044
1999,18.1495,18.3175,14.8979,16.6284
2000,17.9288,18.1766,15.0504,16.6178


In [22]:
df.loc[1997:1999]

Unnamed: 0_level_0,동해,남해,서해,전체
연도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1997,17.4116,17.4092,14.8248,16.1526
1998,17.5944,18.011,15.2512,16.6044
1999,18.1495,18.3175,14.8979,16.6284


In [23]:
df.to_csv('for_saving_test.csv', encoding='cp949')

# 오후

In [38]:
# 리스트를 이용한 데이터 프레임 생성
data_list = [['John', 25, 'New York'],
        ['Emma', 28, 'Paris'],
        ['Peter', 30, 'London'],
        ['Lisa', 27, 'Sydney']]

In [39]:
df = pd.DataFrame(data_list, columns=['Name', 'Age', 'City'])
df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Emma,28,Paris
2,Peter,30,London
3,Lisa,27,Sydney


In [44]:
# 넘파이 배열을 이용한 데이터 프레임 생성
data_arr = np.array([['John', 25, 'New York'],
                    ['Emma', 28, 'Paris'],
                    ['Peter', 30, 'London'],
                    ['Lisa', 27, 'Sydney']])
type(data_arr)

numpy.ndarray

In [45]:
df = pd.DataFrame(data_arr, columns=['Name', 'Age', 'City'])
df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Emma,28,Paris
2,Peter,30,London
3,Lisa,27,Sydney


In [49]:
# 딕셔너리를 이용한 데이터 프레임 생성 / key 값이 열 이름
data_dict = {'Name':['John', 'Emma','Peter','Lisa'],
             'Age':[25,28,30,27],
             'City':['New York','Paris','London', 'Sydney']}


In [48]:
df = pd.DataFrame(data_dict)
df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Emma,28,Paris
2,Peter,30,London
3,Lisa,27,Sydney


In [53]:
# 중첩 딕셔너리 바깥 key는 열 이름이 되고, 안쪽 key는 인덱스가 된다
data_double_dict = {'Name':{'a':'John', 'b':'Emma','c':'Peter','d':'Lisa'},
             'Age':{'a':25,'b':28,'c':30,'d':27},
             'City':{'a':'New York','b':'Paris','c':'London', 'd':'Sydney'}}

In [54]:
df = pd.DataFrame(data_double_dict)
df

Unnamed: 0,Name,Age,City
a,John,25,New York
b,Emma,28,Paris
c,Peter,30,London
d,Lisa,27,Sydney


In [57]:
name = pd.Series(['John', 'Emma','Peter','Lisa'])
age = pd.Series([25,28,30,27])
city = pd.Series(['New York','Paris','London', 'Sydney'])

In [58]:
df = pd.DataFrame({'Name':name, 'Age':age, 'City':city})
df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Emma,28,Paris
2,Peter,30,London
3,Lisa,27,Sydney


In [59]:
sr_01 = pd.Series([1,2,3,4,5], name='sr_01')
sr_02 = pd.Series(['a','b','c','d','e'], name='sr_02')

* 잘못된 병합 방법

In [62]:
# 잘못된 데이터 프레임 병합
df = pd.DataFrame([sr_01, sr_02])
df

Unnamed: 0,0,1,2,3,4
sr_01,1,2,3,4,5
sr_02,a,b,c,d,e


In [63]:
sr_01 = pd.Series([1,2,3], name='sr_01')
sr_02 = pd.Series([4,5,6], name='sr_02')
sr_03 = pd.Series([7,8,9], name='sr_03')

In [64]:
df = pd.concat([sr_01, sr_02, sr_03], axis=1)
df

Unnamed: 0,sr_01,sr_02,sr_03
0,1,4,7
1,2,5,8
2,3,6,9


In [70]:
data_dict = {'Name':['John', 'Emma','Peter','Lisa'],
             'Age':[25,28,30,27],
             'City':['New York','Paris','London', 'Sydney']}


In [71]:
# index 매개변수 사용
df = pd.DataFrame(data_dict, index=['A', 'B', 'C', 'D'])
df

Unnamed: 0,Name,Age,City
A,John,25,New York
B,Emma,28,Paris
C,Peter,30,London
D,Lisa,27,Sydney


In [74]:
df = pd.DataFrame(data_dict, index=['A', 'B', 'C', 'D'])
df.columns = ['이름','나이','도시']
df

Unnamed: 0,이름,나이,도시
A,John,25,New York
B,Emma,28,Paris
C,Peter,30,London
D,Lisa,27,Sydney


In [85]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(df.shape)
print(df.ndim)
print(df.values)
df.columns
df.index
df.size
df.dtypes # 자료형


(3, 2)
2
[[1 4]
 [2 5]
 [3 6]]


A    int64
B    int64
dtype: object

In [87]:
KTX_data = {'경부선 KTX': [39060, 39896, 42005, 43621, 41702, 41266, 32427],
            '호남선 KTX': [7313, 6967, 6873, 6626, 8675, 10622, 9228],
            '경전선 KTX': [3627, 4168, 4088, 4424, 4606, 4984, 5570],
            '전라선 KTX': [309, 1771, 1954, 2244, 3146, 3945, 5766],
            '동해선 KTX': [np.nan,np.nan, np.nan, np.nan, 2395, 3786, 6667]}

index_list = ['2011', '2012', '2013', '2014', '2015', '2016', '2017']

In [114]:
df_ktx = pd.DataFrame(KTX_data, index=index_list)
df_ktx

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [90]:
df_ktx.columns

Index(['경부선 KTX', '호남선 KTX', '경전선 KTX', '전라선 KTX', '동해선 KTX'], dtype='object')

In [99]:
df_ktx.loc['2014':'2016']

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0


In [103]:
df_ktx['경부선 KTX'][2:5]

2013    42005
2014    43621
2015    41702
Name: 경부선 KTX, dtype: int64

In [104]:
df_ktx['호남선 KTX']['2016']

10622

In [105]:
df_ktx['호남선 KTX'][5]

10622

In [106]:
df_ktx['호남선 KTX'].loc['2016']

10622

In [107]:
df_ktx.loc['2016']['호남선 KTX']

10622.0

In [108]:
df_ktx.T

Unnamed: 0,2011,2012,2013,2014,2015,2016,2017
경부선 KTX,39060.0,39896.0,42005.0,43621.0,41702.0,41266.0,32427.0
호남선 KTX,7313.0,6967.0,6873.0,6626.0,8675.0,10622.0,9228.0
경전선 KTX,3627.0,4168.0,4088.0,4424.0,4606.0,4984.0,5570.0
전라선 KTX,309.0,1771.0,1954.0,2244.0,3146.0,3945.0,5766.0
동해선 KTX,,,,,2395.0,3786.0,6667.0


In [122]:
df_ktx[['동해선 KTX','호남선 KTX','경전선 KTX','전라선 KTX','경부선 KTX']]

Unnamed: 0,동해선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,경부선 KTX
2011,,7313,3627,309,39060
2012,,6967,4168,1771,39896
2013,,6873,4088,1954,42005
2014,,6626,4424,2244,43621
2015,2395.0,8675,4606,3146,41702
2016,3786.0,10622,4984,3945,41266
2017,6667.0,9228,5570,5766,32427


# 연산

In [123]:
sr_01 = pd.Series([1,2,3,4,5])
sr_02 = pd.Series([6,7,8,9,10])
sr_01 + sr_02

0     7
1     9
2    11
3    13
4    15
dtype: int64

In [124]:
sr_01 = pd.Series([1,2,3,4,5])
sr_02 = pd.Series([6,7,8,9])
sr_01 + sr_02

0     7.0
1     9.0
2    11.0
3    13.0
4     NaN
dtype: float64

In [125]:
table_01 = {'A':[1,2,3,4,5],
            'B':[6,7,8,9,10],
            'C':[11,12,13,14,15]}

In [134]:
table_02 = {'A':[11,12,13],
            'B':[16,17,18],
            'C':[21,22,23]}

In [135]:
df_01 = pd.DataFrame(table_01)
df_02 = pd.DataFrame(table_02)
df_01 + df_02

Unnamed: 0,A,B,C
0,12.0,22.0,32.0
1,14.0,24.0,34.0
2,16.0,26.0,36.0
3,,,
4,,,


# 열 추가

In [148]:
data_dict = {'Name':['John', 'Emma','Peter','Lisa'],
             'Age':[25,28,30,27],
             'City':['New York','Paris','London', 'Sydney']}

In [149]:
df = pd.DataFrame(data_dict)
df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Emma,28,Paris
2,Peter,30,London
3,Lisa,27,Sydney


In [150]:
df.insert(3, 'Brith Year', [1998, 1992, 2001, 1993])
df

Unnamed: 0,Name,Age,City,Brith Year
0,John,25,New York,1998
1,Emma,28,Paris,1992
2,Peter,30,London,2001
3,Lisa,27,Sydney,1993


In [147]:
df['Brith day'] = [1998, 1995, 2001, 1993]
df

Unnamed: 0,Name,Age,City,Brith day
0,John,25,New York,1998
1,Emma,28,Paris,1995
2,Peter,30,London,2001
3,Lisa,27,Sydney,1993


# 행 추가

In [154]:
df

Unnamed: 0,Name,Age,City,Brith Year
0,John,25,New York,1998
1,Emma,28,Paris,1992
2,Peter,30,London,2001
3,Lisa,27,Sydney,1993


In [159]:
new_row = {'Name':'Kim', 'Age': 17, 'City': 'ChangWon', 'Brith Year': 2002}
df = df.append(new_row, ignore_index=True)
df

AttributeError: 'DataFrame' object has no attribute 'append'

In [153]:
df.drop(1)

Unnamed: 0,Name,Age,City,Brith Year
0,John,25,New York,1998
2,Peter,30,London,2001
3,Lisa,27,Sydney,1993
