In [1]:
import pandas as pd
pd.__version__

'0.23.4'

In [2]:
import numpy as np 

In [3]:
data = pd.Series([0.25,0.5,0.75,1])

In [4]:
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [5]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [6]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
data[1]

0.5

In [8]:
# Slice of the Series; the recorded output shows the entries labelled 1 and 2.
# Note: a pandas Series is said to be far more general and flexible than a one-dimensional NumPy array.
data[1:3]

1    0.50
2    0.75
dtype: float64

In [9]:
data = pd.Series([0.25,0.5,0.75,1.0], index=['a','b','c','d'])

In [10]:
data.dtype

dtype('float64')

In [11]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [12]:
data['b']

0.5

In [13]:
data = pd.Series([0.25,0.5,0.75,1.0], index=[2,3,5,7])


In [14]:
data

2    0.25
3    0.50
5    0.75
7    1.00
dtype: float64

In [15]:
data[5]

0.75

# Series : 특수한 딕셔너리

In [16]:
# A Series can be built directly from a dict: the keys become the index
# labels and the values become the data.
population_dict = dict([
    ('California', 38332521),
    ('Texas', 26448193),
    ('New York', 19651127),
])
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
dtype: int64

In [17]:
# Constructing Series objects: from a plain list, the index defaults to 0..n-1 (see output).
pd.Series([2,4,6])

0    2
1    4
2    6
dtype: int64

In [18]:
pd.Series(5, index=[100,200,300])

100    5
200    5
300    5
dtype: int64

In [19]:
pd.Series({2:'a',1:'b',3:'c'})

2    a
1    b
3    c
dtype: object

In [20]:
pd.Series({2:'a',1:'b',3:'c'}, index=[3,2])

3    c
2    a
dtype: object

# Pandas DataFrame 객체

In [21]:
# DataFrame as a generalized NumPy array.
# Build a second Series (state name -> area value); it is combined with
# `population` into a DataFrame in a later cell.
area_dict = {'California': 423967, 'Texas' : 695662, 'New York' : 141297, 'Florida':170312}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
dtype: int64

In [22]:
states = pd.DataFrame({'population' : population, 'area' : area})

In [23]:
states  # pandas fills the missing entries with NaN (Florida has no population value in the output).

Unnamed: 0,population,area
California,38332521.0,423967
Florida,,170312
New York,19651127.0,141297
Texas,26448193.0,695662


In [24]:
states.index

Index(['California', 'Florida', 'New York', 'Texas'], dtype='object')

In [25]:
states.columns

Index(['population', 'area'], dtype='object')

In [26]:
states['area']

California    423967
Florida       170312
New York      141297
Texas         695662
Name: area, dtype: int64

In [27]:
# Constructing DataFrame objects: from a single Series, naming the resulting column.
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127


In [28]:
# Build the rows as a list of dicts (one dict per row, keys are the column
# names) and pass the list to the DataFrame constructor.
data = [dict(a=1, b=2 * row) for row in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,1,0
1,1,2
2,1,4


In [29]:
data

[{'a': 1, 'b': 0}, {'a': 1, 'b': 2}, {'a': 1, 'b': 4}]

In [30]:
type(data)

list

In [31]:
pd.DataFrame({'population':population,'area':area})

Unnamed: 0,population,area
California,38332521.0,423967
Florida,,170312
New York,19651127.0,141297
Texas,26448193.0,695662


In [32]:
# Review of np.random: a 3x3 array of uniform random floats in [0, 1).
# No seed is set in the cells shown, so the values differ on every run.
np.random.random((3,3)) 

array([[0.06827314, 0.46599793, 0.70742242],
       [0.49827545, 0.89643871, 0.60025462],
       [0.48838632, 0.34723541, 0.21031732]])

In [33]:
# Create a 3x3 array filled with random numbers drawn from a normal
# distribution (mean 0, standard deviation 1).
np.random.normal(0,1,(3,3))

array([[ 0.67504422, -2.44726823,  1.11919137],
       [ 1.77751603, -2.89613106,  0.67372356],
       [ 1.19165848,  0.53071777,  0.63025813]])

In [34]:
# Create a 3x3 array of random integers from 0 up to, but not including, 10.
np.random.randint(0,10,(3,3))

array([[0, 4, 7],
       [4, 7, 0],
       [6, 6, 6]])

In [35]:
np.random.randint(10,size=(3,3))

array([[1, 5, 7],
       [4, 2, 4],
       [4, 2, 7]])

In [36]:
# Array of n uniform random numbers between 0 and 1 (here n = 10).
np.random.rand(10)

array([0.36900631, 0.03852395, 0.9707316 , 0.86312248, 0.8837734 ,
       0.07376819, 0.87039264, 0.83441125, 0.72018243, 0.39596629])

In [37]:
pd.DataFrame(np.random.rand(3,2),columns=['foo','bar'],index=['a','b','c'])

Unnamed: 0,foo,bar
a,0.517213,0.146685
b,0.567158,0.821809
c,0.615965,0.099247


# Pandas Index 객체

In [38]:
ind = pd.Index([2,3,5,7,11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [39]:
ind[1]

3

In [40]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [41]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [42]:
# The elements of an Index cannot be modified: item assignment raises TypeError.
# The error is the point of this demonstration, so it is caught and printed
# rather than left to abort a Restart & Run All of the notebook.
try:
    ind[1] = 0
except TypeError as err:
    print("{}: {}".format(type(err).__name__, err))

TypeError: Index does not support mutable operations

In [43]:
# Index as an ordered set: two integer indexes used by the set operations
# in the cells that follow.
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

In [44]:
# Set intersection: the labels present in both indexes.
# The named method is used instead of the `&` operator because recent pandas
# releases treat `&` on Index objects as an element-wise logical operation,
# not a set operation; .intersection() behaves the same across versions.
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [45]:
# Set union: the labels present in either index.
# The named method is used instead of the `|` operator because recent pandas
# releases treat `|` on Index objects as an element-wise logical operation,
# not a set operation; .union() behaves the same across versions.
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [46]:
# Symmetric difference: the labels that appear in exactly one of the two
# indexes (the union of the two relative complements).
# The named method is used instead of the `^` operator because recent pandas
# releases treat `^` on Index objects as an element-wise logical operation,
# not a set operation; .symmetric_difference() behaves the same across versions.
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

# 데이터 인덱싱과 선택

In [47]:
data = pd.Series([0.25,0.5,0.75,1.0], index = ['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [48]:
data['b']

0.5

In [49]:
'a' in data

True

In [50]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [51]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [52]:
data.items()  # on its own this is a lazy zip object (see output); wrap it in list() to get a list of (label, value) pairs

<zip at 0x2b8c3b2d6c8>

In [53]:
data['e'] = 1.25

In [54]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [55]:
# Series as a one-dimensional array.
# Slicing by explicit index labels: the end label 'c' is included in the result (see output).
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [56]:
data[0:2]

a    0.25
b    0.50
dtype: float64

In [57]:
data[(data>0.3)&(data<0.8)]

b    0.50
c    0.75
dtype: float64

In [58]:
data[['a','e']]

a    0.25
e    1.25
dtype: float64

In [59]:
data = pd.Series(['a','b','c'], index=[1,3,5])
data

1    a
3    b
5    c
dtype: object

In [60]:
# A plain data[1:3] can be confusing here because the index labels are themselves integers.
# .loc always uses the explicit index (labels), while .iloc always uses the
# implicit positional index.
data.loc[1:3]

1    a
3    b
dtype: object

In [61]:
data.iloc[1:3]

3    b
5    c
dtype: object

# DataFrame에서 데이터 선택

In [62]:
# DataFrame as a dictionary: start by re-displaying the `area` Series.
area

California    423967
Texas         695662
New York      141297
Florida       170312
dtype: int64

In [63]:
population

California    38332521
Texas         26448193
New York      19651127
dtype: int64

In [64]:
data = pd.DataFrame({'area':area,'population':population})

In [65]:
data

Unnamed: 0,area,population
California,423967,38332521.0
Florida,170312,
New York,141297,19651127.0
Texas,695662,26448193.0


In [66]:
data['area']

California    423967
Florida       170312
New York      141297
Texas         695662
Name: area, dtype: int64

In [67]:
data.area

California    423967
Florida       170312
New York      141297
Texas         695662
Name: area, dtype: int64

In [68]:
data['density'] = data['population']/data['area']
data

Unnamed: 0,area,population,density
California,423967,38332521.0,90.413926
Florida,170312,,
New York,141297,19651127.0,139.076746
Texas,695662,26448193.0,38.01874


In [69]:
# DataFrame as a two-dimensional array: .values exposes the underlying data
# as a NumPy array (see output).
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [1.70312000e+05,            nan,            nan],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01]])

In [70]:
data.T

Unnamed: 0,California,Florida,New York,Texas
area,423967.0,170312.0,141297.0,695662.0
population,38332520.0,,19651130.0,26448190.0
density,90.41393,,139.0767,38.01874
