In [1]:
import pandas as pd
area_dict = {'California': 423967, 'Texas' : 695662, 'New York' : 141297, 'Florida':170312}
population_dict = {'California':38332521, 'Texas': 26448193, 'New York' : 19651127, 'Florida':19552860}
area = pd.Series(area_dict)
population = pd.Series(population_dict)
data = pd.DataFrame({'population':population, 'area':area})

In [2]:
data

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312


In [3]:
data.values

array([[38332521,   423967],
       [26448193,   695662],
       [19651127,   141297],
       [19552860,   170312]], dtype=int64)

In [4]:
data.T

Unnamed: 0,California,Texas,New York,Florida
population,38332521,26448193,19651127,19552860
area,423967,695662,141297,170312


In [5]:
data

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312


In [6]:
data.values[0]

array([38332521,   423967], dtype=int64)

In [7]:
data2 = data.T
data2.values[0]

array([38332521, 26448193, 19651127, 19552860], dtype=int64)

In [8]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Name: area, dtype: int64

In [9]:
data.iloc[:3,:2]

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297


In [10]:
data.ix[:3,:'area']
# iloc와 loc의 hybrid 방식. 

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297


In [11]:
data['density']= data['population']/data['area']

In [12]:
data

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121


In [13]:
data.loc[data.density>100,['population','density']]

Unnamed: 0,population,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [14]:
data.density>100

California    False
Texas         False
New York       True
Florida        True
Name: density, dtype: bool

In [15]:
import numpy as np
x = np.arange(10)
x[x>4]
x>4

array([False, False, False, False, False,  True,  True,  True,  True,
        True])

In [16]:
x[[False, False, False, False, False,  True,  True,  True,  True,
        True]]

array([5, 6, 7, 8, 9])

In [17]:
data[[False,False,True,True]] #bool type이 index로 기능할 수 있다는 것을 알게됨. 

Unnamed: 0,population,area,density
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121


# 유니버설 함수: Index 보존

In [18]:
import pandas as pd
import numpy as np

In [19]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [20]:
df = pd.DataFrame(rng.randint(0,10,(3,4)), columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [21]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [22]:
np.sin(df*np.pi/4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [23]:
#NaN 값을 사용하지 않길 원하는 경우
A = pd.Series([2,4,6], index=[0,1,2])
B = pd.Series([1,3,5], index=[1,2,3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [24]:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [25]:
A = pd.DataFrame(rng.randint(0,20,(2,2)), columns=list('AB'))
A

Unnamed: 0,A,B
0,1,11
1,5,1


In [26]:
B = pd.DataFrame(rng.randint(0,10,(3,3)), columns = list('BAC'))
B


Unnamed: 0,B,A,C
0,4,0,9
1,5,8,0
2,9,2,6


In [27]:
A + B 

Unnamed: 0,A,B,C
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [28]:
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,1.0,15.0,13.5
1,13.0,6.0,4.5
2,6.5,13.5,10.5


In [32]:
A.stack()

0  A     1
   B    11
1  A     5
   B     1
dtype: int32

In [33]:
B.stack()

0  B    4
   A    0
   C    9
1  B    5
   A    8
   C    0
2  B    9
   A    2
   C    6
dtype: int32

In [34]:
A.mean()

A    3.0
B    6.0
dtype: float64

# DataFrame과 Series간의 연산

In [35]:
A = rng. randint(10,size=(3,4))
A

array([[3, 8, 2, 4],
       [2, 6, 4, 8],
       [6, 1, 3, 8]])

In [36]:
A - A[0]

array([[ 0,  0,  0,  0],
       [-1, -2,  2,  4],
       [ 3, -7,  1,  4]])

In [37]:
# Pandas에서도 연산 규칙이 기본적으로 행 방향으로 적용된다.
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-1,-2,2,4
2,3,-7,1,4


In [38]:
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,-5,0,-6,-4
1,-4,0,-2,2
2,5,0,2,7


In [39]:
halfrow = df.iloc[0,::2]
halfrow

Q    3
S    2
Name: 0, dtype: int32

In [40]:
type(halfrow)

pandas.core.series.Series

# 누락된 데이터 처리하기

In [41]:
import numpy as np
import pandas as pd

In [42]:
vals1 = np.array([1, None, 3,4])
vals1

array([1, None, 3, 4], dtype=object)

In [44]:
for dtype in ['object','int']:
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum
    print()


dtype = object
48.2 ms ± 7.25 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype = int
2.49 ms ± 47.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)



In [45]:
vals1.sum()
#이 코드는 정수와 None의 덧셈이 정의돼 있지 않다는 사실을 보여준다.

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [46]:
vals2=np.array([1,np.nan,3,4])
vals2.dtype
# Numpy가 이 배열에 대해 기본 부동 소수점 타입을 선택했다는 사실에 주목 - 곧 vals1 객체 배열과는 달리 이 배열은 컴파일된 코드에 삽입된
# 빠른 연산을 지원한다는 뜻

dtype('float64')

In [48]:
# 어떤 연산이든 상관없이 NaN이 포함된 산술 연산의 결과는 또 다른 NaN이 된다. 
print(1 + np.nan)
print(0*np.nan)

nan
nan


In [49]:
vals2.sum(), vals2.min(), vals2.max()

  return umr_minimum(a, axis, None, out, keepdims, initial)
  return umr_maximum(a, axis, None, out, keepdims, initial)


(nan, nan, nan)

In [50]:
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)
# 누락된 값을 무시하는 특별한 집계연산

(8.0, 1.0, 4.0)

In [52]:
pd.Series([1,np.nan,2,None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [54]:
x = pd.Series(range(2),dtype=int)
x

0    0
1    1
dtype: int32

In [55]:
x[0] = None
x

0    NaN
1    1.0
dtype: float64

# 널 값 연산하기

In [56]:
data = pd.Series([1, np.nan, 'hello', None])


In [57]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [58]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [59]:
data.dropna()

0        1
2    hello
dtype: object

In [61]:
df = pd.DataFrame([[1,np.nan,2],
                  [2,3,5],
                  [np.nan,4,6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [62]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [63]:
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [64]:
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [65]:
df.dropna(axis='columns',how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [66]:
df.dropna(axis='rows',thresh=3)
# 첫번째와 마지막 행이 사라지는데, 거기에서 단 두개의 값만이 널 값이 아니기 때문

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [67]:
# 널 값 채우기
data = pd.Series([1,np.nan,2,None,3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [68]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [69]:
data.fillna(method='ffill') #이전값으로 채우기

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [72]:
data.fillna(method='bfill') #다음에 오는 값으로 채우기

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [73]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [74]:
df.fillna(method='ffill', axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


# 다중 인덱스된 Series

In [75]:
# 나쁜방식
index = [('California', 2000), ('California',2010),('New York',2000), ('New York',2010), ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956, 18976457, 19378102, 20851820, 25145561]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [80]:
#편리함은 슬라이싱, 인덱싱까지. 2010년 값을 찾으려면 이렇게 해야함
pop[[i for i in pop.index if i[1]==2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [79]:
pop.index

Index([('California', 2000), ('California', 2010),   ('New York', 2000),
         ('New York', 2010),      ('Texas', 2000),      ('Texas', 2010)],
      dtype='object')

In [82]:
pop[[1]]

(California, 2010)    37253956
dtype: int64