In [1]:
import numpy as np, pandas as pd

In [13]:
# Universal functions (ufuncs): index preservation.
# A seeded generator keeps the sample data below reproducible.
rng = np.random.RandomState(42)

# Four random integers in [0, 10) as a Series with the default integer index.
ser = pd.Series(rng.randint(low=0, high=10, size=4))
print(ser, "\n")

# A 3x4 frame of random integers in [0, 10) with columns A, B, C, D.
df = pd.DataFrame(rng.randint(low=0, high=10, size=(3, 4)), columns=list("ABCD"))
print(df)

0    6
1    3
2    7
3    4
dtype: int32 

   A  B  C  D
0  6  9  2  6
1  7  4  3  7
2  7  2  5  4


In [16]:
# A NumPy ufunc applied to a pandas object returns a pandas object that keeps
# the original index (and, for a DataFrame, the column labels).
exp_of_ser = np.exp(ser)
print(exp_of_ser, "\n")

# Multiply every cell by pi and divide by 4 before taking the sine.
scaled_df = df.mul(np.pi).div(4)
print(np.sin(scaled_df))

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64 

          A             B         C             D
0 -1.000000  7.071068e-01  1.000000 -1.000000e+00
1 -0.707107  1.224647e-16  0.707107 -7.071068e-01
2 -0.707107  1.000000e+00 -0.707107  1.224647e-16


In [23]:
# Universal functions (ufuncs): index alignment.
# Series index alignment: binary operations match entries by label.
area = pd.Series(
    [1723337, 695662, 423967],
    index=["Alaska", "Texas", "California"],
    name="area",
)

population = pd.Series(
    [38332521, 26448193, 19651127],
    index=["California", "Texas", "New York"],
    name="population",
)

for labelled in (area, population):
    print(labelled, "\n")

# Labels present in only one operand come out as NaN (Not a Number).
population / area

Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64 

California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64 



Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [30]:
# Two Series whose integer indexes only partly overlap.
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])

# The result of A + B is indexed by the union of A.index and B.index.
union_index = A.index.union(B.index)
print(union_index, "\n")

# Labels missing from either operand become NaN in the sum.
A.add(B)

Int64Index([0, 1, 2, 3], dtype='int64') 



0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [33]:
# Pick an explicit stand-in for entries missing from one operand by using
# add(..., fill_value=...) instead of the + operator.
A.add(other=B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [54]:
# DataFrame index alignment: row labels and column labels are both aligned.
A = pd.DataFrame(rng.randint(low=0, high=20, size=(2, 2)), columns=["A", "B"])
print(A, "\n")

B = pd.DataFrame(rng.randint(low=0, high=10, size=(3, 3)), columns=["B", "A", "C"])
print(B, "\n")
print(A.add(B))

# Explicit fill for the missing entries: the mean of every value in A.
all_values_of_A = A.stack()
fill = all_values_of_A.mean()
A.add(B, fill_value=fill)

   A   B
0  5  15
1  2  19 

   B  A  C
0  3  2  9
1  2  2  3
2  6  3  8 

     A     B   C
0  7.0  18.0 NaN
1  4.0  21.0 NaN
2  NaN   NaN NaN


Unnamed: 0,A,B,C
0,7.0,18.0,19.25
1,4.0,21.0,13.25
2,13.25,16.25,18.25


In [60]:
# ufuncs between a DataFrame and a Series.
# Start with the plain NumPy case: a 3x4 array of random integers in [0, 10).
A = rng.randint(0, 10, size=(3, 4))
print(A)

# A[0] is the first row; broadcasting subtracts it from every row of A.
first_row = A[0]
A - first_row

[[3 5 7 3]
 [2 8 2 8]
 [1 1 1 5]]


array([[ 0,  0,  0,  0],
       [-1,  3, -5,  5],
       [-2, -4, -6,  2]])

In [88]:
# The same values as a DataFrame with columns Q, R, S, T.
df = pd.DataFrame(A, columns=["Q", "R", "S", "T"])
print(df, "\n")

# Row-wise: df.iloc[0] (the first row) is subtracted from every row.
print(df.sub(df.iloc[0]), "\n")

# Column-wise: pass the axis so column R is subtracted from every column.
print(df.sub(df["R"], axis="index"), "\n")

# Automatic index alignment: take every other entry of the first row.
halfrow = df.iloc[0, ::2]
print(halfrow)

# Columns that halfrow does not contain come out as NaN.
df.sub(halfrow)

   Q  R  S  T
0  3  5  7  3
1  2  8  2  8
2  1  1  1  5 

   Q  R  S  T
0  0  0  0  0
1 -1  3 -5  5
2 -2 -4 -6  2 

   Q  R  S  T
0 -2  0  2 -2
1 -6  0 -6  0
2  0  0  0  4 

Q    3
S    7
Name: 0, dtype: int32


Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-1.0,,-5.0,
2,-2.0,,-6.0,


In [96]:
# Handling missing data (null, NaN, NA)

# None: Python's missing-data sentinel
vals1 = np.array([1, None, 3, 4])
vals1 # dtype = object (Python objects -> operations run at the Python level)

for dtype in ["object", "int"]:
    print("dtype=", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()
    print() # object dtype is slower than int dtype

dtype= object
42 ms ± 1.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype= int
1.7 ms ± 20.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)



In [97]:
# None: aggregations fail on an array that contains None.
# The TypeError is the point of this cell, so it is caught and printed rather
# than left to propagate; an uncaught exception would stop "Run All" here and
# none of the later cells would execute.
try:
    vals1.sum()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [117]:
# NaN: the marker for missing numeric data.
vals2 = np.array((1, np.nan, 3, 4))

# With a NaN among the values, NumPy gives the array a floating-point dtype.
vals2.dtype

dtype('float64')

In [118]:
# NaN behaves like a data virus: it infects whatever it touches, so any
# arithmetic that involves NaN evaluates to NaN.
print(1 + np.nan)
print(0 * np.nan)
print(*(getattr(vals2, agg)() for agg in ("sum", "min", "max")))

# Aggregations that ignore missing values: nansum, nanmin, nanmax.
print(*(nan_safe(vals2) for nan_safe in (np.nansum, np.nanmin, np.nanmax)))

nan
nan
nan nan nan
8.0 1.0 4.0


In [129]:
# NaN and None in pandas: the two are handled compatibly and converted to one
# another where appropriate.
mixed_missing = pd.Series([1, np.nan, 2, None])
print(mixed_missing, "\n")

# np.nan is a floating-point value, so start from an integer Series.
x = pd.Series([0, 1], dtype=int)
print(x, "\n")

# Assigning None stores NaN, and the Series is upcast to float to hold it.
# NOTE(review): this relies on pandas upcasting implicitly on assignment;
# confirm the installed pandas version still permits it.
x[0] = None
print(x)

# In pandas, string data is stored with dtype object
# (TODO confirm for pandas versions that provide dedicated string dtypes).

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64 

0    0
1    1
dtype: int32 

0    NaN
1    1.0
dtype: float64


In [142]:
# Operating on null values.
# Relevant methods: isnull(), notnull(), dropna(), fillna()

# Detecting null values
data = pd.Series([1, np.nan, "hello", None])

null_mask = data.isnull()
print(null_mask, "\n")  # boolean mask: True where the entry is null
print(data.loc[data.notnull()], "\n")  # boolean masking keeps the non-null entries

0    False
1     True
2    False
3     True
dtype: bool 

0        1
2    hello
dtype: object 



In [170]:
# Dropping null values.
print(data.dropna(), "\n")  # Series: null entries are removed

df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, 5],
                   [np.nan, 4, 6]])
print(df, "\n")

# By default dropna() drops every row that contains a null;
# axis="columns" (or axis=1) drops columns instead.
print(df.dropna(), "\n")  # drop rows with any null value
print(df.dropna(axis="columns"), "\n")  # drop columns with any null value

# The `how` parameter ("any" or "all"): add an all-null column to show the difference.
df[3] = np.nan

print(df, "\n")
print(df.dropna(axis="columns", how="all"), "\n")  # drop only columns that are entirely null
print(df.dropna(axis="columns", how="any"), "\n")  # drop columns containing at least one null
# thresh is the minimum number of NON-null values a row needs in order to be
# kept; here only the row with three non-null values survives.
print(df.dropna(axis="index", thresh=3))

0        1
2    hello
dtype: object 

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6 

     0    1  2
1  2.0  3.0  5 

   2
0  2
1  5
2  6 

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN 

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6 

   2
0  2
1  5
2  6 

     0    1  2   3
1  2.0  3.0  5 NaN


In [184]:
# Filling null values.
data = pd.DataFrame([1, np.nan, 2, None, 3], index=list("abcde"))
print(data, "\n")

# Replace every null with 0.
print(data.fillna(0), "\n")

# Forward-fill: each null takes the previous value.
# ffill() is used instead of the deprecated fillna(method="ffill").
print(data.ffill(), "\n")

# Back-fill: each null takes the next value.
# bfill() is used instead of the deprecated fillna(method="bfill").
print(data.bfill(), "\n")

# Choosing the fill axis: axis=1 forward-fills along each row, so a null takes
# the value to its left (a leading null in a row stays null).
# ffill(axis=1) is used instead of the deprecated fillna(method="ffill", axis=1).
print(df, "\n")
print(df.ffill(axis=1), "\n")

     0
a  1.0
b  NaN
c  2.0
d  NaN
e  3.0 

     0
a  1.0
b  0.0
c  2.0
d  0.0
e  3.0 

     0
a  1.0
b  1.0
c  2.0
d  2.0
e  3.0 

     0
a  1.0
b  2.0
c  2.0
d  3.0
e  3.0 

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN 

     0    1    2    3
0  1.0  1.0  2.0  2.0
1  2.0  3.0  5.0  5.0
2  NaN  4.0  6.0  6.0 

