In [1]:
import numpy as np
import pandas as pd

# Numpy 教學
## 1. Numpy array 建立

In [2]:
Zeros = np.zeros((3, 3))
Ones = np.ones((3, 3))
Empty = np.empty((3, 3))
Arange = np.arange(0, 12, 3)
Line = np.linspace(0, 12, 3)
Reshape = Zeros.reshape(-1)
print(Zeros, Ones, Empty, Arange, Line, Reshape, sep = '\n\n')

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]

[[0.00000000e+000 0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 5.11852009e-321]
 [2.46600324e-154 4.47593775e-091 5.98149672e-154]]

[0 3 6 9]

[ 0.  6. 12.]

[0. 0. 0. 0. 0. 0. 0. 0. 0.]


## 2. Numpy 計算

In [3]:
arr = np.arange(0, 12).reshape((3, 4))
sum1 = np.sum(arr, axis = 0)
sum2 = np.sum(arr, axis = 1)
sum3 = np.sum(arr, axis = (0, 1))
sum4 = np.sum(arr, axis = (1, 0))
sum5 = np.sum(arr)
print(arr, '\n')
print(f'sum1, sum2, sum3, sum4, sum5 = ({sum1}, {sum2}, {sum3}, {sum4}, {sum5})')

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]] 

sum1, sum2, sum3, sum4, sum5 = ([12 15 18 21], [ 6 22 38], 66, 66, 66)


In [4]:
print(sum1 + sum1, sum1 - sum1, sum1 * 3, sum1 / 2, sep = '\n\n')
# sum1 + sum2

[24 30 36 42]

[0 0 0 0]

[36 45 54 63]

[ 6.   7.5  9.  10.5]


In [5]:
print(arr == 5, arr > 5, arr <5, sep = '\n\n')

[[False False False False]
 [False  True False False]
 [False False False False]]

[[False False False False]
 [False False  True  True]
 [ True  True  True  True]]

[[ True  True  True  True]
 [ True False False False]
 [False False False False]]


## 3. Numpy IO

In [6]:
np.save('arr.npy', arr)
arr = np.load('arr.npy')

# Pandas 教學
## 1. Pandas 型別
* Series
* Dataframe

In [7]:
# ------------- Series -------------

# From numpy array
series1 = pd.Series(sum1)

# From Python's list
series2 = pd.Series([1, 2, 3, 4])

print(series1, series2, sep = '\n\n')

0    12
1    15
2    18
3    21
dtype: int32

0    1
1    2
2    3
3    4
dtype: int64


In [8]:
# ------------- Dataframe -------------
dates = pd.date_range('20210101',periods = 6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns = [f"col{i}" for i in range(4)])
print(df, '\n')
print(df.index, df.columns, df.describe(), sep = '\n\n')

                col0      col1      col2      col3
2021-01-01  0.389247  2.672891 -0.423461  0.284688
2021-01-02  1.138336  0.735363 -0.744718 -0.191010
2021-01-03 -1.260030  0.393373  0.592346 -0.186350
2021-01-04  0.319281  0.791297 -0.490501  0.061331
2021-01-05  0.335734 -0.386365 -1.347404  1.166252
2021-01-06  0.343221  0.351542  0.562547 -1.815466 

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06'],
              dtype='datetime64[ns]', freq='D')

Index(['col0', 'col1', 'col2', 'col3'], dtype='object')

           col0      col1      col2      col3
count  6.000000  6.000000  6.000000  6.000000
mean   0.210965  0.759683 -0.308532 -0.113426
std    0.787454  1.027243  0.759762  0.973128
min   -1.260030 -0.386365 -1.347404 -1.815466
25%    0.323394  0.362000 -0.681164 -0.189845
50%    0.339477  0.564368 -0.456981 -0.062509
75%    0.377740  0.777313  0.316045  0.228849
max    1.138336  2.672891  0.592346  1.166252


## 2. 索引
* simple
* loc
* iloc
* ix(deprecated in 0.20.0 version)

In [9]:
# simple(順序沒差)
print('------- simple demo -------')
print(df['col1'], df[['col1', 'col2']], df[:'2021-01-02']['col1'], df['col1'][:'2021-01-02'], sep = '\n\n')
print()

# loc (先 row 後 column)
print('------- loc demo -------')
print(df.loc[:'2021-01-02']['col1'], sep = '\n\n')
print()
# df['col1']

# iloc (先 row 後 column)
print('------- iloc demo -------')
print(df.iloc[:2][:3], sep = '\n\n')
print()

# !! ix had deprecated in Pandas == 0.20.0 !!
# ix (先 row 後 column)
# print(df.ix[:2][:col3], sep = '\n\n')
# print()


------- simple demo -------
2021-01-01    2.672891
2021-01-02    0.735363
2021-01-03    0.393373
2021-01-04    0.791297
2021-01-05   -0.386365
2021-01-06    0.351542
Freq: D, Name: col1, dtype: float64

                col1      col2
2021-01-01  2.672891 -0.423461
2021-01-02  0.735363 -0.744718
2021-01-03  0.393373  0.592346
2021-01-04  0.791297 -0.490501
2021-01-05 -0.386365 -1.347404
2021-01-06  0.351542  0.562547

2021-01-01    2.672891
2021-01-02    0.735363
Freq: D, Name: col1, dtype: float64

2021-01-01    2.672891
2021-01-02    0.735363
Freq: D, Name: col1, dtype: float64

------- loc demo -------
2021-01-01    2.672891
2021-01-02    0.735363
Freq: D, Name: col1, dtype: float64

------- iloc demo -------
                col0      col1      col2      col3
2021-01-01  0.389247  2.672891 -0.423461  0.284688
2021-01-02  1.138336  0.735363 -0.744718 -0.191010



## 3. 合併與連接
- pandas.concat
- pandas.merge

In [10]:
# ------------- concat demo -------------
dates = pd.date_range('20210101',periods = 6)
tmp = pd.DataFrame(np.random.randn(6, 4), index=dates, columns = [f"col{i}" for i in range(2, 6)])
print(f'df: \n{df}\n')
print(f'tmp: \n{tmp}\n')

# ------- axis demo -------
print('------- axis demo -------')
df1 = pd.concat([df, tmp], axis = 1)
df2 = pd.concat([df, tmp], axis = 0)
print(f'axis = 1: \n{df1}\n')
print(f'axis = 0: \n{df2}\n')

# ------- join demo -------
print('------- join demo --------')
df1 = pd.concat([df, tmp], join = 'outer') # 聯集
df2 = pd.concat([df, tmp], join = 'inner') # 交集
print(f'join = outer: \n{df1}\n')
print(f'join = inner: \n{df2}\n')

df: 
                col0      col1      col2      col3
2021-01-01  0.389247  2.672891 -0.423461  0.284688
2021-01-02  1.138336  0.735363 -0.744718 -0.191010
2021-01-03 -1.260030  0.393373  0.592346 -0.186350
2021-01-04  0.319281  0.791297 -0.490501  0.061331
2021-01-05  0.335734 -0.386365 -1.347404  1.166252
2021-01-06  0.343221  0.351542  0.562547 -1.815466

tmp: 
                col2      col3      col4      col5
2021-01-01  0.112921  0.968978 -0.427311 -0.265568
2021-01-02 -0.300447 -0.186672  1.255382  1.379092
2021-01-03 -0.308274  0.042361 -0.139584 -0.470782
2021-01-04 -0.390318  0.041332 -1.505120  1.192882
2021-01-05 -0.911402  1.236349  0.182561  0.434086
2021-01-06  0.755051 -0.638576  0.394323  0.229014

------- axis demo -------
axis = 1: 
                col0      col1      col2      col3      col2      col3  \
2021-01-01  0.389247  2.672891 -0.423461  0.284688  0.112921  0.968978   
2021-01-02  1.138336  0.735363 -0.744718 -0.191010 -0.300447 -0.186672   
2021-01-03 -1.

In [11]:
# ------- merge -------
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                      'key2': ['K0', 'K1', 'K0', 'K1'],
                      'A': ['A0', 'A1', 'A2', 'A3'],
                      'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                       'key2': ['K0', 'K0', 'K0', 'K0'],
                       'C': ['C0', 'C1', 'C2', 'C3'],
                       'D': ['D0', 'D1', 'D2', 'D3']})

print(f'left:\n{left}\n')


print(f'right:\n{right}\n')

# -------- demo how -------
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(f'how = inner:\n{res}\n')

res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(f'how = outer:\n{res}\n')

res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(f'how = left:\n{res}\n')

res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(f'how = right:\n{res}\n')

left:
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3

right:
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3

how = inner:
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2

how = outer:
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3

how = left:
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN

how = right:
  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3



## 4. 統計函數

In [12]:
df = pd.DataFrame(np.random.randn(10, 5), columns=["a", "b", "c", "d", "e"])

# ------- 平均數 -------
print('------- 平均數 -------')
print (f"mean: \n{df.mean()}\n")

# ------- 標準差 -------
print('------- 標準差 -------')
print (f"std: \n{df.std()}\n")

# ------- 共變異數 -------
print('------- 共變異數 -------')

# Cov(a, b)
print (f"Cov(a, b) = {df['a'].cov(df['b'])}\n")

# Cov(all)
print (f"Cov(all): \n{df.cov()}\n")

# ------- 相關係數 -------
print('------- 相關係數 -------')
# Corr(a, b)
print (f"Corr(a, b) = {df['a'].corr(df['b'])}\n")

# Corr(all)
print (f"Corr(all): \n{df.corr()}\n")



------- 平均數 -------
mean: 
a   -0.034391
b    0.189421
c   -0.064412
d   -0.127408
e    0.168284
dtype: float64

------- 標準差 -------
std: 
a    0.871396
b    0.879194
c    0.592128
d    0.953212
e    0.475951
dtype: float64

------- 共變異數 -------
Cov(a, b) = 0.19876609993619887

Cov(all): 
          a         b         c         d         e
a  0.759331  0.198766 -0.197810 -0.616806 -0.032563
b  0.198766  0.772983 -0.334733 -0.412980  0.056339
c -0.197810 -0.334733  0.350616  0.247575  0.052164
d -0.616806 -0.412980  0.247575  0.908613 -0.020683
e -0.032563  0.056339  0.052164 -0.020683  0.226530

------- 相關係數 -------
Corr(a, b) = 0.25944288890836653

Corr(all): 
          a         b         c         d         e
a  1.000000  0.259443 -0.383368 -0.742580 -0.078514
b  0.259443  1.000000 -0.642980 -0.492781  0.134635
c -0.383368 -0.642980  1.000000  0.438633  0.185094
d -0.742580 -0.492781  0.438633  1.000000 -0.045590
e -0.078514  0.134635  0.185094 -0.045590  1.000000



## 5. 其他
- IO(Input & Output)
- Something about nan

In [13]:
# -------- IO -------
df = pd.read_csv('homework.csv')
# df.to_csv(index = False)

FileNotFoundError: [Errno 2] No such file or directory: 'homework.csv'

In [None]:
# ------- Something about nan -------
df = pd.read_csv('homework.csv')

# drop 
df1 = df.dropna(axis = 0, how = 'any')

# 補零
df2 = df.fillna(value = 0)

# 用一個 dictionary 填補
df3 = df.fillna(value = {f"col{i}" : i for i in range(df.shape[1])})

# 判斷 df 裡是否有 nan
True in np.asarray(df.isnull())