In [1]:
import numpy as np
import pandas as pd

# Numpy 教學
## 1. Numpy array 建立

In [2]:
Zeros = np.zeros((3, 3))
Ones = np.ones((3, 3))
Empty = np.empty((3, 3))
Arange = np.arange(0, 12, 3)
Line = np.linspace(0, 12, 3)
Reshape = Zeros.reshape(-1)
print(Zeros, Ones, Empty, Arange, Line, Reshape, sep = '\n\n')

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]

[[0.0e+000 0.0e+000 0.0e+000]
 [0.0e+000 0.0e+000 8.5e-321]
 [0.0e+000 0.0e+000 0.0e+000]]

[0 3 6 9]

[ 0.  6. 12.]

[0. 0. 0. 0. 0. 0. 0. 0. 0.]


## 2. Numpy 計算

In [3]:
arr = np.arange(0, 12).reshape((3, 4))
sum1 = np.sum(arr, axis = 0)
sum2 = np.sum(arr, axis = 1)
sum3 = np.sum(arr, axis = (0, 1))
sum4 = np.sum(arr, axis = (1, 0))
sum5 = np.sum(arr)
print(arr, '\n')
print(f'sum1, sum2, sum3, sum4, sum5 = ({sum1}, {sum2}, {sum3}, {sum4}, {sum5})')

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]] 

sum1, sum2, sum3, sum4, sum5 = ([12 15 18 21], [ 6 22 38], 66, 66, 66)


In [4]:
print(sum1 + sum1, sum1 - sum1, sum1 * 3, sum1 / 2, sep = '\n\n')
# sum1 + sum2

[24 30 36 42]

[0 0 0 0]

[36 45 54 63]

[ 6.   7.5  9.  10.5]


In [5]:
print(arr == 5, arr > 5, arr <5, sep = '\n\n')

[[False False False False]
 [False  True False False]
 [False False False False]]

[[False False False False]
 [False False  True  True]
 [ True  True  True  True]]

[[ True  True  True  True]
 [ True False False False]
 [False False False False]]


## 3. Numpy IO

In [28]:
np.save('arr.npy', arr)
arr = np.load('arr.npy')

# Pandas 教學
## 1. Pandas 型別
* Series
* Dataframe

In [6]:
# ------------- Series -------------

# From numpy array
series1 = pd.Series(sum1)

# From Python's list
series2 = pd.Series([1, 2, 3, 4])

print(series1, series2, sep = '\n\n')

0    12
1    15
2    18
3    21
dtype: int32

0    1
1    2
2    3
3    4
dtype: int64


In [7]:
# ------------- Dataframe -------------
dates = pd.date_range('20210101',periods = 6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns = [f"col{i}" for i in range(4)])
print(df, '\n')
print(df.index, df.columns, df.describe(), sep = '\n\n')

                col0      col1      col2      col3
2021-01-01  2.387573 -0.742045  0.332856  0.935185
2021-01-02 -0.499635  0.511394  1.654337 -0.108647
2021-01-03  0.912221  0.635797  0.208728 -0.172415
2021-01-04  0.792097  0.721581 -0.250233 -0.941386
2021-01-05 -0.116520  0.441091  1.154207  0.200366
2021-01-06 -0.152512 -0.122539 -0.907870 -1.874231 

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06'],
              dtype='datetime64[ns]', freq='D')

Index(['col0', 'col1', 'col2', 'col3'], dtype='object')

           col0      col1      col2      col3
count  6.000000  6.000000  6.000000  6.000000
mean   0.553871  0.240880  0.365338 -0.326855
std    1.058859  0.565422  0.928375  0.970999
min   -0.499635 -0.742045 -0.907870 -1.874231
25%   -0.143514  0.018368 -0.135493 -0.749143
50%    0.337788  0.476243  0.270792 -0.140531
75%    0.882190  0.604696  0.948869  0.123113
max    2.387573  0.721581  1.654337  0.935185


## 2. 索引
* simple
* loc
* iloc
* ix(deprecated in 0.20.0 version)

In [8]:
# simple(順序沒差)
print('------- simple demo -------')
print(df['col1'], df[['col1', 'col2']], df[:'2021-01-02']['col1'], df['col1'][:'2021-01-02'], sep = '\n\n')
print()

# loc (先 row 後 column)
print('------- loc demo -------')
print(df.loc[:'2021-01-02']['col1'], sep = '\n\n')
print()
# df['col1']

# iloc (先 row 後 column)
print('------- iloc demo -------')
print(df.iloc[:2][:3], sep = '\n\n')
print()

# !! ix had deprecated in Pandas == 0.20.0 !!
# ix (先 row 後 column)
# print(df.ix[:2][:col3], sep = '\n\n')
# print()


------- simple demo -------
2021-01-01   -0.742045
2021-01-02    0.511394
2021-01-03    0.635797
2021-01-04    0.721581
2021-01-05    0.441091
2021-01-06   -0.122539
Freq: D, Name: col1, dtype: float64

                col1      col2
2021-01-01 -0.742045  0.332856
2021-01-02  0.511394  1.654337
2021-01-03  0.635797  0.208728
2021-01-04  0.721581 -0.250233
2021-01-05  0.441091  1.154207
2021-01-06 -0.122539 -0.907870

2021-01-01   -0.742045
2021-01-02    0.511394
Freq: D, Name: col1, dtype: float64

2021-01-01   -0.742045
2021-01-02    0.511394
Freq: D, Name: col1, dtype: float64

------- loc demo -------
2021-01-01   -0.742045
2021-01-02    0.511394
Freq: D, Name: col1, dtype: float64

------- iloc demo -------
                col0      col1      col2      col3
2021-01-01  2.387573 -0.742045  0.332856  0.935185
2021-01-02 -0.499635  0.511394  1.654337 -0.108647



## 3. 合併與連接
- pandas.concat
- pandas.merge

In [14]:
# ------------- concat demo -------------
dates = pd.date_range('20210101',periods = 6)
tmp = pd.DataFrame(np.random.randn(6, 4), index=dates, columns = [f"col{i}" for i in range(2, 6)])
print(f'df: \n{df}\n')
print(f'tmp: \n{tmp}\n')

# ------- axis demo -------
print('------- axis demo -------')
df1 = pd.concat([df, tmp], axis = 1)
df2 = pd.concat([df, tmp], axis = 0)
print(f'axis = 1: \n{df1}\n')
print(f'axis = 0: \n{df2}\n')

# ------- join demo -------
print('------- join demo --------')
df1 = pd.concat([df, tmp], join = 'outer') # 聯集
df2 = pd.concat([df, tmp], join = 'inner') # 交集
print(f'join = outer: \n{df1}\n')
print(f'join = inner: \n{df2}\n')

df: 
                col0      col1      col2      col3
2021-01-01  2.387573 -0.742045  0.332856  0.935185
2021-01-02 -0.499635  0.511394  1.654337 -0.108647
2021-01-03  0.912221  0.635797  0.208728 -0.172415
2021-01-04  0.792097  0.721581 -0.250233 -0.941386
2021-01-05 -0.116520  0.441091  1.154207  0.200366
2021-01-06 -0.152512 -0.122539 -0.907870 -1.874231

tmp: 
                col2      col3      col4      col5
2021-01-01 -0.261600 -1.983739 -1.092057 -0.949890
2021-01-02 -1.017222  1.352584 -1.157505 -0.515708
2021-01-03 -0.013923  0.052556  0.562654  0.045690
2021-01-04 -0.555140 -0.679610 -0.399681  0.960551
2021-01-05  0.705528 -1.213308 -1.060807 -0.301329
2021-01-06 -1.535210  1.340594 -0.060831  0.187853

------- axis demo -------
axis = 1: 
                col0      col1      col2      col3      col2      col3  \
2021-01-01  2.387573 -0.742045  0.332856  0.935185 -0.261600 -1.983739   
2021-01-02 -0.499635  0.511394  1.654337 -0.108647 -1.017222  1.352584   
2021-01-03  0.

In [16]:
True in np.asarray(df1.isnull())

True

In [17]:
df.isnull()

Unnamed: 0,col0,col1,col2,col3
2021-01-01,False,False,False,False
2021-01-02,False,False,False,False
2021-01-03,False,False,False,False
2021-01-04,False,False,False,False
2021-01-05,False,False,False,False
2021-01-06,False,False,False,False


In [18]:
# ------- merge -------
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                      'key2': ['K0', 'K1', 'K0', 'K1'],
                      'A': ['A0', 'A1', 'A2', 'A3'],
                      'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                       'key2': ['K0', 'K0', 'K0', 'K0'],
                       'C': ['C0', 'C1', 'C2', 'C3'],
                       'D': ['D0', 'D1', 'D2', 'D3']})

print(f'left:\n{left}\n')


print(f'right:\n{right}\n')

# -------- demo how -------
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(f'how = inner:\n{res}\n')

res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(f'how = outer:\n{res}\n')

res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(f'how = left:\n{res}\n')

res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(f'how = right:\n{res}\n')

left:
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3

right:
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3

how = inner:
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2

how = outer:
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3

how = left:
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN

how = right:
  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3



## 4. 統計函數

In [21]:
df.mean()

a   -0.159736
b    0.374111
c    0.106586
d    0.008795
e    0.385977
dtype: float64

In [23]:
df = pd.DataFrame(np.random.randn(10, 5), columns=["a", "b", "c", "d", "e"])

# ------- 平均數 -------
print('------- 平均數 -------')
print (f"mean: \n{df.mean()}\n")

# ------- 標準差 -------
print('------- 標準差 -------')
print (f"std: \n{df.std()}\n")

# ------- 共變異數 -------
print('------- 共變異數 -------')
print (f"cov: \n{df.cov()}\n")

# Cov(a, b)
print (f"Cov(a, b) = {df['a'].cov(df['b'])}\n")

# Cov(all)
print (f"Cov(all): \n{df.cov()}\n")

# ------- 相關係數 -------
print('------- 相關係數 -------')
# Corr(a, b)
print (f"Corr(a, b) = {df['a'].corr(df['b'])}\n")

# Corr(all)
print (f"Corr(all): \n{df.corr()}\n")



------- 平均數 -------
mean: 
a   -0.214512
b   -0.293930
c    0.207071
d    0.426830
e    0.634168
dtype: float64

------- 標準差 -------
std: 
a    1.143991
b    1.165180
c    0.978917
d    1.212997
e    1.005530
dtype: float64

------- 共變異數 -------
cov: 
          a         b         c         d         e
a  1.308714  0.038922 -0.322195  0.345232 -0.290702
b  0.038922  1.357644  0.648490  0.430947 -0.792059
c -0.322195  0.648490  0.958279  0.038230 -0.464309
d  0.345232  0.430947  0.038230  1.471363  0.243782
e -0.290702 -0.792059 -0.464309  0.243782  1.011090

Cov(a, b) = 0.038922167716719705

Cov(all): 
          a         b         c         d         e
a  1.308714  0.038922 -0.322195  0.345232 -0.290702
b  0.038922  1.357644  0.648490  0.430947 -0.792059
c -0.322195  0.648490  0.958279  0.038230 -0.464309
d  0.345232  0.430947  0.038230  1.471363  0.243782
e -0.290702 -0.792059 -0.464309  0.243782  1.011090

------- 相關係數 -------
Corr(a, b) = 0.02919992023619405

Corr(all): 
          

## 5. 其他
- IO(Input & Output)
- Something about nan

In [40]:
# -------- IO -------
df = pd.read_csv('homework.csv')
# df.to_csv(index = False)

In [45]:
# ------- Something about nan -------
df = pd.read_csv('homework.csv')

# drop 
df1 = df.dropna(axis = 0, how = 'any')

# 補零
df2 = df.fillna(value = 0)

# 用一個 dictionary 填補
df3 = df.fillna(value = {f"col{i}" : i for i in range(df.shape[1])})

# 判斷 df 裡是否有 nan
True in np.asarray(df.isnull())

True