### Pandas

- https://pbpython.com/pandas-release-17.html

In [1]:
from __future__ import print_function
import pandas as pd

# Read the sample sales workbook into a DataFrame, then preview the first
# five rows (the default for head()).
sales_df = pd.read_excel(io='data/df-sample-sales3.xlsx')
sales_df.head(n=5)

Unnamed: 0,account number,name,sku,quantity,unit price,ext price,date
0,740150,Barton LLC,B1-20000,39,86.69,3380.91,2014-01-01 07:21:51
1,714466,Trantow-Barrows,S2-77896,-1,63.16,-63.16,2014-01-01 10:00:47
2,218895,Kulas Inc,B1-69924,23,90.7,2086.1,2014-01-01 13:24:58
3,307599,"Kassulke, Ondricka and Metz",S1-65481,41,21.05,863.05,2014-01-01 15:05:22
4,412290,Jerde-Hilpert,S2-34077,6,83.21,499.26,2014-01-01 23:26:55


我們再次使用簡單的銷售數據來顯示幾個月的購買歷史。下面新增的 prod_group 列取 SKU 的前兩個字元，只是一種根據 SKU 將產品分成不同類別的方法。

現在，讓我們創建一個數據透視表。這裡的關鍵是，數據透視表創建了一個MultiIndex，當我們試圖保存到Excel時，這將導致問題。

In [2]:
# Derive the product group from the first two characters of each SKU
# (e.g. "B1-20000" -> "B1") and store it as a new column on sales_df.
sales_df["prod_group"] = sales_df["sku"].str.slice(0, 2)
sales_df.head(n=5)

Unnamed: 0,account number,name,sku,quantity,unit price,ext price,date,prod_group
0,740150,Barton LLC,B1-20000,39,86.69,3380.91,2014-01-01 07:21:51,B1
1,714466,Trantow-Barrows,S2-77896,-1,63.16,-63.16,2014-01-01 10:00:47,S2
2,218895,Kulas Inc,B1-69924,23,90.7,2086.1,2014-01-01 13:24:58,B1
3,307599,"Kassulke, Ondricka and Metz",S1-65481,41,21.05,863.05,2014-01-01 15:05:22,S1
4,412290,Jerde-Hilpert,S2-34077,6,83.21,499.26,2014-01-01 23:26:55,S2


In [3]:
# Summarise "ext price" and "quantity" per customer (rows) and product group
# (columns). pivot_table aggregates with the mean by default, and combining
# `values` with `columns` produces two-level (MultiIndex) column labels.
sales_summary = sales_df.pivot_table(
    values=["ext price", "quantity"],
    index=["name"],
    columns=["prod_group"],
)
sales_summary

Unnamed: 0_level_0,ext price,ext price,ext price,quantity,quantity,quantity
prod_group,B1,S1,S2,B1,S1,S2
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Barton LLC,1171.640278,1306.2376,1647.78619,20.777778,28.04,28.190476
"Cronin, Oberbrunner and Spencer",1387.616842,1342.598571,1302.787407,26.210526,25.428571,23.740741
"Frami, Hills and Schmidt",1475.132143,1439.311875,1401.317857,25.071429,31.0625,25.142857
"Fritsch, Russel and Anderson",1372.360286,1188.012857,1569.3532,23.714286,26.047619,29.4
"Halvorson, Crona and Champlin",1356.64,1267.756667,983.963158,25.857143,22.333333,17.842105
Herman LLC,1363.706923,980.005,1586.427,25.615385,20.375,27.3
Jerde-Hilpert,1361.323333,1384.032174,1085.909697,23.0,24.217391,20.69697
"Kassulke, Ondricka and Metz",1092.958824,1450.143333,1440.282414,26.235294,26.388889,25.034483
Keeling LLC,1495.175484,1544.883529,1089.263077,24.580645,28.294118,21.653846
Kiehn-Spinka,1330.110833,1067.693846,1376.761034,22.791667,19.961538,23.793103


### Rounding 對數據進行舍入

一個簡單（但很有用）的新功能是為DataFrame增加了一個取整方法 round。在這個版本之前，如果你想對某一列的數據進行舍入，你必須使用np.round，或者用display.float_format來改變顯示格式。現在可以直接控制四捨五入的小數位數，也可以傳入一個以列名為鍵、小數位數為值的字典，只對指定的列四捨五入。這是很有用的。

In [4]:
# Keep only the first five rows from here on; the cells below reuse this
# shortened sales_df. round() returns a new frame for display, so the values
# stored in sales_df keep their original precision.
sales_df = sales_df.iloc[:5]
sales_df.round(decimals={'unit price': 0, 'ext price': 0})

Unnamed: 0,account number,name,sku,quantity,unit price,ext price,date,prod_group
0,740150,Barton LLC,B1-20000,39,87.0,3381.0,2014-01-01 07:21:51,B1
1,714466,Trantow-Barrows,S2-77896,-1,63.0,-63.0,2014-01-01 10:00:47,S2
2,218895,Kulas Inc,B1-69924,23,91.0,2086.0,2014-01-01 13:24:58,B1
3,307599,"Kassulke, Ondricka and Metz",S1-65481,41,21.0,863.0,2014-01-01 15:05:22,S1
4,412290,Jerde-Hilpert,S2-34077,6,83.0,499.0,2014-01-01 23:26:55,S2


In [5]:
# Write the pivot table, including its two-level column labels, to an Excel
# workbook.
sales_summary.to_excel(excel_writer='data/df-sample-sales3-out.xlsx')

### Troubleshooting Merges 解決合併的問題
在我處理和合併數據的經驗中，pandas通常會做我所期望的事情。但合併DataFrame是我容易被絆倒的地方之一：有時我需要想清楚自己真正想要的是左、右、外還是內連接。為了幫助解決這些討厭的問題，merge新增了一個indicator參數，可以幫助你找出合併結果為什麼不符合預期。

在這個例子中，讓我們為數據中的每個產品類別加入一個描述。下面是sales_groups這個DataFrame。

In [6]:
import pandas as pd

# Lookup table mapping each product-group code to a human-readable
# description. It lists B2 and H1 as well as the B1/S1/S2 codes.
dt = dict(
    prod_group=["B1", "B2", "S1", "S2", "H1"],
    Desc=["Belt-Large", "Belt-Medium", "Shirt-Large", "Shirt-Medium", "Hat-Large"],
)

sales_groups = pd.DataFrame(data=dt)
sales_groups

Unnamed: 0,prod_group,Desc
0,B1,Belt-Large
1,B2,Belt-Medium
2,S1,Shirt-Large
3,S2,Shirt-Medium
4,H1,Hat-Large


如果我們想把sales_groups數據與我們的銷售數據合併，我們可以這樣做。

In [7]:
# Left join: keep every sales row and attach the matching description.
# indicator=True adds a `_merge` column reporting which side(s) each row
# came from.
sales_df.merge(sales_groups, how='left', on='prod_group', indicator=True)

Unnamed: 0,account number,name,sku,quantity,unit price,ext price,date,prod_group,Desc,_merge
0,740150,Barton LLC,B1-20000,39,86.69,3380.91,2014-01-01 07:21:51,B1,Belt-Large,both
1,714466,Trantow-Barrows,S2-77896,-1,63.16,-63.16,2014-01-01 10:00:47,S2,Shirt-Medium,both
2,218895,Kulas Inc,B1-69924,23,90.7,2086.1,2014-01-01 13:24:58,B1,Belt-Large,both
3,307599,"Kassulke, Ondricka and Metz",S1-65481,41,21.05,863.05,2014-01-01 15:05:22,S1,Shirt-Large,both
4,412290,Jerde-Hilpert,S2-34077,6,83.21,499.26,2014-01-01 23:26:55,S2,Shirt-Medium,both


讓我們看看當我們做一個右連接（right join）時會發生什麼。

In [8]:
# Right join: keep every product group from sales_groups. Groups with no
# matching sales row come back with missing sales fields and are marked
# `right_only` in the `_merge` column.
sales_df.merge(sales_groups, how='right', on='prod_group', indicator=True)

Unnamed: 0,account number,name,sku,quantity,unit price,ext price,date,prod_group,Desc,_merge
0,740150.0,Barton LLC,B1-20000,39.0,86.69,3380.91,2014-01-01 07:21:51,B1,Belt-Large,both
1,218895.0,Kulas Inc,B1-69924,23.0,90.7,2086.1,2014-01-01 13:24:58,B1,Belt-Large,both
2,,,,,,,,B2,Belt-Medium,right_only
3,307599.0,"Kassulke, Ondricka and Metz",S1-65481,41.0,21.05,863.05,2014-01-01 15:05:22,S1,Shirt-Large,both
4,714466.0,Trantow-Barrows,S2-77896,-1.0,63.16,-63.16,2014-01-01 10:00:47,S2,Shirt-Medium,both
5,412290.0,Jerde-Hilpert,S2-34077,6.0,83.21,499.26,2014-01-01 23:26:55,S2,Shirt-Medium,both
6,,,,,,,,H1,Hat-Large,right_only


In [9]:
# Count how many rows each SKU has in the (five-row) sales_df.
sales_df.loc[:, "sku"].value_counts()

B1-20000    1
S2-77896    1
B1-69924    1
S1-65481    1
S2-34077    1
Name: sku, dtype: int64

In [10]:
# Save the per-SKU row counts (a Series) to an Excel workbook.
sales_df.loc[:, "sku"].value_counts().to_excel(excel_writer="data/df-sample-sales3-outB.xlsx")

### Tabulate 製表
Pandas有強大的選項來導出DataFrame。然而，我曾有過幾次想以更友好的ASCII方式顯示DataFrame的情況。我很驚喜地發現tabulate套件能夠直接處理pandas的DataFrame。

In [11]:
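# Requires the third-party `tabulate` package. Install it once in the kernel's
# environment (then restart the kernel) with:  %pip install tabulate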

Note: you may need to restart the kernel to use updated packages.


In [14]:
from tabulate import tabulate

# tabulate returns a plain string, so print() is needed to render the grid.
# No headers are passed here, so the table shows data rows only.
print(tabulate(tabular_data=sales_df, tablefmt="fancy_grid"))

╒═══╤════════╤═════════════════════════════╤══════════╤════╤═══════╤═════════╤═════════════════════╤════╕
│ 0 │ 740150 │ Barton LLC                  │ B1-20000 │ 39 │ 86.69 │ 3380.91 │ 2014-01-01 07:21:51 │ B1 │
├───┼────────┼─────────────────────────────┼──────────┼────┼───────┼─────────┼─────────────────────┼────┤
│ 1 │ 714466 │ Trantow-Barrows             │ S2-77896 │ -1 │ 63.16 │  -63.16 │ 2014-01-01 10:00:47 │ S2 │
├───┼────────┼─────────────────────────────┼──────────┼────┼───────┼─────────┼─────────────────────┼────┤
│ 2 │ 218895 │ Kulas Inc                   │ B1-69924 │ 23 │ 90.7  │ 2086.1  │ 2014-01-01 13:24:58 │ B1 │
├───┼────────┼─────────────────────────────┼──────────┼────┼───────┼─────────┼─────────────────────┼────┤
│ 3 │ 307599 │ Kassulke, Ondricka and Metz │ S1-65481 │ 41 │ 21.05 │  863.05 │ 2014-01-01 15:05:22 │ S1 │
├───┼────────┼─────────────────────────────┼──────────┼────┼───────┼─────────┼─────────────────────┼────┤
│ 4 │ 412290 │ Jerde-Hilpert               │ S

In [17]:
# Use the DataFrame's column labels as the header row of the rendered grid.
headers = sales_df.columns.tolist()
print(tabulate(sales_df, headers=headers, tablefmt="fancy_grid"))

╒════╤══════════════════╤═════════════════════════════╤══════════╤════════════╤══════════════╤═════════════╤═════════════════════╤══════════════╕
│    │   account number │ name                        │ sku      │   quantity │   unit price │   ext price │ date                │ prod_group   │
╞════╪══════════════════╪═════════════════════════════╪══════════╪════════════╪══════════════╪═════════════╪═════════════════════╪══════════════╡
│  0 │           740150 │ Barton LLC                  │ B1-20000 │         39 │        86.69 │     3380.91 │ 2014-01-01 07:21:51 │ B1           │
├────┼──────────────────┼─────────────────────────────┼──────────┼────────────┼──────────────┼─────────────┼─────────────────────┼──────────────┤
│  1 │           714466 │ Trantow-Barrows             │ S2-77896 │         -1 │        63.16 │      -63.16 │ 2014-01-01 10:00:47 │ S2           │
├────┼──────────────────┼─────────────────────────────┼──────────┼────────────┼──────────────┼─────────────┼────────────────