In [32]:
import numpy as np
import pandas as pd

## 3.1 從無到有建立DataFrame

In [2]:
# Column data: one list per column, aligned by position
fname = ['Paul', 'John', 'Richard', 'George']
lname = ['McCartney', 'Lennon', 'Starkey', 'Harrison']
birth = [1942, 1940, 1940, 1943]

# Map each column label to its list of values
people = dict(first=fname, last=lname, birth=birth)

# Every dictionary key becomes one DataFrame column
beatles = pd.DataFrame(data=people)
print(beatles)

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [3]:
# Default index: shown for the frame built without an index= argument
beatles.index

RangeIndex(start=0, stop=4, step=1)

In [4]:
# Custom index: supply one label per row through index=
row_labels = list('abcd')
print(pd.DataFrame(people, index=row_labels))

     first       last  birth
a     Paul  McCartney   1942
b     John     Lennon   1940
c  Richard    Starkey   1940
d   George   Harrison   1943


In [5]:
# Build a DataFrame from a list of dictionaries (one dictionary per row)
member_rows = [
    {'first': 'Paul', 'last': 'McCartney', 'birth': 1942},
    {'first': 'John', 'last': 'Lennon', 'birth': 1940},
    {'first': 'Richard', 'last': 'Starkey', 'birth': 1940},
    {'first': 'George', 'last': 'Harrison', 'birth': 1943},
]
print(pd.DataFrame(member_rows))

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [6]:
# Same list-of-dictionaries input; columns= picks the column order
ordered_rows = [
    {'first': 'Paul', 'last': 'McCartney', 'birth': 1942},
    {'first': 'John', 'last': 'Lennon', 'birth': 1940},
    {'first': 'Richard', 'last': 'Starkey', 'birth': 1940},
    {'first': 'George', 'last': 'Harrison', 'birth': 1943},
]
column_order = ['last', 'first', 'birth']
print(pd.DataFrame(ordered_rows, columns=column_order))

        last    first  birth
0  McCartney     Paul   1942
1     Lennon     John   1940
2    Starkey  Richard   1940
3   Harrison   George   1943


## 3.2 存取CSV 檔案

In [7]:
# Write the DataFrame as CSV into an in-memory string buffer
from io import StringIO
fout = StringIO()
beatles.to_csv(fout)

# Inspect what was written to the buffer
print(fout.getvalue())

,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943



In [8]:
# Read the CSV text back into a DataFrame
fout.seek(0) # rewind to the start of the buffer before reading
print(pd.read_csv(fout))

   Unnamed: 0    first       last  birth
0           0     Paul  McCartney   1942
1           1     John     Lennon   1940
2           2  Richard    Starkey   1940
3           3   George   Harrison   1943


In [9]:
# index_col=0 uses the first CSV column as the index instead of a data column
fout.seek(0)
print(pd.read_csv(fout, index_col=0))

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [10]:
# Alternatively, leave the index out of the CSV with index=False
fout = StringIO()
beatles.to_csv(fout, index=False)
print(fout.getvalue())

first,last,birth
Paul,McCartney,1942
John,Lennon,1940
Richard,Starkey,1940
George,Harrison,1943



## 3.3 讀取大型的CSV 檔案

In [11]:
# Limit how many rows are loaded with nrows=
diamonds = pd.read_csv('../../data/diamonds.csv', nrows=1000)

print(diamonds)

     carat      cut color clarity  depth  table  price     x     y     z
0     0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1     0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2     0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3     0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4     0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
..     ...      ...   ...     ...    ...    ...    ...   ...   ...   ...
995   0.54    Ideal     D    VVS2   61.4   52.0   2897  5.30  5.34  3.26
996   0.72    Ideal     E     SI1   62.5   55.0   2897  5.69  5.74  3.57
997   0.72     Good     F     VS1   59.4   61.0   2897  5.82  5.89  3.48
998   0.74  Premium     D     VS2   61.8   58.0   2897  5.81  5.77  3.58
999   1.12  Premium     J     SI2   60.6   59.0   2898  6.68  6.61  4.03

[1000 rows x 10 columns]


In [12]:
# Check how much memory the frame uses (info() reports it on its last line)
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float64
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float64
 5   table    1000 non-null   float64
 6   price    1000 non-null   int64  
 7   x        1000 non-null   float64
 8   y        1000 non-null   float64
 9   z        1000 non-null   float64
dtypes: float64(6), int64(1), object(3)
memory usage: 78.3+ KB


In [13]:
# Request narrower numeric dtypes while reading
float32_columns = ['carat', 'depth', 'table', 'x', 'y', 'z']
narrow_dtypes = {column: np.float32 for column in float32_columns}
narrow_dtypes['price'] = np.int16

diamonds2 = pd.read_csv('../../data/diamonds.csv',
                        dtype=narrow_dtypes,
                        nrows=1000)

diamonds2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float32
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float32
 5   table    1000 non-null   float32
 6   price    1000 non-null   int16  
 7   x        1000 non-null   float32
 8   y        1000 non-null   float32
 9   z        1000 non-null   float32
dtypes: float32(6), int16(1), object(3)
memory usage: 49.0+ KB


In [14]:
# Summary statistics of `diamonds`, for comparison with `diamonds2`
print(diamonds.describe())

             carat        depth        table       price            x  \
count  1000.000000  1000.000000  1000.000000  1000.00000  1000.000000   
mean      0.689280    61.722800    57.734700  2476.54000     5.605940   
std       0.195291     1.758879     2.467946   839.57562     0.625173   
min       0.200000    53.000000    52.000000   326.00000     3.790000   
25%       0.700000    60.900000    56.000000  2777.00000     5.640000   
50%       0.710000    61.800000    57.000000  2818.00000     5.770000   
75%       0.790000    62.600000    59.000000  2856.00000     5.920000   
max       1.270000    69.500000    70.000000  2898.00000     7.120000   

                 y            z  
count  1000.000000  1000.000000  
mean      5.599180     3.457530  
std       0.611974     0.389819  
min       3.750000     2.270000  
25%       5.630000     3.450000  
50%       5.760000     3.550000  
75%       5.910000     3.640000  
max       7.050000     4.330000  


In [15]:
# Summary statistics of `diamonds2`
print(diamonds2.describe())

             carat        depth        table       price            x  \
count  1000.000000  1000.000000  1000.000000  1000.00000  1000.000000   
mean      0.689280    61.722801    57.734699  2476.54000     5.605940   
std       0.195291     1.758879     2.467946   839.57562     0.625173   
min       0.200000    53.000000    52.000000   326.00000     3.790000   
25%       0.700000    60.900002    56.000000  2777.00000     5.640000   
50%       0.710000    61.799999    57.000000  2818.00000     5.770000   
75%       0.790000    62.599998    59.000000  2856.00000     5.920000   
max       1.270000    69.500000    70.000000  2898.00000     7.120000   

                 y            z  
count  1000.000000  1000.000000  
mean      5.599180     3.457530  
std       0.611974     0.389819  
min       3.750000     2.270000  
25%       5.630000     3.450000  
50%       5.760000     3.550000  
75%       5.910000     3.640000  
max       7.050000     4.330000  


In [16]:
# Count how often each distinct value occurs in the `cut` column
diamonds2.cut.value_counts()

cut
Ideal        333
Premium      290
Very Good    226
Good          89
Fair          62
Name: count, dtype: int64

In [17]:
# Count how often each distinct value occurs in the `color` column
diamonds2.color.value_counts()

color
E    240
F    226
G    139
D    129
H    125
I     95
J     46
Name: count, dtype: int64

In [18]:
# Count how often each distinct value occurs in the `clarity` column
diamonds2.clarity.value_counts()

clarity
SI1     306
VS2     218
VS1     159
SI2     154
VVS2     62
VVS1     58
I1       29
IF       14
Name: count, dtype: int64

In [19]:
# Load the text columns as categorical dtype, keeping the narrow numeric dtypes
diamond_dtypes = {column: np.float32
                  for column in ['carat', 'depth', 'table', 'x', 'y', 'z']}
diamond_dtypes['price'] = np.int16
for category_column in ('cut', 'color', 'clarity'):
    diamond_dtypes[category_column] = 'category'

diamonds3 = pd.read_csv('../../data/diamonds.csv',
                        dtype=diamond_dtypes,
                        nrows=1000)

diamonds3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
 7   x        1000 non-null   float32 
 8   y        1000 non-null   float32 
 9   z        1000 non-null   float32 
dtypes: category(3), float32(6), int16(1)
memory usage: 29.4 KB


In [20]:
# Load only the listed columns with usecols=
cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price']

diamonds4= pd.read_csv('../../data/diamonds.csv', nrows=1000,
                        dtype={'carat':np.float32, 'depth':np.float32,
                               'table': np.float32, 'price': np.int16,
                               'cut':'category', 'color': 'category',
                               'clarity': 'category'},
                        usecols=cols)
diamonds4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
dtypes: category(3), float32(3), int16(1)
memory usage: 17.6 KB


In [25]:
# Batch processing: with chunksize= the rows are handed back in pieces
# (consumed by the for loop further down) instead of as one DataFrame
cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price']

diamonds_iter = pd.read_csv('../../data/diamonds.csv', nrows=1000,
                        dtype={'carat':np.float32, 'depth':np.float32,
                               'table': np.float32, 'price': np.int16,
                               'cut':'category', 'color': 'category',
                               'clarity': 'category'},
                        usecols=cols,
                        chunksize=200)

# 處理方式
def process(df):
    """Return a short status string reporting how many items ``df`` holds.

    The count is ``df.size``; for a DataFrame that is rows times columns.
    """
    item_count = df.size
    return 'processed {} items'.format(item_count)


# Handle the data one chunk at a time and report on each chunk
for chunk in diamonds_iter:
    print(process(chunk))

processed 1400 items
processed 1400 items
processed 1400 items
processed 1400 items
processed 1400 items


In [None]:
# Inspect the value ranges of NumPy dtypes:
# iinfo describes an integer type, finfo a floating-point type
type_limits = [np.iinfo(np.int8), np.finfo(np.float16)]
for limits in type_limits:
    print(limits)

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for float16
---------------------------------------------------------------
precision =   3   resolution = 1.00040e-03
machep =    -10   eps =        9.76562e-04
negep =     -11   epsneg =     4.88281e-04
minexp =    -14   tiny =       6.10352e-05
maxexp =     16   max =        6.55040e+04
nexp =        5   min =        -max
smallest_normal = 6.10352e-05   smallest_subnormal = 5.96046e-08
---------------------------------------------------------------



In [30]:
# Smallest and largest value in the price column, one per line
price_column = diamonds4['price']
print(price_column.min(), price_column.max(), sep='\n')

326
2898


In [31]:
# Memory used by the price column (index included)
print(diamonds.price.memory_usage())

# Same measurement with the index excluded
print(diamonds.price.memory_usage(index=False))

# For the object-dtype `cut` column, deep=True asks for a deep measurement
print(diamonds.cut.memory_usage(deep=True))

8132
8000
55465


In [34]:
# Save as Feather, then read the file back
diamonds4.to_feather('d.arr')
diamonds5 = pd.read_feather('d.arr')

In [35]:
# Save as Parquet
diamonds4.to_parquet('d.pqt')

## 3.4 使用Excel 檔案

In [38]:
# Export to an Excel file
beatles.to_excel('beat.xlsx')

In [40]:
# Read the Excel file back
beat2 = pd.read_excel('beat.xlsx')
print(beat2)

   Unnamed: 0    first       last  birth
0           0     Paul  McCartney   1942
1           1     John     Lennon   1940
2           2  Richard    Starkey   1940
3           3   George   Harrison   1943


In [41]:
# Read the Excel file, using the first column as the index
beat2 = pd.read_excel('beat.xlsx', index_col=0)
print(beat2)

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [42]:
# Check whether the column dtypes were restored correctly
beat2.dtypes

first    object
last     object
birth     int64
dtype: object

In [43]:
# Write several named worksheets into one workbook
with pd.ExcelWriter('beat.xlsx') as xl_writer:
    beatles.to_excel(xl_writer, sheet_name='All')
    beatles[beatles.birth < 1941].to_excel(xl_writer, sheet_name='1940')

# Without the with block, the writer has to be closed explicitly at the end:
# xl_writer.close()   # ExcelWriter.save() was removed in pandas 2.0

## 3.5 讀取ZIP 檔案中的資料

In [10]:
# Read a CSV straight from a ZIP file
autos = pd.read_csv('../../data/vehicles.csv.zip')
print(autos.head())

   barrels08  barrelsA08  charge120  charge240  city08  city08U  cityA08  \
0  15.695714         0.0        0.0        0.0      19      0.0        0   
1  29.964545         0.0        0.0        0.0       9      0.0        0   
2  12.207778         0.0        0.0        0.0      23      0.0        0   
3  29.964545         0.0        0.0        0.0      10      0.0        0   
4  17.347895         0.0        0.0        0.0      17      0.0        0   

   cityA08U  cityCD  cityE  ...  mfrCode  c240Dscr  charge240b  c240bDscr  \
0       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
1       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
2       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
3       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
4       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   

                      createdOn                    modifiedOn  startStop  \
0  T

In [6]:
import warnings

# Suppress every warning from here on (no category or module filter is given)
warnings.filterwarnings("ignore")

In [11]:
# Read first, then convert the column to datetime afterwards
print(autos.modifiedOn)

print(pd.to_datetime(autos.modifiedOn))

0        Tue Jan 01 00:00:00 EST 2013
1        Tue Jan 01 00:00:00 EST 2013
2        Tue Jan 01 00:00:00 EST 2013
3        Tue Jan 01 00:00:00 EST 2013
4        Tue Jan 01 00:00:00 EST 2013
                     ...             
39096    Tue Jan 01 00:00:00 EST 2013
39097    Tue Jan 01 00:00:00 EST 2013
39098    Tue Jan 01 00:00:00 EST 2013
39099    Tue Jan 01 00:00:00 EST 2013
39100    Tue Jan 01 00:00:00 EST 2013
Name: modifiedOn, Length: 39101, dtype: object
0       2013-01-01
1       2013-01-01
2       2013-01-01
3       2013-01-01
4       2013-01-01
           ...    
39096   2013-01-01
39097   2013-01-01
39098   2013-01-01
39099   2013-01-01
39100   2013-01-01
Name: modifiedOn, Length: 39101, dtype: datetime64[ns]


In [12]:
# Parse the date column while reading the file
autos = pd.read_csv('../../data/vehicles.csv.zip', parse_dates=['modifiedOn'])
print(autos.modifiedOn)

0       2013-01-01
1       2013-01-01
2       2013-01-01
3       2013-01-01
4       2013-01-01
           ...    
39096   2013-01-01
39097   2013-01-01
39098   2013-01-01
39099   2013-01-01
39100   2013-01-01
Name: modifiedOn, Length: 39101, dtype: datetime64[ns]


In [13]:
# Inspect the contents of a ZIP archive and load one member from it
import zipfile
with zipfile.ZipFile('../../data/kaggle-survey-2018.zip') as z:
    # List every file stored in the archive
    print('\n'.join(z.namelist()))
    # Read the chosen member; the inner with block closes the member handle
    # as soon as read_csv is done instead of leaving it open
    with z.open('multipleChoiceResponses.csv') as member:
        kag = pd.read_csv(member)
    # Row 0 is split off as kag_question; the remaining rows go to survey
    kag_question = kag.iloc[0]
    survey = kag.iloc[1:]

multipleChoiceResponses.csv
freeFormResponses.csv
SurveySchema.csv


In [15]:
# Inspect what was read: the first two rows of survey, transposed
print(survey.head(2).T)

                                                            1          2
Time from Start to Finish (seconds)                       710        434
Q1                                                     Female       Male
Q1_OTHER_TEXT                                              -1         -1
Q2                                                      45-49      30-34
Q3                                   United States of America  Indonesia
...                                                       ...        ...
Q50_Part_5                                                NaN        NaN
Q50_Part_6                                                NaN        NaN
Q50_Part_7                                                NaN        NaN
Q50_Part_8                                                NaN        NaN
Q50_OTHER_TEXT                                             -1         -1

[395 rows x 2 columns]


## 3.6 存取資料庫

In [None]:
# Create the database
import mysql.connector

# Connect to MySQL (no database is selected at this point)
conn = mysql.connector.connect(
    host='localhost',        
    user='your_username',    
    password='your_password'
)

cursor = conn.cursor()

# Create the 'beat' database unless it already exists
cursor.execute("CREATE DATABASE IF NOT EXISTS beat")

conn.commit()
cursor.close()
conn.close()

print("資料庫 'beat' 創建完成")

資料庫 'beat' 創建完成


In [None]:
# Connect to the 'beat' database created in the previous cell
conn = mysql.connector.connect(
    host='localhost',
    user='your_username',
    password='your_password',
    database='beat',  # select the database so CREATE TABLE has a target
)

cur = conn.cursor()
try:
    # Start from a clean slate so the cell can be re-run
    cur.execute('''DROP TABLE IF EXISTS Band''')

    cur.execute('''CREATE TABLE Band(id INTEGER PRIMARY KEY,
                   fname TEXT, lname TEXT, birthyear INT)''')

    cur.execute('''INSERT INTO Band VALUES(
                   0, 'Paul', 'McCartney', 1942)''')

    cur.execute('''INSERT INTO Band VALUES(
                   1, 'John', 'Lennon', 1940)''')

    conn.commit()
finally:
    # Close this cell's own cursor (`cur`) and the connection,
    # even when one of the statements above fails
    cur.close()
    conn.close()

print("資料已存入資料庫 'beat' 中的 'Band' 表格")

資料已存入資料庫 'beat' 中的 'Band' 表格


In [None]:
# pandas can read the result of a SQL query directly
conn = mysql.connector.connect(
    host='localhost',
    user='your_username',
    password='your_password',
    database='beat',  # the database that holds the Band table
)

sql = '''SELECT * FROM Band'''
try:
    # index_col='id' turns the id column into the DataFrame index
    print(pd.read_sql(sql, conn, index_col='id'))
finally:
    # Release the connection even if the query fails
    conn.close()

   fname      lname  birthyear
id                            
0   Paul  McCartney       1942
1   John     Lennon       1940


In [None]:
# Same query through SQLAlchemy
import sqlalchemy as sa
# The URL points at the 'beat' database that holds the Band table
engine = sa.create_engine('mysql+mysqlconnector://your_username:your_password@localhost/beat')
# The with block closes the connection, also when read_sql raises
with engine.connect() as sa_connection:
    beat = pd.read_sql('''SELECT * FROM Band''', sa_connection, index_col='id')

print(beat)

   fname      lname  birthyear
id                            
0   Paul  McCartney       1942
1   John     Lennon       1940


## 3.7 存取JSON 格式的資料

In [37]:
# Serialize the dictionary to a JSON string
import json
encoded = json.dumps(people)
print(encoded)

{"first": ["Paul", "John", "Richard", "George"], "last": ["McCartney", "Lennon", "Starkey", "Harrison"], "birth": [1942, 1940, 1940, 1943]}


In [5]:
# Decode the JSON string back into the original Python objects
print(json.loads(encoded))

{'first': ['Paul', 'John', 'Richard', 'George'], 'last': ['McCartney', 'Lennon', 'Starkey', 'Harrison'], 'birth': [1942, 1940, 1940, 1943]}


In [9]:
# Load the JSON string into a DataFrame (this rebinds `beatles`)
from io import StringIO

beatles = pd.read_json(StringIO(encoded))
print(beatles)

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [10]:
# Export as row-oriented JSON: a list with one object per row
records = beatles.to_json(orient='records')
print(records)

[{"first":"Paul","last":"McCartney","birth":1942},{"first":"John","last":"Lennon","birth":1940},{"first":"Richard","last":"Starkey","birth":1940},{"first":"George","last":"Harrison","birth":1943}]


In [15]:
# Read the records-oriented JSON back
print(pd.read_json(StringIO(records), orient='records'))

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [16]:
# orient='split': separate "columns", "index" and "data" entries
split = beatles.to_json(orient='split')
print(split)

{"columns":["first","last","birth"],"index":[0,1,2,3],"data":[["Paul","McCartney",1942],["John","Lennon",1940],["Richard","Starkey",1940],["George","Harrison",1943]]}


In [17]:
# Read the split-oriented JSON back. The payload has to be the `split` string:
# the records-oriented string does not have the columns/index/data layout
print(pd.read_json(StringIO(split), orient='split'))

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [18]:
# orient='index': one entry per row, keyed by the index label
index = beatles.to_json(orient='index')
print(index)

{"0":{"first":"Paul","last":"McCartney","birth":1942},"1":{"first":"John","last":"Lennon","birth":1940},"2":{"first":"Richard","last":"Starkey","birth":1940},"3":{"first":"George","last":"Harrison","birth":1943}}


In [20]:
# Read the index-oriented JSON back
print(pd.read_json(StringIO(index), orient='index'))

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [21]:
# orient='values': only the row values, without column names or index
values = beatles.to_json(orient='values')
print(values)

[["Paul","McCartney",1942],["John","Lennon",1940],["Richard","Starkey",1940],["George","Harrison",1943]]


In [22]:
# Read the values-oriented JSON back (column names were not stored)
print(pd.read_json(StringIO(values), orient='values'))

         0          1     2
0     Paul  McCartney  1942
1     John     Lennon  1940
2  Richard    Starkey  1940
3   George   Harrison  1943


In [23]:
# enumerate pairs each position with its name, giving a {0: 'first', ...}
# mapping that rename() uses to restore the column names
position_to_name = dict(enumerate(['first', 'last', 'birth']))
values_frame = pd.read_json(StringIO(values), orient='values')
print(values_frame.rename(columns=position_to_name))

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [31]:
# orient='table': the data together with a schema describing the fields
table = beatles.to_json(orient='table')
print(table)

{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"first","type":"string"},{"name":"last","type":"string"},{"name":"birth","type":"integer"}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":0,"first":"Paul","last":"McCartney","birth":1942},{"index":1,"first":"John","last":"Lennon","birth":1940},{"index":2,"first":"Richard","last":"Starkey","birth":1940},{"index":3,"first":"George","last":"Harrison","birth":1943}]}


In [33]:
# Read the table-oriented JSON back
print(pd.read_json(StringIO(table), orient='table'))

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [41]:
# Decode the JSON string into a dictionary
output = json.loads(encoded)

# Add an entry, then encode the dictionary as JSON again
output['version'] = '0.4.1'
json.dumps(output)

'{"first": ["Paul", "John", "Richard", "George"], "last": ["McCartney", "Lennon", "Starkey", "Harrison"], "birth": [1942, 1940, 1940, 1943], "version": "0.4.1"}'

## 3.8 讀取HTML 表格

In [42]:
# Fetch the tables from the Beatles albums discography page on Wikipedia
url = 'https://en.wikipedia.org/wiki/The_Beatles_albums_discography'
dfs = pd.read_html(url)
len(dfs)

46

In [44]:
# Look at the first DataFrame in the list
print(dfs[0])

                      The Beatles albums discography  \
0  The Beatles members Ringo Starr, Paul McCartne...   
1                                      Studio albums   
2                                                EPs   
3                                        Live albums   
4                                 Compilation albums   
5                                           Mash-ups   
6                                           Box sets   

                    The Beatles albums discography.1  
0  The Beatles members Ringo Starr, Paul McCartne...  
1                                   12 (UK), 17 (US)  
2                                                 36  
3                                                  5  
4                                                 49  
5                                                  2  
6                                                 37  


In [45]:
# Narrow the result down with match=
url = 'https://en.wikipedia.org/wiki/The_Beatles_albums_discography'
# The page marks "did not chart" with an em dash ("—"), not a hyphen ("-"),
# so the em dash is the value that has to be read as missing
dfs = pd.read_html(url, match='List of studio albums', na_values='—')
len(dfs)

2

In [46]:
# Look at the column labels of the first matched table
print(dfs[0].columns)

MultiIndex([(               'Title',            'Title'),
            (    'Album details[A]', 'Album details[A]'),
            ('Peak chart positions',        'UK [8][9]'),
            ('Peak chart positions',         'AUS [10]'),
            ('Peak chart positions',         'CAN [11]'),
            ('Peak chart positions',         'FRA [12]'),
            ('Peak chart positions',         'GER [13]'),
            ('Peak chart positions',         'NOR [14]'),
            ('Peak chart positions',      'US [15][16]'),
            (      'Certifications',   'Certifications'),
            (               'Sales',            'Sales')],
           )


In [51]:
# Look at the contents of the first matched table
print(dfs[0])

                                                Title  \
                                                Title   
0                                    Please Please Me   
1                                 With the Beatles[B]   
2                                  A Hard Day's Night   
3                                    Beatles for Sale   
4                                               Help!   
5                                         Rubber Soul   
6                                            Revolver   
7               Sgt. Pepper's Lonely Hearts Club Band   
8                     The Beatles ("The White Album")   
9                                 Yellow Submarine[C]   
10                                         Abbey Road   
11                                          Let It Be   
12  "—" denotes that the recording did not chart o...   

                                     Album details[A]  \
                                     Album details[A]   
0           Released: 22 March

In [None]:
# Use the attrs parameter to pick tables by their HTML attributes
url = 'https://github.com/mattharrison/datasets/blob/master/data/anscombes.csv'
dfs = pd.read_html(url, attrs={'data-testid': 'csv-data'})
len(dfs)