In [1]:
import numpy as np
import pandas as pd

In [14]:
# Build a Series from a plain list; no index is given, so pandas supplies 0..3.
data = pd.Series([0.25, 0.5, 0.75, 1.0])

views = (
    data,         # the whole Series
    data[1],      # square-bracket access to one element, as with a Python sequence
    data[1:3],    # square-bracket slicing
    data.values,  # the underlying values as a NumPy array
    data.index,   # the index object
)
for view in views:
    print(view)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
0.5
1    0.50
2    0.75
dtype: float64
[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)


In [6]:
# A Series index does not have to be the default 0..n-1 integers:
# explicit labels can be supplied and then used for lookups.
values = [0.25, 0.5, 0.75, 1.0]

# String labels.
data = pd.Series(values, index=list('abcd'))
print(data)
print(data['b'])

# Unsorted, non-contiguous integer labels; data[5] looks up the label 5.
data = pd.Series(values, index=[2, 5, 3, 7])
print(data)
print(data[5])


a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.5
2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64
0.5


In [7]:
# Build a Series from a Python dict: the keys become the index labels.
state_names = ['California', 'Texas', 'Florida', 'New York', 'Pennsylvania']
head_counts = [39538223, 29145505, 21538187, 20201249, 13002700]
population_dict = dict(zip(state_names, head_counts))

population = pd.Series(population_dict)
print(population)

# Dict-style lookup by label.
california_population = population['California']
print(california_population)

# Label-based slicing; the printed result includes the end label 'Florida'.
california_to_florida = population['California':'Florida']
print(california_to_florida)

California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64
39538223
California    39538223
Texas         29145505
Florida       21538187
dtype: int64


In [8]:
# From a list (or NumPy array) with no index given: a default integer index is used.
from_list = pd.Series([2, 4, 6])
print(from_list)

# From a scalar: the value is repeated for every index label.
from_scalar = pd.Series(5, index=[100, 200, 300])
print(from_scalar)

# From a dict: the dict keys are used as the index.
letters_by_key = {2: 'a', 1: 'b', 3: 'c'}
from_dict = pd.Series(letters_by_key)
print(from_dict)

# An explicit index picks which keys to keep and in what order.
from_dict_subset = pd.Series(letters_by_key, index=[1, 2])
print(from_dict_subset)

0    2
1    4
2    6
dtype: int64
100    5
200    5
300    5
dtype: int64
2    a
1    b
3    c
dtype: object
1    b
2    a
dtype: object


In [9]:
# Population per state, keyed by state name.
population_dict = {
    'California': 39538223,
    'Texas': 29145505,
    'Florida': 21538187,
    'New York': 20201249,
    'Pennsylvania': 13002700,
}
population = pd.Series(population_dict)

# Area per state, keyed by the same state names.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'Florida': 170312,
    'New York': 141297,
    'Pennsylvania': 119280,
}
area = pd.Series(area_dict)

# Combine the two Series into a DataFrame; the keyword names become the column names.
states = pd.DataFrame(dict(population=population, area=area))

for shown in (population, area, states):
    print(shown)

California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64
California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
dtype: int64
              population    area
California      39538223  423967
Texas           29145505  695662
Florida         21538187  170312
New York        20201249  141297
Pennsylvania    13002700  119280


In [13]:
# Besides .index and .values, a DataFrame also exposes a .columns attribute
# holding the column labels.
for attribute in (states.index, states.values, states.columns):
    print(attribute)

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')
[[39538223   423967]
 [29145505   695662]
 [21538187   170312]
 [20201249   141297]
 [13002700   119280]]
Index(['population', 'area'], dtype='object')


In [20]:
# Dict-style column access: selecting one column by name gives back a Series.
area_column = states['area']
print(area_column)

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64


In [21]:
# A single Series can be turned into a one-column DataFrame.
population_frame = pd.DataFrame(population, columns=['population'])
print(population_frame)

# From a list of dicts, built here with a list comprehension.
data = [dict(a=i, b=2 * i) for i in range(3)]
print(pd.DataFrame(data))

# When a record lacks a key, the corresponding cell is filled with NaN.
partial_records = [dict(a=1, b=2), dict(b=3, c=4)]
print(pd.DataFrame(partial_records))

# From a dict that maps column names to Series objects.
columns_by_name = {'population': population, 'area': area}
print(pd.DataFrame(columns_by_name))

# From a 2-D NumPy array. Column names and index labels must be supplied
# separately; whichever is omitted defaults to integers.
# A seeded generator is used so the printed values are identical on every run.
rng = np.random.default_rng(seed=0)
random_frame = pd.DataFrame(rng.random((3, 2)),
                            columns=['foo', 'bar'],
                            index=['a', 'b', 'c'])
print(random_frame)

# From a NumPy structured array (conceptually very close to a DataFrame):
# the field names 'A' and 'B' become the column names.
record_dtype = np.dtype([('A', 'i8'), ('B', 'f8')])
A = np.zeros(3, dtype=record_dtype)
print(A)

structured_frame = pd.DataFrame(A)
print(structured_frame)

              population
California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
   a  b
0  0  0
1  1  2
2  2  4
     a  b    c
0  1.0  2  NaN
1  NaN  3  4.0
              population    area
California      39538223  423967
Texas           29145505  695662
Florida         21538187  170312
New York        20201249  141297
Pennsylvania    13002700  119280
        foo       bar
a  0.313144  0.052756
b  0.705255  0.583052
c  0.598429  0.239802
[(0, 0.) (0, 0.) (0, 0.)]
   A    B
0  0  0.0
1  0  0.0
2  0  0.0


In [22]:
# The pandas Index object, constructed directly from a list of integers.
primes = [2, 3, 5, 7, 11]
ind = pd.Index(primes)

print(ind)

Index([2, 3, 5, 7, 11], dtype='int64')


In [23]:
ind = pd.Index([2, 3, 5, 7, 11])

# An Index can be read like an array: single positions and slices both work.
print(ind[1])
print(ind[::2])

# It also exposes the familiar array attributes.
print(ind.size, ind.shape, ind.ndim, ind.dtype)

# An Index is immutable: item assignment raises TypeError.
# The error is caught and printed so that this demonstration does not abort
# a "Restart & Run All" and the cells after it still execute.
try:
    ind[1] = 0
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

3
Index([2, 5, 11], dtype='int64')
5 (5,) 1 int64


TypeError: Index does not support mutable operations

In [24]:
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

# Intersection: labels present in both indexes.
in_both = indA.intersection(indB)
print(in_both)

# Union: labels present in either index.
in_either = indA.union(indB)
print(in_either)

# Symmetric difference: labels present in exactly one of the two indexes.
in_exactly_one = indA.symmetric_difference(indB)
print(in_exactly_one)

Index([3, 5, 7], dtype='int64')
Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
Index([1, 2, 9, 11], dtype='int64')
