In [1]:
import numpy as np
import pandas as pd

### 1.1 選取DataFrame 的欄位

In [None]:
# Load the movie dataset, then select one column by name with the indexing
# operator; the result is a Series
movies = pd.read_csv('../../data/movie.csv')
movies['director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [3]:
# Select the same column through attribute access; this also yields a Series
movies.director_name

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [4]:
# Label-based selection with .loc: every row, the 'director_name' column
movies.loc[ : , 'director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [5]:
# Position-based selection with .iloc: every row, the column at position 1
# (the output below shows this is director_name)
movies.iloc[ : , 1]

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [6]:
# Report the selected Series' metadata, one item per line:
# its index, dtype, element count, and name
print(
    movies.loc[:, 'director_name'].index,
    movies.loc[:, 'director_name'].dtype,
    movies.loc[:, 'director_name'].size,
    movies.loc[:, 'director_name'].name,
    sep='\n',
)

RangeIndex(start=0, stop=4916, step=1)
object
4916
director_name


In [7]:
# Check the type of the object returned by the selection
type(movies.loc[ : , 'director_name'])

pandas.core.series.Series

In [8]:
# Inspect the Python type of every element: an object column can mix types,
# and here it holds str names plus float entries (the missing values, NaN)
movies.loc[ : , 'director_name'].apply(type).unique()

array([<class 'str'>, <class 'float'>], dtype=object)

### 1.2 呼叫Series 的方法(method)

In [4]:
# Collect every attribute and method name exposed by the Series class,
# then count them
s_attr_methods = {*dir(pd.Series)}
len(s_attr_methods)

421

In [5]:
# Collect every attribute and method name exposed by the DataFrame class,
# then count them
df_attr_methods = {*dir(pd.DataFrame)}
len(df_attr_methods)

439

In [6]:
# Count the names that Series and DataFrame have in common
len(s_attr_methods.intersection(df_attr_methods))

364

In [None]:
# Reload the dataset and pull two columns out as Series objects
movies = pd.read_csv('../../data/movie.csv')
director, fb_likes = movies['director_name'], movies['actor_1_facebook_likes']

# Show the dtype of each Series, one per line
print(director.dtype, fb_likes.dtype, sep='\n')

object
float64


In [9]:
# Peek at the first rows of the director Series
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [10]:
# Draw 5 random rows; the fixed random_state makes the draw reproducible
director.sample(n=5, random_state=42)

2347      Brian Percival
4687         Lucio Fulci
691        Phillip Noyce
3911       Sam Peckinpah
2488    Rowdy Herrington
Name: director_name, dtype: object

In [12]:
# Peek at the first rows of fb_likes
fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [None]:
# Count how often each value occurs in a categorical (string) Series;
# the result is ordered from most to least frequent
director.value_counts()

director_name
Steven Spielberg      26
Woody Allen           22
Clint Eastwood        20
Martin Scorsese       20
Spike Lee             16
                      ..
Mike Bruce             1
Bradley Rust Gray      1
Collin Joseph Neal     1
Kirk Loudon            1
Kevin Jordan           1
Name: count, Length: 2397, dtype: int64

In [14]:
# value_counts() works the same way on a numeric Series
fb_likes.value_counts()

actor_1_facebook_likes
1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
          ... 
375.0        1
318.0        1
630.0        1
10.0         1
291.0        1
Name: count, Length: 877, dtype: int64

In [15]:
# Total number of elements, missing values included
director.size

4916

In [16]:
# Shape of the Series, returned as a one-element tuple
director.shape

(4916,)

In [17]:
# Distinct values as an array (duplicates removed); NaN is kept as one of
# the values, so the array is one longer than the count of distinct names
director.unique()

array(['James Cameron', 'Gore Verbinski', 'Sam Mendes', ...,
       'Scott Smith', 'Benjamin Roberds', 'Daniel Hsia'],
      shape=(2398,), dtype=object)

In [19]:
# Number of non-missing entries in each Series, one per line
print(director.count(), fb_likes.count(), sep='\n')

4814
4909


In [20]:
# Basic numeric statistics of fb_likes, one per line:
# minimum, maximum, mean, median, and standard deviation
print(
    fb_likes.min(),
    fb_likes.max(),
    fb_likes.mean(),
    fb_likes.median(),
    fb_likes.std(),
    sep='\n',
)

0.0
640000.0
6494.488490527602
982.0
15106.986883848185


In [21]:
# describe() on a numeric Series: count, mean, std, min, quartiles, and max
fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [22]:
# describe() on an object (categorical) Series reports count, unique, top,
# and freq instead of numeric statistics
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [23]:
# Quantile with a scalar argument returns a single value
fb_likes.quantile(.2)

np.float64(510.0)

In [24]:
# Quantile with a list argument returns a Series indexed by the requested
# quantiles
fb_likes.quantile([.1, .2, .3, .4, .5])

0.1    240.0
0.2    510.0
0.3    694.0
0.4    854.0
0.5    982.0
Name: actor_1_facebook_likes, dtype: float64

In [25]:
# Locate missing values: True where the entry is NaN
director.isna()

0       False
1       False
2       False
3       False
4       False
        ...  
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [26]:
# Fill missing values with 0; the non-missing count then equals the full
# length of the Series
fb_likes_filled = fb_likes.fillna(0)
fb_likes_filled.count()

np.int64(4916)

In [28]:
# Drop missing values; the result contains only the non-missing entries
fb_likes_dropped = fb_likes.dropna()
fb_likes_dropped.size

4909

In [29]:
# value_counts(normalize=True) returns relative frequencies instead of counts
director.value_counts(normalize=True)

director_name
Steven Spielberg      0.005401
Woody Allen           0.004570
Clint Eastwood        0.004155
Martin Scorsese       0.004155
Spike Lee             0.003324
                        ...   
Mike Bruce            0.000208
Bradley Rust Gray     0.000208
Collin Joseph Neal    0.000208
Kirk Loudon           0.000208
Kevin Jordan          0.000208
Name: proportion, Length: 2397, dtype: float64

In [30]:
# A more direct way to check whether the Series contains any missing value
director.hasnans

True

In [31]:
# notna() is the complement of isna(): True where the entry is not missing
director.notna()

0        True
1        True
2        True
3        True
4        True
        ...  
4911     True
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool