In [1]:
import numpy as np
import pandas as pd

### 1.1 選取DataFrame 的欄位

In [None]:
# 使用欄位名稱 從DataFrame 中取得一個Series 物件
movies = pd.read_csv('../../data/movie.csv')
movies['director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [3]:
# 使用屬性 從DataFrame 中取得一個Series 物件
movies.director_name

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [4]:
# loc 基於標籤選取
movies.loc[ : , 'director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [5]:
# iloc 基於位置選取
movies.iloc[ : , 1]

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [6]:
# 查詢Series 物件的相關資訊
print(movies.loc[ : , 'director_name'].index)
print(movies.loc[ : , 'director_name'].dtype)
print(movies.loc[ : , 'director_name'].size)
print(movies.loc[ : , 'director_name'].name)

RangeIndex(start=0, stop=4916, step=1)
object
4916
director_name


In [7]:
# 查詢傳回物件
type(movies.loc[ : , 'director_name'])

pandas.core.series.Series

In [8]:
# 查詢詳細的資料型別
movies.loc[ : , 'director_name'].apply(type).unique()

array([<class 'str'>, <class 'float'>], dtype=object)

### 1.2 呼叫Series 的方法(method)

In [4]:
# 查看 Series 方法總數
s_attr_methods = set(dir(pd.Series))
len(s_attr_methods)

421

In [5]:
# 查看 Pandas 方法總數
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

439

In [6]:
# 查看共用的方法
len(s_attr_methods & df_attr_methods)

364

In [None]:
# 切出Series 物件
movies = pd.read_csv('../../data/movie.csv')
director = movies['director_name']
fb_likes = movies['actor_1_facebook_likes']

# 檢查資料型別
print(director.dtype)
print(fb_likes.dtype)

object
float64


In [9]:
# 檢查一下資料
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [10]:
# 隨機提取5列資料
director.sample(n=5, random_state=42)

2347      Brian Percival
4687         Lucio Fulci
691        Phillip Noyce
3911       Sam Peckinpah
2488    Rowdy Herrington
Name: director_name, dtype: object

In [12]:
# 檢視fb_likes
fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [None]:
# 類別資料次數計算
director.value_counts()

director_name
Steven Spielberg      26
Woody Allen           22
Clint Eastwood        20
Martin Scorsese       20
Spike Lee             16
                      ..
Mike Bruce             1
Bradley Rust Gray      1
Collin Joseph Neal     1
Kirk Loudon            1
Kevin Jordan           1
Name: count, Length: 2397, dtype: int64

In [14]:
# 數值資料次數計算
fb_likes.value_counts()

actor_1_facebook_likes
1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
          ... 
375.0        1
318.0        1
630.0        1
10.0         1
291.0        1
Name: count, Length: 877, dtype: int64

In [15]:
# 元素個數
director.size

4916

In [16]:
# 陣列形狀
director.shape

(4916,)

In [17]:
# 去除重複值
director.unique()

array(['James Cameron', 'Gore Verbinski', 'Sam Mendes', ...,
       'Scott Smith', 'Benjamin Roberds', 'Daniel Hsia'],
      shape=(2398,), dtype=object)

In [19]:
# 非缺失值個數
print(director.count())
print(fb_likes.count())

4814
4909


In [20]:
# 數值統計
print(fb_likes.min())
print(fb_likes.max())
print(fb_likes.mean())
print(fb_likes.median())
print(fb_likes.std())

0.0
640000.0
6494.488490527602
982.0
15106.986883848185


In [21]:
# 數值統計 describe()
fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [22]:
# 類別統計 describe()
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [23]:
# 分位數 輸入純量
fb_likes.quantile(.2)

np.float64(510.0)

In [24]:
# 分位數 輸入list
fb_likes.quantile([.1, .2, .3, .4, .5])

0.1    240.0
0.2    510.0
0.3    694.0
0.4    854.0
0.5    982.0
Name: actor_1_facebook_likes, dtype: float64

In [25]:
# 尋找缺失值
director.isna()

0       False
1       False
2       False
3       False
4       False
        ...  
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [26]:
# 填補缺失值
fb_likes_filled = fb_likes.fillna(0)
fb_likes_filled.count()

np.int64(4916)

In [28]:
# 刪除缺失值
fb_likes_dropped = fb_likes.dropna()
fb_likes_dropped.size

4909

In [29]:
# value.counts() 可以使用相對頻率
director.value_counts(normalize=True)

director_name
Steven Spielberg      0.005401
Woody Allen           0.004570
Clint Eastwood        0.004155
Martin Scorsese       0.004155
Spike Lee             0.003324
                        ...   
Mike Bruce            0.000208
Bradley Rust Gray     0.000208
Collin Joseph Neal    0.000208
Kirk Loudon           0.000208
Kevin Jordan          0.000208
Name: proportion, Length: 2397, dtype: float64

In [30]:
# 更直接顯示是否有缺失值的方式
director.hasnans

True

In [31]:
# 非缺失值傳回True
director.notna()

0        True
1        True
2        True
3        True
4        True
        ...  
4911     True
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

### 1.3 Series 的相關操作

In [2]:
# 切出Series 物件
movies = pd.read_csv('../../data/movie.csv')
imdb_score = movies['imdb_score']
imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
       ... 
4911    7.7
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [3]:
# 向量式操作 自動每個都+1
imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [4]:
# 且向量式操作 不會動到原始資料
imdb_score * 2.5

0       19.75
1       17.75
2       17.00
3       21.25
4       17.75
        ...  
4911    19.25
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [5]:
# 比較算符會傳回布林陣列
imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [6]:
# 比較算符也可用在類別型別上
director = movies['director_name']
director == 'James Cameron'

0        True
1       False
2       False
3       False
4       False
        ...  
4911    False
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [9]:
# 算符可以使用方法 因為方法可以進行更多操作 例如直接補缺失值
money = pd.Series([100, 20, None])
print(money - 15,'\n')

print(money.sub(15, fill_value=0))

0    85.0
1     5.0
2     NaN
dtype: float64 

0    85.0
1     5.0
2   -15.0
dtype: float64


### 1.4 串連Series 的方法

In [10]:
# 切出Series 物件
movies = pd.read_csv('../../data/movie.csv')
director = movies['director_name']
fb_likes = movies['actor_1_facebook_likes']

In [11]:
# 最常見的方法串連
director.value_counts().head(3)

director_name
Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Name: count, dtype: int64

In [12]:
# 計算缺失值總數 (True 為 1  False 為 0)
fb_likes.isna().sum()

np.int64(7)

In [13]:
# 有缺失值的資料 都會被轉為float
fb_likes.dtype

dtype('float64')

In [14]:
# 通常會以垂直排列增加可讀性
(fb_likes.fillna(0)
         .astype(int)
         .head()
)

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

In [15]:
# 垂直排列的好處是 比較好debug
(fb_likes.fillna(0)
        #  .astype(int)
        #  .head()
)

0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64

In [16]:
# 可以一行一行確認
(fb_likes.fillna(0)
         .astype(int)
        #  .head()
)

0        1000
1       40000
2       11000
3       27000
4         131
        ...  
4911      637
4912      841
4913        0
4914      946
4915       86
Name: actor_1_facebook_likes, Length: 4916, dtype: int64

In [17]:
# 另一個方法是使用pipe() 放自定義函式
def debug_ser(ser):
    print('BEFORE')
    print(ser)
    print('AFTER')
    return ser

(fb_likes.fillna(0)
         .pipe(debug_ser)
         .astype(int)
         .head()
)

BEFORE
0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64
AFTER


0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

In [18]:
# 也可以設定一個全域變數存放中繼值
intermediate = None

def debug_ser(df):
    global intermediate
    intermediate = df
    return df

(fb_likes.fillna(0)
         .pipe(debug_ser)
         .astype(int)
         .head()
)
intermediate

0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64

### 1.5 更改欄位名稱

In [20]:
# 讀入資料集
movies = pd.read_csv('../../data/movie.csv')

# 建立欄位對應名稱字典
col_map = {'director_name':'Director Name'}

# 使用rename()
movies.rename(columns=col_map).head()

Unnamed: 0,color,Director Name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [21]:
# 也可以拿來改索引
# 先指定一個欄位為索引
idx_map = {'Avatar': 'Ratava',
           'Spectre': 'Ertceps',
           "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
           'movie_facebook_likes': 'fblikes'}

(movies
    .set_index('movie_title')
    .rename(index=idx_map, columns=col_map)
    .head(3)
)

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
POC,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


In [24]:
# 在讀檔時就設定index
movies = pd.read_csv('../../data/movie.csv', index_col='movie_title')

# 將index column 轉為list
ids = movies.index.tolist()
columns = movies.columns.tolist()

# 改list 內容
ids[0] = 'Ratava'
ids[1] = 'POC'
ids[2] = 'Ertceps'
columns[0] = 'director'
columns[-2] = 'aspect'
columns[-1] = 'fblikes'

# 將list 指派回index 與 column
movies.index = ids
movies.columns = columns
movies.head(3)


Unnamed: 0,director,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
POC,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


In [27]:
# 用自訂函式傳入rename
# 在讀檔時就設定index
movies = pd.read_csv('../../data/movie.csv', index_col='movie_title')

def to_clean(val):
    return val.strip().replace('_', '.')

movies.rename(columns=to_clean).head(3)

Unnamed: 0_level_0,color,director.name,num.critic.for.reviews,duration,director.facebook.likes,actor.3.facebook.likes,actor.2.name,actor.1.facebook.likes,gross,genres,...,num.user.for.reviews,language,country,content.rating,budget,title.year,actor.2.facebook.likes,imdb.score,aspect.ratio,movie.facebook.likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


In [None]:
# 也可以直接用串列生成式改 但會直接修改原始資料
cols = [col.strip().replace('_','.')
        for col in movies.columns]
movies.columns = cols
movies.head(3)

Unnamed: 0_level_0,color,director.name,num.critic.for.reviews,duration,director.facebook.likes,actor.3.facebook.likes,actor.2.name,actor.1.facebook.likes,gross,genres,...,num.user.for.reviews,language,country,content.rating,budget,title.year,actor.2.facebook.likes,imdb.score,aspect.ratio,movie.facebook.likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


### 1.6 新增及刪除欄位

In [15]:
# 新增純量
movies = pd.read_csv('../../data/movie.csv')
movies['has_seen'] = 0
movies.head(3)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0


In [16]:
# 使用assign
col_map = {'aspect_ratio': 'aspect',
           'movie_facebook_likes': 'fblikes'}

(movies
    .rename(columns=col_map)
    .assign(has_seen=0)
)
movies.head(3)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0


In [17]:
# 用 + 加總所有fb 按讚數的相關欄位
total = (movies['actor_1_facebook_likes'] + 
         movies['actor_2_facebook_likes'] +
         movies['actor_3_facebook_likes'] + 
         movies['director_facebook_likes'])

total.head(5)

0     2791.0
1    46563.0
2    11554.0
3    95000.0
4        NaN
dtype: float64

In [18]:
# 使用fancy index 再使用sum()加總
cols = ['actor_1_facebook_likes', 'actor_2_facebook_likes',
        'actor_3_facebook_likes', 'director_facebook_likes']
sum_col = movies[cols].sum(axis='columns')

movies.assign(total_likes=sum_col).head(5)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen,total_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0,2791.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0,46563.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0,11554.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,95000.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,0,274.0


In [19]:
# 也可以寫函式
def sum_likes(df):
    return df[[c for c in df.columns
               if 'like' in c]].sum(axis=1)

movies.assign(total_likes=sum_likes).head(5)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen,total_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0,40625.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0,94913.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0,108254.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,365759.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,0,417.0


In [20]:
# 用 + 運算會有缺失值
(movies
    .assign(total_likes=total)
    ['total_likes']
    .isna()
    .sum()
)

np.int64(122)

In [21]:
# 計算剛剛上述的按讚數佔全體卡司的比例
# 先驗證全體卡司按讚數是否都大於上述計算值
def cast_like_gt_actor_director(df):
    return df['cast_total_facebook_likes'] >= df['total_likes']

df2 = (movies.assign(total_likes=total,
                     is_cast_likes_more = cast_like_gt_actor_director))

# all() 可以確認是否全為True 
df2['is_cast_likes_more'].all()

np.False_

In [None]:
# 卡司的按讚數 可能不包含導演的按讚數
# 重新計算total likes 只含主要演員
df2 = df2.drop(columns='total_likes')

actor_sum = (movies[[c for c in movies.columns if 'actor_' in c and '_likes' in c]]
             .sum(axis='columns'))

actor_sum.head(5)

0     2791.0
1    46000.0
2    11554.0
3    73000.0
4      143.0
dtype: float64

In [None]:
# 再次檢查
movies['cast_total_facebook_likes'].ge(actor_sum).all()

np.True_

In [None]:
# 計算百分比
pct_like = (actor_sum
            .div(movies['cast_total_facebook_likes']))

# 檢查是否都在0 和 1 之間
pct_like.describe()

count    4883.000000
mean        0.833279
std         0.140566
min         0.300767
25%         0.735284
50%         0.869289
75%         0.954774
max         1.000000
dtype: float64

In [25]:
# 建立一個Series 使用剛剛算出的值
pd.Series(pct_like.values,
          index=movies['movie_title'].values).head()

Avatar                                        0.577369
Pirates of the Caribbean: At World's End      0.951396
Spectre                                       0.987521
The Dark Knight Rises                         0.683783
Star Wars: Episode VII - The Force Awakens    1.000000
dtype: float64

In [29]:
# 找出要插入的位置
profit_index = movies.columns.get_loc('gross') + 1

# 插入新欄位
movies.insert(loc=profit_index,
              column='profit',
              value=movies['gross'] - movies['budget'])

movies

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,profit,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,523505847.0,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,9404152.0,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,-44925825.0,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,198130642.0,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,,...,,,,,,12.0,7.1,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,,...,English,Canada,,,2013.0,470.0,7.7,,84,0
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,,...,English,USA,TV-14,,,593.0,7.5,16.00,32000,0
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,,...,English,USA,,1400.0,2013.0,0.0,6.3,,16,0
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,,...,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660,0


In [None]:
# 刪除欄位的另一種方式
del movies['profit']
movies

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,English,Canada,,,2013.0,470.0,7.7,,84,0
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,English,USA,TV-14,,,593.0,7.5,16.00,32000,0
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,English,USA,,1400.0,2013.0,0.0,6.3,,16,0
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660,0
