<a href="https://colab.research.google.com/github/hank199599/data_science_from_scratch_reading_log/blob/main/Chapter23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 推薦系統
根據資料做出某種**建議**或是**推薦**

In [3]:
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

### 目標
根據使用者目前的興趣，向他推薦其他的新興趣

# 土法煉鋼的作法
在有限資料下可以輕易推薦興趣  
隨著資料量上升，這種做法會變得越來越困難

# 推薦最流行的

In [4]:
from collections import Counter

popular_interests = Counter(interest for user_interests in users_interests for interest in user_interests)

print(popular_interests)

Counter({'Python': 4, 'R': 4, 'Big Data': 3, 'HBase': 3, 'Java': 3, 'statistics': 3, 'regression': 3, 'probability': 3, 'Hadoop': 2, 'Cassandra': 2, 'MongoDB': 2, 'Postgres': 2, 'scikit-learn': 2, 'statsmodels': 2, 'pandas': 2, 'machine learning': 2, 'libsvm': 2, 'C++': 2, 'neural networks': 2, 'deep learning': 2, 'artificial intelligence': 2, 'Spark': 1, 'Storm': 1, 'NoSQL': 1, 'scipy': 1, 'numpy': 1, 'decision trees': 1, 'Haskell': 1, 'programming languages': 1, 'mathematics': 1, 'theory': 1, 'Mahout': 1, 'MapReduce': 1, 'databases': 1, 'MySQL': 1, 'support vector machines': 1})


透過這個計算，可以單純向使用者推薦最受大家歡迎，而他目前還沒有興趣

In [5]:
from typing import List,Tuple

def most_popular_new_interests(user_interests:List[str],max_results:int=5)->List[Tuple[str,int]]:
  suggestions = [(interest,frequency) for interest,frequency in popular_interests.most_common() if interest not in users_interests]

  return suggestions[:max_results]

1號使用者

In [6]:
most_popular_new_interests(["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],5)

[('Python', 4), ('R', 4), ('Big Data', 3), ('HBase', 3), ('Java', 3)]

3號使用者

In [7]:
most_popular_new_interests(["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],5)

[('Python', 4), ('R', 4), ('Big Data', 3), ('HBase', 3), ('Java', 3)]

# 以「人」為基礎進行協同篩選

採用「餘弦相似度」(cosine similarity)來測量兩個單詞向量的相似程度

## 解析式集合(set compprehension)
將每個不同的興趣集中起來，然後再把所有這些興趣放進一個列表，最後進行排序。


In [8]:
unique_interests = sorted({interest for user_interests in users_interests for interest in user_interests})

針對每個使用者，製作出一個由0和1組成的「興趣向量」  
根據 unique_interests 列表進行迭代操作：遇到使用者有興趣的項目設為1，其餘設為0

In [9]:
def make_user_interest_vector(user_interests:List[str]) ->List[str]:
  """
   根據使用者的興趣列表，製作出使用者的興趣向量
   如果unique_interests[i]出現在列表中，第i個元素就設為1，否則就設為0
  """
  return [1 if interest in user_interests else 0 for interest in unique_interests]

製作出一個使用者興趣向量列表：

In [10]:
user_interest_vectors = [make_user_interest_vector(user_interests) for user_interests in users_interests]
# print(user_interest_vectors)

計算出兩兩成對使用者相似度

In [11]:
import math
def dot(v, w):
    """v_1 * w_1 + ... + v_n * w_n"""
    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def cosine_similarity(v, w):
    return dot(v, w) / math.sqrt(dot(v, v) * dot(w, w))

In [12]:
user_similarities = [[cosine_similarity(interest_vector_i,interest_vector_j) for interest_vector_j in user_interest_vectors] for interest_vector_i in user_interest_vectors]
print(user_similarities)

[[1.0, 0.3380617018914066, 0.0, 0.0, 0.0, 0.1543033499620919, 0.0, 0.0, 0.1889822365046136, 0.5669467095138409, 0.0, 0.0, 0.0, 0.1690308509457033, 0.0], [0.3380617018914066, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6, 0.0], [0.0, 0.0, 1.0, 0.18257418583505536, 0.0, 0.16666666666666666, 0.0, 0.20412414523193154, 0.0, 0.0, 0.23570226039551587, 0.0, 0.47140452079103173, 0.0, 0.0], [0.0, 0.0, 0.18257418583505536, 1.0, 0.22360679774997896, 0.3651483716701107, 0.4472135954999579, 0.0, 0.0, 0.0, 0.5163977794943222, 0.22360679774997896, 0.5163977794943222, 0.0, 0.2581988897471611], [0.0, 0.0, 0.0, 0.22360679774997896, 1.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5773502691896258], [0.1543033499620919, 0.0, 0.16666666666666666, 0.3651483716701107, 0.0, 1.0, 0.0, 0.0, 0.0, 0.20412414523193154, 0.23570226039551587, 0.20412414523193154, 0.47140452079103173, 0.0, 0.0], [0.0, 0.0, 0.0, 0.4472135954999579, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.2886751345948129, 0.25, 0.0, 0.0, 

In [13]:
# 0 和 9 這兩個使用者共同的興趣為 Hadoop、Java、Big Data
assert 0.56 < user_similarities[0][9] < 0.58 ,"有好幾個共同的興趣"
# 0 和 8 這兩個使用者只有一個共同的興趣為：Big Data
assert 0.18 < user_similarities[0][8] < 0.20 ,"只有一個共同的興趣"

針對給定的使用者，找出另一個與他最相似的使用者

In [14]:
def most_similar_users_to(user_id:int) ->List[Tuple[int,float]]:
  pairs = [(other_user_id,similarity) for other_user_id,similarity in enumerate(user_similarities[user_id]) if user_id != other_user_id and similarity > 0]

  return sorted(pairs,key= lambda pair:pair[-1],reverse=True)

In [15]:
most_similar_users_to(0)

[(9, 0.5669467095138409),
 (1, 0.3380617018914066),
 (8, 0.1889822365046136),
 (13, 0.1690308509457033),
 (5, 0.1543033499620919)]

運用這些資料：  
針對每個興趣，把其他所有使用者相似度的值全部加總起來

In [16]:
from collections import defaultdict

def user_based_suggestions(user_id:int,include_current_interests:bool=False):
  # 把相似度加總起來
  suggestions:Dict[str,float]= defaultdict(float)
  for other_user_id,similarity in most_similar_users_to(user_id):
    for interest in users_interests[other_user_id]:
      suggestions[interest] += similarity
  
  # 把他們轉換成一個排序過的序列
  suggestions = sorted(suggestions.items(),
             key=lambda pair:pair[-1], #權重
             reverse=True)
  
  # 接著排除掉已經有的興趣
  if include_current_interests:
    return suggestions
  else:
    return [(suggestion,weight) for suggestion,weight in suggestions if suggestion not in users_interests[user_id]]

調用針對使用者0的推薦興趣

In [17]:
user_based_suggestions(0)

[('MapReduce', 0.5669467095138409),
 ('MongoDB', 0.50709255283711),
 ('Postgres', 0.50709255283711),
 ('NoSQL', 0.3380617018914066),
 ('neural networks', 0.1889822365046136),
 ('deep learning', 0.1889822365046136),
 ('artificial intelligence', 0.1889822365046136),
 ('databases', 0.1690308509457033),
 ('MySQL', 0.1690308509457033),
 ('Python', 0.1543033499620919),
 ('R', 0.1543033499620919),
 ('C++', 0.1543033499620919),
 ('Haskell', 0.1543033499620919),
 ('programming languages', 0.1543033499620919)]

若項目數量過大，會發生維度的詛咒

# 以物為基礎進行協同篩選
直接計算興趣之間的相似度  
直接根據每個使用者目前的興趣，找出其他類似的興趣

In [18]:
interest_user_matrix = [[user_interest_vector[j] for user_interest_vector in user_interest_vectors] for j,_ in enumerate(unique_interests)]
# print(interest_user_matrix)

運用餘弦相似度的概念，找出彼此間的相似度

In [19]:
interest_similarities = [[cosine_similarity(user_vector_i,user_vector_j) for user_vector_j in interest_user_matrix] for user_vector_i in interest_user_matrix]

In [20]:
def most_similar_interests_to(interest_id:int):
  similarities = interest_similarities[interest_id]
  pairs = [(unique_interests[other_interest_id],similarity) for other_interest_id,similarity in enumerate(similarities) if interest_id != other_interest_id and similarity > 0]

  return sorted(pairs,
          key = lambda pair:pair[-1],
          reverse=True
          )

找出與Big Data (興趣0) 最相似的興趣

In [21]:
most_similar_interests_to(0)

[('Hadoop', 0.8164965809277261),
 ('Java', 0.6666666666666666),
 ('MapReduce', 0.5773502691896258),
 ('Spark', 0.5773502691896258),
 ('Storm', 0.5773502691896258),
 ('Cassandra', 0.4082482904638631),
 ('artificial intelligence', 0.4082482904638631),
 ('deep learning', 0.4082482904638631),
 ('neural networks', 0.4082482904638631),
 ('HBase', 0.3333333333333333)]

針對使用者的每個興趣，將類似興趣的相似度累加起來  
建立相對應的推薦資料

In [22]:
def item_based_suggestions(user_id:int,include_current_interests:bool=False):
  # 把類似興趣的相似度累加起來
  suggestions = defaultdict(float)

  user_interest_vector = user_interest_vectors[user_id]
  for interest_id,is_interested in enumerate(user_interest_vector):
    if is_interested == 1:
      similar_interests = most_similar_interests_to(interest_id)
      for interest,similarity in similar_interests:
        suggestions[interest] += similarity
  
  # 根據權重值進行排序
  suggestions = sorted(suggestions.items(),
              key = lambda pair:pair[-1],
              reverse = True)
  
  if include_current_interests:
    return suggestions
  else:
    return [(suggestion,weight) for suggestion,weight in suggestions if suggestion not in users_interests[user_id]]

以0這位使用者的紀錄，推薦給他的結果如下

In [23]:
item_based_suggestions(0)

[('MapReduce', 1.861807319565799),
 ('MongoDB', 1.3164965809277263),
 ('Postgres', 1.3164965809277263),
 ('NoSQL', 1.2844570503761732),
 ('MySQL', 0.5773502691896258),
 ('databases', 0.5773502691896258),
 ('Haskell', 0.5773502691896258),
 ('programming languages', 0.5773502691896258),
 ('artificial intelligence', 0.4082482904638631),
 ('deep learning', 0.4082482904638631),
 ('neural networks', 0.4082482904638631),
 ('C++', 0.4082482904638631),
 ('Python', 0.2886751345948129),
 ('R', 0.2886751345948129)]

# 矩陣分解
運用模型來**預設**使用者針對給定項目所給出的評分

取得資料集  
* [原始資料位址](http://files.grouplens.org/datasets/movielens/ml-100k.zip)
* 將所需資料備份到Github，以pandas模組讀取

In [24]:
from dateutil.parser import parse
import io
import requests
url1="https://raw.githubusercontent.com/hank199599/data_science_from_scratch_reading_log/main/attached_data/CH%2023/u.item"
url2="https://raw.githubusercontent.com/hank199599/data_science_from_scratch_reading_log/main/attached_data/CH%2023/u.data"
MOVIES=requests.get(url1).content
RATINGS=requests.get(url2).content

引用NamedTuple來協助整理資料

In [25]:
from typing import NamedTuple

class Rating(NamedTuple):
  user_id:str
  movie_id:str
  rating:float

讀取電影檔案資料

In [26]:
import pandas as pd

data = pd.read_csv(io.StringIO(MOVIES.decode('iso-8859-1')),delimiter="|",header=None)
movies = {movie_id:title for movie_id,title,*_ in zip(data[0],data[1])}

讀取評分檔案資料  
[user_id,movie_id,rating],timestap  
‐‐‐‐‐取這三個欄位‐‐‐‐‐ 


In [27]:
import pandas as pd

data=pd.read_csv(io.StringIO(RATINGS.decode('iso-8859-1')),delimiter="\t",header=None)
ratings = {Rating(user_id,movie_id,float(raring)) for user_id,movie_id,raring in zip(data[0],data[1],data[2])}

驗證讀取的資料是否有誤

In [28]:
assert len(movies) == 1682
assert len(list({rating.user_id for rating in ratings})) == 943

針對這些資料，進行分析與探索

In [29]:
import re

# 按照多個電影編號進行累計，需要用到的一個資料結構
star_wars_ratings = {movie_id:[] for movie_id,title in movies.items() if re.search("Star Wars|Empire Strikes|Jedi",title)}

# 針對星際大戰系列電影，以迭代的方式累計計分
for rating in ratings:
  if rating.movie_id in star_wars_ratings:
    star_wars_ratings[rating.movie_id].append(rating.rating)
    
# 計算每部電影的平均分數
avg_ratings = [(sum(title_ratings)/len(title_ratings),movie_id) for movie_id,title_ratings in star_wars_ratings.items()]

# 依照排序列印相應的分數
for avg_rating,movie_id in sorted(avg_ratings,reverse=True):
  print(f"{avg_rating:.2f} {movies[movie_id]}")
                    

4.36 Star Wars (1977)
4.20 Empire Strikes Back, The (1980)
4.01 Return of the Jedi (1983)


試著提出一個模型為基準，再找出更好的結果  
把評分資料切分成訓練組資料、驗證組資料、測試組資料

In [30]:
import random
random.seed(0)
random.shuffle(list(ratings))

split1 = int(len(ratings)*0.7)
split2 = int(len(ratings)*0.85)

train = list(ratings)[:split1]       # 占所有資料的 70%        
validation = list(ratings)[split1:split2] # 占所有資料的 15%
test = list(ratings)[split2:]       # 占所有資料的 15%

### 以平均分數最為預測值  
以**均方差**做為衡量方式

In [31]:
avg_rating = sum(rating.rating for rating in train)/ len(train)
baseline_error = sum((rating.rating-avg_rating)**2 for rating in test) / len(test)

assert 1.26 < baseline_error < 1.27

### 內崁 (embedding)
讓使用者內崁與電影內崁進行矩陣相乘 → 預測的評分結果

In [32]:
import random
import math

Tensor = list

SQRT_TWO_PI = math.sqrt(2* math.pi)

def normal_pdf(x:float,mu:float=0,sigma:float=1) ->float:
  return (math.exp(-(x-mu)**2/2/sigma**2)/(SQRT_TWO_PI*sigma))

def normal_cdf(x:float,mu:float=0,sigma:float=1)->float:
  return (1+math.erf((x-mu)/math.sqrt(2)/sigma))/2

def inverse_normal_cdf(p:float,
            mu:float = 0,
            sigma:float = 1,
            tolerance:float=0.00001) -> float:
  # 如果不是標準常態分佈，就先轉換成標準常態分佈
  if mu != 0 or sigma != 1:
    return mu + sigma*inverse_normal_cdf(p,tolerance = tolerance)
  
  low_z = -10.0 # normal_cdf(-10)是(趨近於) 0
  hi_z =  10.0 # normal_cdf(10)是(趨近於) 1
  while hi_z - low_z > tolerance:  
    mid_z = (low_z + hi_z) / 2   # 計算出中間值
    mid_p = normal_cdf(mid_z)     # 以及累積分佈函數在該處所應對的值
    if mid_p < p :
      low_z = mid_z        #中間的值太低，就往上繼續搜尋
    else:
      hi_z = mid_z        #中間的值太高，就往下繼續搜尋
  
  return mid_z

In [33]:
# 自[0,1]的隨機分布中選擇初始值
def random_uniform(*dims:int) ->Tensor:
  if len(dims) == 1:
    return [random.random() for _ in range(dims[0])]
  else:
    return [random_uniform(*dims[1:]) for _ in range(dims[0])]

# 自標準常態分布中選擇初始值
def random_normal(*dims:int,mean:float=0.0,variance:float=1.0) ->Tensor:
  if len(dims) == 1:
    return [mean + variance * inverse_normal_cdf(random.random()) for _ in range(dims[0])]
  else:
    return [random_normal(*dims[1:],mean=mean,variance=variance) for _ in range(dims[0])]
  

In [34]:
def random_tensor(*dims:int,init:str = 'normal')->Tensor:
  if init == 'normal':
    return random_normal(*dims) # 自[0,1]的隨機分布中選擇初始值
  elif init == 'uniform':
    return random_uniform(*dims) # 自標準常態分布中選擇初始值
  elif init == 'xavier': 
    variance = len(dims)/sum(dims)
    return random_normal(*dims,variance=variance) #使用「Xavier initialization」，其中每個權重為平均是0，變異量為2/(num_inputs+num_outputs)的標準常態分布中選擇初始值
  else:
    raise ValueError(f"unknown init: {init}")

In [35]:
EMBEDDING_DIM = 2

# 找出所有不重複的編號
user_ids = {rating.user_id for rating in ratings}
movie_ids = {rating.movie_id for rating in ratings}

#針對每個編號，建立相應的隨機向量
user_vectors = {user_id:random_tensor(EMBEDDING_DIM) for user_id in user_ids}
movie_vectors = {movie_id:random_tensor(EMBEDDING_DIM) for movie_id in movie_ids}

編寫訓練迴圈：

In [36]:
from typing import List
import tqdm

def loop(dataset:List[Rating],learning_rate:float=None) -> None:
  with tqdm.tqdm(dataset) as t:
    loss = 0.0
    for i,rating in enumerate(t):
      movie_vector = movie_vectors[rating.movie_id]
      user_vector = user_vectors[rating.user_id]
      predicted = dot(user_vector,movie_vector)
      error = predicted - rating.rating
      loss += error**2
    
    if learning_rate is not None:
      # 預測值 = _m_0*u_0 + ... + m_k*u_k
      # 其中每個 u_j 的係數為 m_j
      # 每個 m_j 的係數則為 u_j
      user_gradient = [error*m_j for m_j in movie_vector]
      movie_gradient = [error*u_j for u_j in movie_vector]

      # 進行梯度遞減步驟
      for j in range(EMBEDDING_DIM):
        user_vector[j] -= learning_rate*user_gradient[j]
        movie_vector[j] -= learning_rate*movie_gradient[j]
    
    t.set_description(f"ang loss: {loss/(i+1)}")


開始訓練模型(找出最佳內崁)

In [37]:
learning_rate = 0.05
for epoch in range(20):
  learning_rate *= 0.9
  print(epoch,learning_rate)
  loop(train,learning_rate=learning_rate)
  loop(validation)

loop(test)

100%|██████████| 70000/70000 [00:00<00:00, 394121.07it/s]
  0%|          | 0/15000 [00:00<?, ?it/s]

0 0.045000000000000005


100%|██████████| 15000/15000 [00:00<00:00, 368993.86it/s]
100%|██████████| 70000/70000 [00:00<00:00, 458903.29it/s]
  0%|          | 0/15000 [00:00<?, ?it/s]

1 0.04050000000000001


100%|██████████| 15000/15000 [00:00<00:00, 344043.57it/s]
100%|██████████| 70000/70000 [00:00<00:00, 382025.72it/s]
  0%|          | 0/15000 [00:00<?, ?it/s]

2 0.03645000000000001


100%|██████████| 15000/15000 [00:00<00:00, 369112.93it/s]
100%|██████████| 70000/70000 [00:00<00:00, 473235.22it/s]
100%|██████████| 15000/15000 [00:00<00:00, 375374.00it/s]


3 0.03280500000000001
4 0.02952450000000001


100%|██████████| 70000/70000 [00:00<00:00, 463480.22it/s]
100%|██████████| 15000/15000 [00:00<00:00, 357885.95it/s]
  0%|          | 0/70000 [00:00<?, ?it/s]

5 0.02657205000000001


100%|██████████| 70000/70000 [00:00<00:00, 415800.46it/s]
100%|██████████| 15000/15000 [00:00<00:00, 413329.65it/s]
100%|██████████| 70000/70000 [00:00<00:00, 467663.18it/s]
100%|██████████| 15000/15000 [00:00<00:00, 419248.73it/s]
  0%|          | 0/70000 [00:00<?, ?it/s]

6 0.02391484500000001
7 0.021523360500000012


100%|██████████| 70000/70000 [00:00<00:00, 442199.24it/s]
100%|██████████| 15000/15000 [00:00<00:00, 379542.96it/s]
100%|██████████| 70000/70000 [00:00<00:00, 470416.87it/s]
100%|██████████| 15000/15000 [00:00<00:00, 421487.26it/s]
  0%|          | 0/70000 [00:00<?, ?it/s]

8 0.01937102445000001
9 0.01743392200500001


100%|██████████| 70000/70000 [00:00<00:00, 454407.44it/s]
100%|██████████| 15000/15000 [00:00<00:00, 433540.70it/s]
100%|██████████| 70000/70000 [00:00<00:00, 437619.10it/s]
  0%|          | 0/15000 [00:00<?, ?it/s]

10 0.015690529804500006


100%|██████████| 15000/15000 [00:00<00:00, 346127.51it/s]
100%|██████████| 70000/70000 [00:00<00:00, 479375.64it/s]
100%|██████████| 15000/15000 [00:00<00:00, 424579.13it/s]
  0%|          | 0/70000 [00:00<?, ?it/s]

11 0.014121476824050006
12 0.012709329141645007


100%|██████████| 70000/70000 [00:00<00:00, 451009.44it/s]
100%|██████████| 15000/15000 [00:00<00:00, 346396.22it/s]
100%|██████████| 70000/70000 [00:00<00:00, 446871.50it/s]
100%|██████████| 15000/15000 [00:00<00:00, 417111.26it/s]
  0%|          | 0/70000 [00:00<?, ?it/s]

13 0.011438396227480507
14 0.010294556604732457


100%|██████████| 70000/70000 [00:00<00:00, 462673.15it/s]
100%|██████████| 15000/15000 [00:00<00:00, 379767.48it/s]
100%|██████████| 70000/70000 [00:00<00:00, 446821.86it/s]
100%|██████████| 15000/15000 [00:00<00:00, 445296.17it/s]


15 0.00926510094425921
16 0.00833859084983329


100%|██████████| 70000/70000 [00:00<00:00, 444814.03it/s]
100%|██████████| 15000/15000 [00:00<00:00, 421730.24it/s]
100%|██████████| 70000/70000 [00:00<00:00, 435947.19it/s]
  0%|          | 0/15000 [00:00<?, ?it/s]

17 0.007504731764849962


100%|██████████| 15000/15000 [00:00<00:00, 423151.31it/s]
100%|██████████| 70000/70000 [00:00<00:00, 478361.85it/s]
100%|██████████| 15000/15000 [00:00<00:00, 417147.21it/s]
  0%|          | 0/70000 [00:00<?, ?it/s]

18 0.006754258588364966
19 0.00607883272952847


100%|██████████| 70000/70000 [00:00<00:00, 439777.00it/s]
100%|██████████| 15000/15000 [00:00<00:00, 403254.52it/s]
100%|██████████| 15000/15000 [00:00<00:00, 321792.21it/s]


### 運用主成分分析
檢查一下學習所得的向量

In [40]:
import math
from typing import List
Vector = List[float]

def add( v:Vector, w:Vector) -> Vector:
  assert len(v) == len(w) ,"兩個向量必須有相同的維度"

  return [ v_i+w_i for v_i,w_i in zip(v,w)]

def subtrate( v:Vector, w:Vector) -> Vector:
  assert len(v) == len(w) #兩個向量必須有相同的維度

  return [ v_i-w_i for v_i,w_i in zip(v,w)]

def magnitude(v:Vector)->float:
  return math.sqrt(sum_of_squares(v)) #math.sqrt 是計算平方根的一個函式

def sum_of_squares(v:Vector) -> float:
  return dot(v,v)

def scalar_multiply(c:float,v:Vector) -> Vector:
  return [c*v_i for v_i in v]

def project(v:Vector,w:Vector)->Vector:
  """送回v分量在w這個方向上的分量"""
  projection_length = dot(v,w)
  return scalar_multiply(projection_length,w)

def gradient_step(v:Vector,gradient:Vector,step_size:float) -> Vector:
  """從v沿著gradient的方向移動step_size的距離"""
  assert len(v) == len(gradient)
  step = scalar_multiply(step_size,gradient)
  return add(v,step)

def direction(w:Vector) ->Vector:
  mag = magnitude(w)
  return [w_i/mag for w_i in w]

def directional_variance_gradient(data:List[Vector],w:Vector) ->Vector:
  """
  變異量相當於w這個方向的梯度
  """
  w_dir = direction(w)
  return [sum(2*dot(v,w_dir)*v[i] for v in data) for i in range(len(w))]

def directional_variance(data:List[Vector],w:Vector) ->float:
  """
  送回x在w這個方向上的變異量
  """
  w_dir = direction(w)
  return sum(dot(v,w_dir)**2 for v in data)

import tqdm
def first_principal_component(data:List[Vector],n:int=100,step_size:float=0.1)->Vector:
  #先從一個隨機的起始點開始
  guess=[1.0 for _ in data[0]]

  with tqdm.trange(n) as t:
    for _ in t:
      dv = directional_variance(data,guess)
      gradient = directional_variance_gradient(data,guess)
      guess = gradient_step(guess,gradient,step_size)
      t.set_description(f"dv:{dv:.3f}")
    return direction(guess)

def pca(data:List[Vector],num_components:int)->List[Vector]:
  components:List[Vector]=[]
  for _ in range(num_components):
    component = first_principal_component(data)
    components.append(component)
    data = remove_projection(data,component)
  return components

def transform_vector(v:Vector,component:List[Vector]) ->Vector:
  return[dot(v,w) for w in components]

def transform(data:List[Vector],components:List[Vector])->List[Vector]:
  return [transform_vector(v,components) for v in data]

def remove_projection_from_vector(v:Vector,w:Vector) ->Vector:
  """把v減去v投影在w上的分量"""
  return subtrate(v,project(v,w))

def remove_projection(data:List[Vector],w:Vector) ->List[Vector]:
  return [remove_projection_from_vector(v,w) for v in data]

In [41]:
original_vectors = [vector for vector in movie_vectors.values()]
components = pca(original_vectors,2)

dv:1674.898: 100%|██████████| 100/100 [00:00<00:00, 107.48it/s]
dv:1614.672: 100%|██████████| 100/100 [00:00<00:00, 103.15it/s]


將向量轉換一下，使其呈現主成分的效果

In [46]:
ratings_by_movie = defaultdict(list)
for rating in ratings:
  ratings_by_movie[rating.movie_id].append(rating.rating)

vectors = [
      (movie_id,
       sum(ratings_by_movie[movie_id])/len(ratings_by_movie[movie_id]),
       movies[movie_id],
       vector)
      for movie_id,vector in zip(movie_vectors.keys(),transform(original_vectors,components))
      ]

# 根據第一主成分，把前25與倒數25個結果列印出來  
print(sorted(vectors,key=lambda v:v[-1][0])[:25])
print(sorted(vectors,key=lambda v:v[-1][0])[-25:])

[(395, 2.5892857142857144, 'Robin Hood: Men in Tights (1993)', [-3.950795442794101, -0.5972387433622903]), (800, 2.8846153846153846, 'In the Mouth of Madness (1995)', [-3.482077904648418, 1.2909615036381004]), (266, 2.7142857142857144, 'Kull the Conqueror (1997)', [-3.120470415450472, 0.07075116632325518]), (649, 3.4, 'Once Upon a Time in America (1984)', [-3.0205481874593563, -0.5167431667835751]), (1177, 1.8571428571428572, 'Dunston Checks In (1996)', [-2.826305223855793, 0.093467339115056]), (256, 3.5625, 'When the Cats Away (Chacun cherche son chat) (1996)', [-2.7944556913995084, -3.1269782320405706]), (336, 2.627906976744186, 'Playing God (1997)', [-2.7878495544438175, 0.9343730224366817]), (282, 3.685344827586207, 'Time to Kill, A (1996)', [-2.7668219537049374, -0.019320809815742912]), (359, 3.5, 'Assignment, The (1997)', [-2.684467130848594, -0.5143213056750233]), (261, 2.558139534883721, 'Air Bud (1997)', [-2.6581573658077904, 0.9158483447399699]), (367, 3.458823529411765, 'Clu