<a href="https://colab.research.google.com/github/hank199599/data_science_from_scratch_reading_log/blob/main/Chapter23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 推薦系統
根據資料做出某種**建議**或是**推薦**

In [51]:
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

### 目標
根據使用者目前的興趣，向他推薦其他的新興趣

# 土法煉鋼的作法
在有限資料下可以輕易推薦興趣  
隨著資料量上升，這種做法會變得越來越困難

# 推薦最流行的

In [52]:
from collections import Counter

popular_interests = Counter(interest for user_interests in users_interests for interest in user_interests)

print(popular_interests)

Counter({'Python': 4, 'R': 4, 'Big Data': 3, 'HBase': 3, 'Java': 3, 'statistics': 3, 'regression': 3, 'probability': 3, 'Hadoop': 2, 'Cassandra': 2, 'MongoDB': 2, 'Postgres': 2, 'scikit-learn': 2, 'statsmodels': 2, 'pandas': 2, 'machine learning': 2, 'libsvm': 2, 'C++': 2, 'neural networks': 2, 'deep learning': 2, 'artificial intelligence': 2, 'Spark': 1, 'Storm': 1, 'NoSQL': 1, 'scipy': 1, 'numpy': 1, 'decision trees': 1, 'Haskell': 1, 'programming languages': 1, 'mathematics': 1, 'theory': 1, 'Mahout': 1, 'MapReduce': 1, 'databases': 1, 'MySQL': 1, 'support vector machines': 1})


透過這個計算，可以單純向使用者推薦最受大家歡迎，而他目前還沒有興趣

In [53]:
from typing import List,Tuple

def most_popular_new_interests(user_interests:List[str],max_results:int=5)->List[Tuple[str,int]]:
  suggestions = [(interest,frequency) for interest,frequency in popular_interests.most_common() if interest not in users_interests]

  return suggestions[:max_results]

1號使用者

In [54]:
most_popular_new_interests(["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],5)

[('Python', 4), ('R', 4), ('Big Data', 3), ('HBase', 3), ('Java', 3)]

3號使用者

In [55]:
most_popular_new_interests(["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],5)

[('Python', 4), ('R', 4), ('Big Data', 3), ('HBase', 3), ('Java', 3)]

# 以「人」為基礎進行協同篩選

採用「餘弦相似度」(cosine similarity)來測量兩個單詞向量的相似程度

## 解析式集合(set compprehension)
將每個不同的興趣集中起來，然後再把所有這些興趣放進一個列表，最後進行排序。


In [56]:
unique_interests = sorted({interest for user_interests in users_interests for interest in user_interests})

針對每個使用者，製作出一個由0和1組成的「興趣向量」  
根據 unique_interests 列表進行迭代操作：遇到使用者有興趣的項目設為1，其餘設為0

In [57]:
def make_user_interest_vector(user_interests:List[str]) ->List[str]:
  """
   根據使用者的興趣列表，製作出使用者的興趣向量
   如果unique_interests[i]出現在列表中，第i個元素就設為1，否則就設為0
  """
  return [1 if interest in user_interests else 0 for interest in unique_interests]

製作出一個使用者興趣向量列表：

In [58]:
user_interest_vectors = [make_user_interest_vector(user_interests) for user_interests in users_interests]
# print(user_interest_vectors)

計算出兩兩成對使用者相似度

In [59]:
import math
def dot(v, w):
    """v_1 * w_1 + ... + v_n * w_n"""
    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def cosine_similarity(v, w):
    return dot(v, w) / math.sqrt(dot(v, v) * dot(w, w))

In [60]:
user_similarities = [[cosine_similarity(interest_vector_i,interest_vector_j) for interest_vector_j in user_interest_vectors] for interest_vector_i in user_interest_vectors]
print(user_similarities)

[[1.0, 0.3380617018914066, 0.0, 0.0, 0.0, 0.1543033499620919, 0.0, 0.0, 0.1889822365046136, 0.5669467095138409, 0.0, 0.0, 0.0, 0.1690308509457033, 0.0], [0.3380617018914066, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6, 0.0], [0.0, 0.0, 1.0, 0.18257418583505536, 0.0, 0.16666666666666666, 0.0, 0.20412414523193154, 0.0, 0.0, 0.23570226039551587, 0.0, 0.47140452079103173, 0.0, 0.0], [0.0, 0.0, 0.18257418583505536, 1.0, 0.22360679774997896, 0.3651483716701107, 0.4472135954999579, 0.0, 0.0, 0.0, 0.5163977794943222, 0.22360679774997896, 0.5163977794943222, 0.0, 0.2581988897471611], [0.0, 0.0, 0.0, 0.22360679774997896, 1.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5773502691896258], [0.1543033499620919, 0.0, 0.16666666666666666, 0.3651483716701107, 0.0, 1.0, 0.0, 0.0, 0.0, 0.20412414523193154, 0.23570226039551587, 0.20412414523193154, 0.47140452079103173, 0.0, 0.0], [0.0, 0.0, 0.0, 0.4472135954999579, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.2886751345948129, 0.25, 0.0, 0.0, 

In [61]:
# 0 和 9 這兩個使用者共同的興趣為 Hadoop、Java、Big Data
assert 0.56 < user_similarities[0][9] < 0.58 ,"有好幾個共同的興趣"
# 0 和 8 這兩個使用者只有一個共同的興趣為：Big Data
assert 0.18 < user_similarities[0][8] < 0.20 ,"只有一個共同的興趣"

針對給定的使用者，找出另一個與他最相似的使用者

In [62]:
def most_similar_users_to(user_id:int) ->List[Tuple[int,float]]:
  pairs = [(other_user_id,similarity) for other_user_id,similarity in enumerate(user_similarities[user_id]) if user_id != other_user_id and similarity > 0]

  return sorted(pairs,key= lambda pair:pair[-1],reverse=True)

In [63]:
most_similar_users_to(0)

[(9, 0.5669467095138409),
 (1, 0.3380617018914066),
 (8, 0.1889822365046136),
 (13, 0.1690308509457033),
 (5, 0.1543033499620919)]

運用這些資料：  
針對每個興趣，把其他所有使用者相似度的值全部加總起來

In [64]:
from collections import defaultdict

def user_based_suggestions(user_id:int,include_current_interests:bool=False):
  # 把相似度加總起來
  suggestions:Dict[str,float]= defaultdict(float)
  for other_user_id,similarity in most_similar_users_to(user_id):
    for interest in users_interests[other_user_id]:
      suggestions[interest] += similarity
  
  # 把他們轉換成一個排序過的序列
  suggestions = sorted(suggestions.items(),
             key=lambda pair:pair[-1], #權重
             reverse=True)
  
  # 接著排除掉已經有的興趣
  if include_current_interests:
    return suggestions
  else:
    return [(suggestion,weight) for suggestion,weight in suggestions if suggestion not in users_interests[user_id]]

調用針對使用者0的推薦興趣

In [65]:
user_based_suggestions(0)

[('MapReduce', 0.5669467095138409),
 ('MongoDB', 0.50709255283711),
 ('Postgres', 0.50709255283711),
 ('NoSQL', 0.3380617018914066),
 ('neural networks', 0.1889822365046136),
 ('deep learning', 0.1889822365046136),
 ('artificial intelligence', 0.1889822365046136),
 ('databases', 0.1690308509457033),
 ('MySQL', 0.1690308509457033),
 ('Python', 0.1543033499620919),
 ('R', 0.1543033499620919),
 ('C++', 0.1543033499620919),
 ('Haskell', 0.1543033499620919),
 ('programming languages', 0.1543033499620919)]

若項目數量過大，會發生維度的詛咒

# 以物為基礎進行協同篩選
直接計算興趣之間的相似度  
直接根據每個使用者目前的興趣，找出其他類似的興趣

In [66]:
interest_user_matrix = [[user_interest_vector[j] for user_interest_vector in user_interest_vectors] for j,_ in enumerate(unique_interests)]
# print(interest_user_matrix)

運用餘弦相似度的概念，找出彼此間的相似度

In [67]:
interest_similarities = [[cosine_similarity(user_vector_i,user_vector_j) for user_vector_j in interest_user_matrix] for user_vector_i in interest_user_matrix]

In [68]:
def most_similar_interests_to(interest_id:int):
  similarities = interest_similarities[interest_id]
  pairs = [(unique_interests[other_interest_id],similarity) for other_interest_id,similarity in enumerate(similarities) if interest_id != other_interest_id and similarity > 0]

  return sorted(pairs,
          key = lambda pair:pair[-1],
          reverse=True
          )

找出與Big Data (興趣0) 最相似的興趣

In [69]:
most_similar_interests_to(0)

[('Hadoop', 0.8164965809277261),
 ('Java', 0.6666666666666666),
 ('MapReduce', 0.5773502691896258),
 ('Spark', 0.5773502691896258),
 ('Storm', 0.5773502691896258),
 ('Cassandra', 0.4082482904638631),
 ('artificial intelligence', 0.4082482904638631),
 ('deep learning', 0.4082482904638631),
 ('neural networks', 0.4082482904638631),
 ('HBase', 0.3333333333333333)]

針對使用者的每個興趣，將類似興趣的相似度累加起來  
建立相對應的推薦資料

In [70]:
def item_based_suggestions(user_id:int,include_current_interests:bool=False):
  # 把類似興趣的相似度累加起來
  suggestions = defaultdict(float)

  user_interest_vector = user_interest_vectors[user_id]
  for interest_id,is_interested in enumerate(user_interest_vector):
    if is_interested == 1:
      similar_interests = most_similar_interests_to(interest_id)
      for interest,similarity in similar_interests:
        suggestions[interest] += similarity
  
  # 根據權重值進行排序
  suggestions = sorted(suggestions.items(),
              key = lambda pair:pair[-1],
              reverse = True)
  
  if include_current_interests:
    return suggestions
  else:
    return [(suggestion,weight) for suggestion,weight in suggestions if suggestion not in users_interests[user_id]]

以0這位使用者的紀錄，推薦給他的結果如下

In [71]:
item_based_suggestions(0)

[('MapReduce', 1.861807319565799),
 ('MongoDB', 1.3164965809277263),
 ('Postgres', 1.3164965809277263),
 ('NoSQL', 1.2844570503761732),
 ('MySQL', 0.5773502691896258),
 ('databases', 0.5773502691896258),
 ('Haskell', 0.5773502691896258),
 ('programming languages', 0.5773502691896258),
 ('artificial intelligence', 0.4082482904638631),
 ('deep learning', 0.4082482904638631),
 ('neural networks', 0.4082482904638631),
 ('C++', 0.4082482904638631),
 ('Python', 0.2886751345948129),
 ('R', 0.2886751345948129)]

# 矩陣分解
運用模型來**預設**使用者針對給定項目所給出的評分

取得資料集  
* [原始資料位址](http://files.grouplens.org/datasets/movielens/ml-100k.zip)
* 將所需資料備份到Github，以pandas模組讀取

In [72]:
from dateutil.parser import parse
import io
import requests
url1="https://raw.githubusercontent.com/hank199599/data_science_from_scratch_reading_log/main/attached_data/CH%2023/u.item"
url2="https://raw.githubusercontent.com/hank199599/data_science_from_scratch_reading_log/main/attached_data/CH%2023/u.data"
MOVIES=requests.get(url1).content
RATINGS=requests.get(url2).content

引用NamedTuple來協助整理資料

In [73]:
from typing import NamedTuple

class Rating(NamedTuple):
  user_id:str
  movie_id:str
  rating:float

讀取電影檔案資料

In [78]:
import pandas as pd

data = pd.read_csv(io.StringIO(MOVIES.decode('iso-8859-1')),delimiter="|",header=None)
movies = {movie_id:title for movie_id,title,*_ in zip(data[0],data[1])}

讀取評分檔案資料  
[user_id,movie_id,rating],timestap  
‐‐‐‐‐取這三個欄位‐‐‐‐‐ 


In [75]:
import pandas as pd

data=pd.read_csv(io.StringIO(RATINGS.decode('iso-8859-1')),delimiter="\t",header=None)
ratings = {Rating(user_id,movie_id,float(raring)) for user_id,movie_id,raring in zip(data[0],data[1],data[2])}

驗證讀取的資料是否有誤

In [76]:
assert len(movies) == 1682
assert len(list({rating.user_id for rating in ratings})) == 943

針對這些資料，進行分析與探索

In [98]:
import re

# 按照多個電影編號進行累計，需要用到的一個資料結構
star_wars_ratings = {movie_id:[] for movie_id,title in movies.items() if re.search("Star Wars|Empire Strikes|Jedi",title)}

# 針對星際大戰系列電影，以迭代的方式累計計分
for rating in ratings:
  if rating.movie_id in star_wars_ratings:
    star_wars_ratings[rating.movie_id].append(rating.rating)
    
# 計算每部電影的平均分數
avg_ratings = [(sum(title_ratings)/len(title_ratings),movie_id) for movie_id,title_ratings in star_wars_ratings.items()]

# 依照排序列印相應的分數
for avg_rating,movie_id in sorted(avg_ratings,reverse=True):
  print(f"{avg_rating:.2f} {movies[movie_id]}")
                    

4.36 Star Wars (1977)
4.20 Empire Strikes Back, The (1980)
4.01 Return of the Jedi (1983)


試著提出一個模型為基準，再找出更好的結果

In [105]:
import random
random.seed(0)
#random.shuffle(ratings)

split1 = int(len(ratings)*0.7)
split2 = int(len(ratings)*0.85)

train = ratings[:split1]       # 占所有資料的 70%        
validation = ratings[split1:split2] # 占所有資料的 15%
test = ratings[split2:]       # 占所有資料的 15%

TypeError: ignored