© HeadFirst AI, 2020

# Building a YouTube Video Recommender
Let's build our own AI for making YouTube video recommendations! This notebook contains data for 6,000 YouTube videos that were trending from 2017 to 2018. By experimenting with various data and features, can you design the best recommender system?

**To get started, run the cell below.**

In [None]:
## Run to set up: load model + data
from bs4 import BeautifulSoup
import gensim
from IPython.display import display, HTML
import numpy
from operator import itemgetter
import pandas
from pandas.core.common import SettingWithCopyWarning
from pprint import pprint
import re
import requests
import time
import warnings

print("Loading model...")

# Silence the warning categories that would otherwise clutter the notebook
# output: gensim's User/Future warnings and pandas' SettingWithCopyWarning.
for _gensim_category in (UserWarning, FutureWarning):
  warnings.filterwarnings(action='ignore', category=_gensim_category, module='gensim')
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)

# Words that preprocess_tokenize drops before embedding a sentence.
# Contraction entries are spelled with a space where the apostrophe would be
# (e.g. 'you re', 'don t').
stopwords = {
  'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
  'you', 'you re', 'you ve', 'you ll', 'you d', 'your', 'yours', 'yourself', 'yourselves',
  'he', 'him', 'his', 'himself', 'she', 'she s', 'her', 'hers', 'herself',
  'it', 'it s', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
  'what', 'which', 'who', 'whom', 'this', 'that', 'that ll', 'these', 'those',
  'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
  'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
  'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
  'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
  'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
  'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
  'here', 'there', 'when', 'where', 'why', 'how',
  'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
  'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
  's', 't', 'can', 'will', 'just', 'don', 'don t', 'should', 'should ve', 'now',
  'd', 'll', 'm', 'o', 're', 've', 'y',
  'ain', 'aren', 'aren t', 'couldn', 'couldn t', 'didn', 'didn t',
  'doesn', 'doesn t', 'hadn', 'hadn t', 'hasn', 'hasn t', 'haven', 'haven t',
  'isn', 'isn t', 'ma', 'mightn', 'mightn t', 'mustn', 'mustn t',
  'needn', 'needn t', 'shan', 'shan t', 'shouldn', 'shouldn t',
  'wasn', 'wasn t', 'weren', 'weren t', 'won', 'won t', 'wouldn', 'wouldn t',
}

def youtube_url_from_id(youtube_id):
  """Return the YouTube watch-page URL for the given video ID."""
  return "https://www.youtube.com/watch?v={}".format(youtube_id)

def scrape_youtube_info(youtube_id, timeout=10):
  """Scrape a video's watch page and return its metadata as a one-row DataFrame.

  The returned frame has the columns video_id, title, tags, views,
  description, likes, dislikes, category and channel_title. Any field that
  cannot be found in the page falls back to a placeholder ("None" for text,
  0. for counts) instead of raising.

  Args:
    youtube_id: ID of the video (the `v=` part of the watch URL).
    timeout: seconds to wait for each HTTP request before giving up, so a
      stalled connection cannot hang the notebook indefinitely.

  Returns:
    pandas.DataFrame with exactly one row.
  """
  row = {"video_id": youtube_id}

  url = youtube_url_from_id(youtube_id)
  source = requests.get(url, timeout=timeout).text
  soup = BeautifulSoup(source, 'lxml')

  title = soup.find("title")
  row["title"] = title.text if title is not None else "None"

  # Tags are stored pipe-separated in a single string.
  tag_elements = soup.find_all("meta", property="og:video:tag")
  row["tags"] = "|".join(tag["content"] for tag in tag_elements)

  view_count = soup.find("meta", itemprop="interactionCount")
  row["views"] = int(view_count["content"]) if view_count is not None else 0.

  description = soup.find("meta", itemprop="description")
  row["description"] = description["content"] if description is not None else "None"

  # Like/dislike counts and the category only appear inside the page's
  # embedded JSON, so they are pulled out of the raw source with regexes.
  # Raw strings keep `\d` / `\w` from being parsed as string escapes.
  like_count = re.search(r'"label":"([\d,]+) likes"', source)
  row["likes"] = int(like_count.group(1).replace(",", "")) if like_count is not None else 0.

  dislike_count = re.search(r'"label":"([\d,]+) dislikes"', source)
  row["dislikes"] = int(dislike_count.group(1).replace(",", "")) if dislike_count is not None else 0.

  category = re.search(r'"category":"(\w+)"', source)
  # NOTE(review): the fallback here is numeric (0.) although a matched
  # category is text; kept as-is to preserve existing behavior.
  row["category"] = category.group(1) if category is not None else 0.

  # The channel title is not read from the watch page; it is taken from the
  # <title> of the channel's own page, which costs a second request.
  channel_id = soup.find("meta", itemprop="channelId")
  if channel_id is not None:
    channel_id_str = channel_id["content"]
    channel_url = f"https://www.youtube.com/channel/{channel_id_str}"
    channel_source = requests.get(channel_url, timeout=timeout).text
    channel_soup = BeautifulSoup(channel_source, 'lxml')

    channel_title = channel_soup.find("title")
    row["channel_title"] = channel_title.text if channel_title is not None else "None"
  else:
    row["channel_title"] = "None"

  # Wrap every value in a list so from_dict builds a single-row frame.
  row = {k: [v] for k, v in row.items()}
  return pandas.DataFrame.from_dict(row)

def get_embedding(model, sentence):
  """Embed a sentence as the mean of its word vectors.

  The sentence is tokenized with preprocess_tokenize and the tokens are
  averaged with get_mean_vector, whose result is returned unchanged.
  """
  return get_mean_vector(model, preprocess_tokenize(sentence))

def preprocess_tokenize(sentence, stop_words=None):
  """Turn free text into a list of lower-case word tokens without stopwords.

  Steps: every non-alphanumeric character becomes a space, the text is split
  on whitespace, each token is split at camel-case boundaries, and the
  resulting words are lower-cased. Stopwords are removed *after* splitting
  and lower-casing, so capitalized stopwords ("The", "How") and stopwords
  embedded in camel-case tokens ("TheOffice") are filtered too.

  Args:
    sentence: any value; it is converted with str() first.
    stop_words: optional collection of lower-case words to drop. Defaults to
      the module-level `stopwords` set.

  Returns:
    list of lower-case word strings, in their original order.
  """
  if stop_words is None:
    stop_words = stopwords

  # Restrict to alphanumeric
  cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in str(sentence))

  # Tokenize, split camel case, lower-case, then remove stopwords
  words = []
  for token in cleaned.split():
    for token_word in split_camel(token):
      word = token_word.lower()
      if word not in stop_words:
        words.append(word)

  return words

def split_camel(token):
  """Split a token at camel-case boundaries, e.g. "YouTube" -> ["You", "Tube"].

  A space is inserted before every run of capitals and then before every
  capitalized word, so an acronym followed by a word is separated as well
  ("HTMLParser" -> ["HTML", "Parser"]). Tokens without capitals are returned
  as a single-element list.
  """
  spaced_capitals = re.sub('([A-Z]+)', r' \1', token)
  spaced_words = re.sub('([A-Z][a-z]+)', r' \1', spaced_capitals)
  return spaced_words.split()

def get_mean_vector(model, words):
  """Average the embedding vectors of the words the model knows.

  Args:
    model: word-vector model supporting `word in model` and indexing with a
      list of words (the notebook passes gensim KeyedVectors).
    words: iterable of word strings.

  Returns:
    The element-wise mean vector of the in-vocabulary words, or None when no
    word is in the vocabulary.
  """
  # remove out-of-vocabulary words
  # Membership is tested with `in` rather than `model.vocab`, because the
  # `vocab` attribute no longer exists on gensim 4 KeyedVectors.
  known_words = [word for word in words if word in model]
  if not known_words:
    return None

  return numpy.mean(model[known_words], axis=0)

def get_random_video_id(videos):
  """Return the video_id of one randomly sampled row of `videos`."""
  sampled_row = videos.sample().iloc[0]
  return sampled_row["video_id"]

def semantic_similarity(column, query_row, candidate_row):
  """Score how close two rows are in meaning for one text column.

  Returns 1. when the raw values are equal, 0. when either side has no
  embedding, and otherwise the cosine of the angle between the query's
  precomputed embedding (stored under "<column>_semantic") and the
  candidate's freshly computed embedding. Uses the module-level `model`.
  """
  # Equal raw values need no embedding work.
  if query_row[column] == candidate_row[column]:
    return 1.

  query_vector = query_row[f'{column}_semantic']
  candidate_vector = get_embedding(model, candidate_row[column])

  if query_vector is None or candidate_vector is None:
    return 0.

  # TODO: to downsize, we can implement these methods ourselves
  norm_product = numpy.linalg.norm(query_vector) * numpy.linalg.norm(candidate_vector)
  return numpy.dot(query_vector, candidate_vector) / norm_product

def edit_distance(s1, s2):
    """Return the number of single-character edits that turn s1 into s2.

    Insertions, deletions and substitutions each cost 1. Only one row of the
    dynamic-programming table is kept at a time, sized by the shorter input.
    """
    # Keep the shorter sequence along the row to minimize its width.
    shorter, longer = (s2, s1) if len(s1) > len(s2) else (s1, s2)

    previous_row = range(len(shorter) + 1)
    for row_number, long_char in enumerate(longer, start=1):
        current_row = [row_number]
        for column, short_char in enumerate(shorter):
            if short_char == long_char:
                # Matching characters carry the diagonal cost forward.
                cost = previous_row[column]
            else:
                cost = 1 + min(previous_row[column], previous_row[column + 1], current_row[-1])
            current_row.append(cost)
        previous_row = current_row
    return previous_row[-1]

def character_similarity(column, query_row, candidate_row):
  """Score how alike two rows are, character by character, for one column.

  Returns 1. for equal values. Otherwise both values are converted to
  strings, truncated to their first 20 characters, and scored as
  1 - edit_distance / (combined length of the two truncated strings).
  """
  query_value = query_row[column]
  candidate_value = candidate_row[column]
  if query_value == candidate_value:
    return 1.

  query_text = str(query_value)[:20]
  candidate_text = str(candidate_value)[:20]
  combined_length = 1. * (len(query_text) + len(candidate_text))

  return 1. - edit_distance(query_text, candidate_text) / combined_length

def numeric_similarity(column, query_row, candidate_row):
  """Score how close two rows are numerically for one column, in [0, 1].

  The score is 1 minus the absolute difference relative to the larger
  magnitude, so 50 vs 100 gives 0.5. For non-negative inputs this equals
  1 - |a - b| / max(a, b).

  Returns:
    1. when the values are equal (as stored or after conversion to float),
    0. when either value cannot be converted to a finite number, and the
    relative score clamped to [0, 1] otherwise.
  """
  if query_row[column] == candidate_row[column]:
    return 1.

  # Only the conversions are expected to fail (None, non-numeric text,
  # integers too large for a float); anything else should surface.
  try:
    num1 = float(query_row[column])
    num2 = float(candidate_row[column])
  except (TypeError, ValueError, OverflowError):
    return 0.

  # NaN / inf would otherwise leak into the ranking as NaN scores.
  if not (numpy.isfinite(num1) and numpy.isfinite(num2)):
    return 0.

  # Values such as "0" and 0 differ as stored but are numerically equal.
  if num1 == num2:
    return 1.

  # Scaling by magnitude keeps negative inputs inside [0, 1]; scale is
  # non-zero here because the two numbers differ.
  scale = max(abs(num1), abs(num2))
  return max(0., 1. - abs(num1 - num2) / scale)

# Assumes that videos are deduplicated by video_id
def find_by_id(youtube_id, videos):
  """Return the rows of `videos` whose video_id equals `youtube_id`.

  The result is a DataFrame (a single row when IDs are deduplicated), or
  None when no row matches.
  """
  id_matches = videos['video_id'] == youtube_id
  matching_rows = videos.loc[id_matches]
  return None if len(matching_rows) == 0 else matching_rows

def suggest_similar(youtube_id, videos, ranking_methods):
  """Rank the videos in `videos` by similarity to the video `youtube_id`.

  The query video is looked up in `videos`; if it is not there, its metadata
  is scraped from YouTube. Every other video is scored with the mean of the
  per-column similarities selected in `ranking_methods`.

  Args:
    youtube_id: ID of the query video.
    videos: DataFrame of candidate videos with a "video_id" column.
    ranking_methods: dict mapping a column name to "semantic", "character"
      or "numeric". Any other value (e.g. "none") leaves the column out.

  Returns:
    Up to 5 (position, score) tuples, best first. `position` is the row's
    integer position in `videos`, suitable for `videos.iloc`. When no column
    is selected every score is 0.
  """
  scorers = {
    "semantic": semantic_similarity,
    "character": character_similarity,
    "numeric": numeric_similarity,
  }
  # Resolve the active (column, scorer) pairs once instead of once per row.
  active_features = [
    (column, scorers[method])
    for column, method in ranking_methods.items()
    if method in scorers
  ]

  query_df = find_by_id(youtube_id, videos)

  if query_df is None:
    query_df = scrape_youtube_info(youtube_id)

  # Work on a copy: the embeddings added below must not write through to
  # the dataset.
  query_row = query_df.iloc[0].copy()

  # The query's embeddings are computed once here; semantic_similarity reads
  # them back from the "<column>_semantic" entries.
  for column, method in ranking_methods.items():
    if method == "semantic":
      semantic_column = f'{column}_semantic'
      query_row[semantic_column] = get_embedding(model, query_row[column])

  query_id = query_row["video_id"]

  # Iterate by integer position so the keys stay valid for `iloc` even when
  # the frame's index labels are not 0..n-1.
  position_to_similarity = {}
  for position in range(len(videos)):
    candidate_row = videos.iloc[position]

    if query_id == candidate_row["video_id"]:
      continue

    similarities = [
      scorer(column, query_row, candidate_row)
      for column, scorer in active_features
    ]

    # numpy.mean([]) would produce NaN (and a warning) when nothing is selected.
    position_to_similarity[position] = numpy.mean(similarities) if similarities else 0.

  top = sorted(position_to_similarity.items(), key=itemgetter(1), reverse=True)[:5]
  return top

# Load Model: pretrained word vectors in binary word2vec format
start = time.time()
model = gensim.models.KeyedVectors.load_word2vec_format(
  './data/vectors.bin.gz',
  binary=True,
)
end = time.time()
print(f"Loaded model in {end - start} seconds")

# Load Data: keep one row per video_id and renumber the rows from 0
start = time.time()
videos = (
  pandas.read_csv('./data/USvideos.csv')
  .drop_duplicates(subset=['video_id'])
  .reset_index(drop=True)
)
end = time.time()
print(f"Loaded YouTube data in {end - start} seconds")

## Exploring Our Data
Let's get familiar with the data we have to work with. We can do so by randomly sampling a couple of YouTube videos from our dataset.

Try collecting at least 5 samples. What do you notice about the data? Do you see any features that might be useful for making recommendations?

**Run the cell below to collect a sample.** (You may have to wait for the cell above to finish loading.)

In [None]:
## Display a random sample
input_youtube_id = get_random_video_id(videos)
input_df = find_by_id(input_youtube_id, videos).copy()

# Show a clickable link in place of the raw video ID
input_df["url"] = input_df["video_id"].apply(youtube_url_from_id)
input_df = input_df.drop(columns=["video_id"])

sample_html = input_df.to_html(index=False, notebook=True, render_links=True, justify="left")
display(HTML(sample_html))

## How AI understands text
As you can see, a lot of the data we've gathered on each YouTube video is textual - such as title, tags, and description. As we discussed in class, there are a couple of ways that an AI can learn to understand text.

### Character Similarity
The **edit distance** algorithm measures how many single-character changes (removals, insertions, or substitutions) are required to transform one word into another. Edit distance is famous for applications such as spell-check and DNA sequence analysis.

Let's try with a couple of words! Run the cell below to see an example of edit distance in action. Then, try substituting your own words into the code (in variables `word1` and `word2`).

In [None]:
# Change these two words to try your own examples
word1 = "their"
word2 = "there"

distance = edit_distance(word1, word2)
print(f'The edit distance between "{word1}" and "{word2}" is {distance}')

### Semantic Similarity
Semantic similarity is an AI technique that measures how close two words are in _meaning_. AI models discover the meaning of words by training on tasks such as predicting the missing word in a sentence. After lots of feedback, the AI learns to represent every word it has seen by mapping each word to a vector (like a 2D point, but with more dimensions).

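This technique is called a **word embedding**. Once these word vectors are known, we can compute the similarity of word meanings by comparing the vectors - for example, by measuring the distance between them (using the Pythagorean Theorem) or, as this notebook does, by measuring the angle between them (cosine similarity).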

Let's try it out! As before, run the cell below to see an example of semantic similarity in action. Then, try substituting your own words into the code (in variables `word1` and `word2`).

In [None]:
# Change these two words to try your own examples
word1 = "their"
word2 = "there"

## Check to see if the model recognizes the words
unknown_words = [word for word in (word1, word2) if word not in model]

if unknown_words:
  # Only the first unrecognized word is reported
  print(f'The model does not recognize {unknown_words[0]}')
else:
  sim = model.similarity(word1, word2)
  percentage = round(sim * 100, 1)
  print(f'The semantic similarity between "{word1}" and "{word2}" is {percentage}%')

### Discussion Questions
1. What did you notice when looking at both methods? Did anything surprise you?
2. What are the strengths of edit distance? What are the strengths of semantic similarity?
3. Which scenarios are best suited for each method?

## Building a YouTube Recommender
Now, we are ready to build an AI that can recommend YouTube videos based on textual features! In the cell below, we've provided a playground where we can customize which features our AI should use when suggesting similar videos. (Don't worry, you don't need to know how to code in order to customize the model).

### Instructions: Defining the Model
Edit the AI model by updating the lines under `similarity_features` below. Each row is in the format **"data": "similarity measure"** where "data" refers to an aspect of a YouTube video (such as "title") and "similarity measure" refers to the type of similarity algorithm to use (such as "semantic"). The model will generate suggestions according to the data and similarity measures you define.

Valid options for the similarity measure are listed in the comment on each line (i.e., the green text following the '#'). For textual data, either semantic or character similarity can be applied. For numeric data, numeric similarity (i.e., percentage difference) can be applied.

Choosing "none" will omit the data from the model.

### Instructions: Choosing an Input Video
The model will read in one input video that it will try to find similar videos to (you can think of this as a video you've liked that YouTube will suggest additional videos for). By default, the model will choose a random input video from the dataset. You may also uncomment the last line to feed the model a custom YouTube ID from any video on YouTube.

**Run the cell below to set your model and input video. Then, run the next cell to see your suggestions.**



In [None]:
## We can define our model here in the format `data: similarity measure`
## Each row picks the similarity measure used for one piece of video data.
## A video's final score is the average of the measures selected here;
## rows set to "none" are left out of the score.
similarity_features = {
  "title": "character",       # options: "none", "semantic", "character"
  "tags": "none",             # options: "none", "semantic", "character"
  "channel_title": "none",    # options: "none", "semantic", "character"
  "description": "none",      # options: "none", "semantic", "character"
  "category": "none",         # options: "none", "semantic", "character"
  "views": "none",            # options: "none", "numeric"
  "likes": "none",            # options: "none", "numeric"
  "dislikes": "none",         # options: "none", "numeric"
}

print("Model defined. Ready to compute suggestions.")

## By default, the input video is picked at random from the dataset
input_youtube_id = get_random_video_id(videos)

## We can also suggest similar videos for any YouTube ID
## A YouTube ID is the last part of the video URL
## e.g. the ID for https://www.youtube.com/watch?v=dQw4w9WgXcQ would be dQw4w9WgXcQ
## Uncomment the line below to specify an ID
# input_youtube_id = "dQw4w9WgXcQ"

**Run the cell below to see what the AI suggested!** (This may take up to a minute)

In [None]:
## Get suggestions for most similar videos
print("Computing suggestions...")
start = time.time()
top_id_similarities = suggest_similar(input_youtube_id, videos, similarity_features)
end = time.time()
print(f"Computed suggestions in {end - start} seconds")

# Input row: taken from the dataset when present, otherwise scraped
known_video = find_by_id(input_youtube_id, videos)
if known_video is None:
  input_df = scrape_youtube_info(input_youtube_id)
else:
  input_df = known_video.copy()

# Each suggestion is a (row position, score) pair
top_positions = [position for position, _ in top_id_similarities]
top_scores = [round(score, 3) for _, score in top_id_similarities]
output_df = videos.iloc[top_positions].copy()

# Show clickable links in place of raw video IDs
input_df["url"] = input_df["video_id"].apply(youtube_url_from_id)
output_df["url"] = output_df["video_id"].apply(youtube_url_from_id)
output_df.insert(0, "score", top_scores)

input_df = input_df.drop(columns=["video_id"])
output_df = output_df.drop(columns=["video_id", "tags", "likes", "dislikes", "description"])

table_options = dict(index=False, notebook=True, render_links=True, justify="left")

display(HTML('<h2>Input Video</h2>'))
display(HTML(input_df.to_html(**table_options)))
with pandas.option_context('display.max_colwidth', 75):
  display(HTML('<h2>Output Suggestions</h2>'))
  display(HTML(output_df.to_html(**table_options)))

## Discussion Questions
1. How did the recommendations change as the input features (data + similarity measures) were changed?
2. In your opinion, what single feature works the best for recommending videos? Why?
3. In your opinion, what group of features works the best for recommending videos?
4. Suppose you are the CEO of YouTube. How would you decide how to evaluate the quality of recommendations?
5. If you were to extend this recommender, what additional video data would you gather? What other similarity measures can you come up with?