© HeadFirst AI, 2020

In [None]:
## Run to set up: load model + data
from bs4 import BeautifulSoup
import gensim
from IPython.display import display, HTML
import numpy
from operator import itemgetter
import pandas
from pandas.core.common import SettingWithCopyWarning
from pprint import pprint
import re
import requests
import time
import warnings

# Ignore UserWarning / FutureWarning raised from modules matching 'gensim'.
warnings.filterwarnings('ignore', category=UserWarning, module='gensim')
warnings.filterwarnings('ignore', category=FutureWarning, module='gensim')
# Ignore pandas' SettingWithCopyWarning regardless of where it is raised.
warnings.simplefilter('ignore', category=SettingWithCopyWarning)

# Lowercase stopwords dropped during tokenization.
# Entries containing a space (e.g. 'don t') are presumably contractions whose
# apostrophe was replaced by a space -- TODO confirm against the list's origin.
stopwords = {
  # personal pronouns and possessives
  'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
  'you', 'you re', 'you ve', 'you ll', 'you d', 'your', 'yours', 'yourself', 'yourselves',
  'he', 'him', 'his', 'himself', 'she', 'she s', 'her', 'hers', 'herself',
  'it', 'it s', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
  # interrogatives and demonstratives
  'what', 'which', 'who', 'whom', 'this', 'that', 'that ll', 'these', 'those',
  # auxiliary verbs
  'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
  'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
  # articles and conjunctions
  'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
  # prepositions and adverbs
  'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
  'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
  'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
  'here', 'there', 'when', 'where', 'why', 'how',
  # quantifiers and modifiers
  'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
  'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
  # contraction fragments and modals
  's', 't', 'can', 'will', 'just', 'don', 'don t', 'should', 'should ve', 'now',
  'd', 'll', 'm', 'o', 're', 've', 'y',
  # negated contractions
  'ain', 'aren', 'aren t', 'couldn', 'couldn t', 'didn', 'didn t', 'doesn', 'doesn t',
  'hadn', 'hadn t', 'hasn', 'hasn t', 'haven', 'haven t', 'isn', 'isn t',
  'ma', 'mightn', 'mightn t', 'mustn', 'mustn t', 'needn', 'needn t',
  'shan', 'shan t', 'shouldn', 'shouldn t', 'wasn', 'wasn t',
  'weren', 'weren t', 'won', 'won t', 'wouldn', 'wouldn t',
}

def youtube_url_from_id(youtube_id):
  """Return the YouTube watch-page URL for the given video ID."""
  watch_url = f"https://www.youtube.com/watch?v={youtube_id}"
  return watch_url

def _scraped_count(label, source):
  """Return the integer in a '"label":"<n> <label>"' snippet of `source`, or 0. if absent."""
  match = re.search(r'"label":"([\d,]+) ' + label + r'"', source)
  # Counts are rendered with thousands separators, e.g. "1,234 likes"
  return int(match.group(1).replace(",", "")) if match is not None else 0.

def scrape_youtube_info(youtube_id, timeout=10):
  """Scrape metadata for one video from its YouTube watch page.

  Args:
    youtube_id: the video ID (the `v=` part of the watch URL).
    timeout: seconds to wait for each HTTP request before `requests`
      raises a timeout error, instead of hanging indefinitely.

  Returns:
    A one-row pandas.DataFrame with the columns video_id, title, tags,
    views, description, likes, dislikes, category and channel_title.
    Text fields that cannot be found are set to the string "None";
    numeric fields that cannot be found are set to 0.
  """
  row = {"video_id": youtube_id}

  url = youtube_url_from_id(youtube_id)
  source = requests.get(url, timeout=timeout).text
  soup = BeautifulSoup(source, 'lxml')

  title = soup.find("title")
  row["title"] = title.text if title is not None else "None"

  # One <meta property="og:video:tag"> element per tag; join them with "|"
  row["tags"] = "|".join(
    tag.get("content", "") for tag in soup.find_all("meta", property="og:video:tag")
  )

  view_count = soup.find("meta", itemprop="interactionCount")
  row["views"] = int(view_count["content"]) if view_count is not None else 0.

  description = soup.find("meta", itemprop="description")
  row["description"] = description["content"] if description is not None else "None"

  # Like/dislike counts and the category are not exposed as meta tags, so
  # they are pulled out of the raw page source with regular expressions.
  row["likes"] = _scraped_count("likes", source)
  row["dislikes"] = _scraped_count("dislikes", source)

  category = re.search(r'"category":"(\w+)"', source)
  # Fall back to "None" like the other text fields (previously the float 0.)
  row["category"] = category.group(1) if category is not None else "None"

  channel_id = soup.find("meta", itemprop="channelId")
  if channel_id is not None:
    # The channel name is taken from the <title> of the channel page
    channel_id_str = channel_id["content"]
    channel_url = f"https://www.youtube.com/channel/{channel_id_str}"
    channel_source = requests.get(channel_url, timeout=timeout).text
    channel_soup = BeautifulSoup(channel_source, 'lxml')

    channel_title = channel_soup.find("title")
    row["channel_title"] = channel_title.text if channel_title is not None else "None"
  else:
    row["channel_title"] = "None"

  # Wrap every value in a list so from_dict builds a single-row frame
  row = {k: [v] for k, v in row.items()}
  return pandas.DataFrame.from_dict(row)

def get_embedding(model, sentence):
  """Embed `sentence` as the mean vector of its tokens found in `model`.

  Returns whatever get_mean_vector returns for the tokenized sentence,
  which is None when no token is in the model's vocabulary.
  """
  return get_mean_vector(model, preprocess_tokenize(sentence))

def preprocess_tokenize(sentence):
  """Turn `sentence` into a list of lowercase, non-stopword words.

  Steps: replace every non-alphanumeric character with a space, split on
  whitespace, split each token on camel-case boundaries, lowercase the
  pieces, then drop the pieces that are in `stopwords`.

  Args:
    sentence: any value; it is converted with str() first.

  Returns:
    A list of lowercase words, possibly empty.
  """
  # Restrict to alphanumeric: everything else becomes a separator
  cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in str(sentence))

  words = []
  for token in cleaned.split():
    for token_word in split_camel(token):
      word = token_word.lower()
      # Filter *after* lowercasing and camel-splitting: the stopword list is
      # all lowercase, so checking the raw token let "The" or the pieces of
      # "HowTo" slip through.
      if word not in stopwords:
        words.append(word)

  return words

def split_camel(token):
  """Split a camel-case token into its parts, e.g. 'CaseyNeistat' -> ['Casey', 'Neistat']."""
  # Pass 1: insert a space in front of every run of capital letters.
  spaced_capitals = re.sub('([A-Z]+)', r' \1', token)
  # Pass 2: insert a space in front of each capital that starts a lowercase
  # word, which separates an acronym from the word after it
  # ('HTMLParser' -> 'HTML', 'Parser').
  spaced_words = re.sub('([A-Z][a-z]+)', r' \1', spaced_capitals)
  return spaced_words.split()

def get_mean_vector(model, words):
  """Return the element-wise mean of the vectors of `words` found in `model`.

  Args:
    model: a word-vector lookup supporting `word in model` and indexing
      with a list of words (e.g. gensim KeyedVectors).
    words: iterable of words; out-of-vocabulary words are ignored.

  Returns:
    The mean over the known words' vectors (axis 0), or None when none of
    the words is in the model.
  """
  # remove out-of-vocabulary words; membership is tested on the model itself
  # because the `vocab` attribute no longer exists in gensim 4
  known_words = [word for word in words if word in model]
  if not known_words:
    return None
  return numpy.mean(model[known_words], axis=0)

def get_random_video_id(videos):
  """Return the `video_id` of one randomly sampled row of `videos`."""
  sampled_rows = videos.sample()
  chosen_row = sampled_rows.iloc[0]
  return chosen_row["video_id"]

def semantic_similarity(column, query_row, candidate_row):
  """Cosine similarity between the query's and the candidate's `column` text.

  Args:
    column: name of the field to compare.
    query_row: mapping holding the raw value under `column` and its
      precomputed embedding under f'{column}_semantic'.
    candidate_row: mapping holding the raw value under `column`; its
      embedding is computed here with the module-level `model`.

  Returns:
    1. when the raw values are equal; 0. when either embedding is None or
    has zero length; otherwise the cosine of the angle between the two
    embeddings.
  """
  if query_row[column] == candidate_row[column]:
    return 1.

  semantic_column = f'{column}_semantic'

  vector1 = query_row[semantic_column]
  vector2 = get_embedding(model, candidate_row[column])

  if vector1 is None or vector2 is None:
    return 0.

  # TODO: to downsize, we can implement these methods ourselves
  norm_product = numpy.linalg.norm(vector1) * numpy.linalg.norm(vector2)
  if norm_product == 0:
    # A zero vector has no direction; dividing would give NaN, which breaks
    # the sort that ranks candidates by score.
    return 0.
  return numpy.dot(vector1, vector2) / norm_product

# Assumes that videos are deduplicated by video_id
def find_by_id(youtube_id, videos):
  """Return the rows of `videos` whose video_id equals `youtube_id`, or None if there are none."""
  matches = videos.loc[videos['video_id'] == youtube_id]
  return None if matches.empty else matches

def suggest_similar(youtube_id, videos, ranking_methods, top_n=5):
  """Rank the rows of `videos` by similarity to the video `youtube_id`.

  Args:
    youtube_id: ID of the query video. If it is not in `videos`, its
      metadata is scraped from YouTube.
    videos: DataFrame of candidate videos with a `video_id` column.
    ranking_methods: dict mapping a column name to a similarity method.
      Only the "semantic" method is implemented; other entries are ignored.
    top_n: maximum number of suggestions to return (default 5).

  Returns:
    A list of (position, score) tuples sorted by descending score, where
    `position` is the row's 0-based position in `videos` (usable with
    `videos.iloc`) and `score` is the mean similarity over the semantic
    columns.
  """
  query_df = find_by_id(youtube_id, videos)

  if query_df is None:
    query_df = scrape_youtube_info(youtube_id)

  # Copy so that adding the embedding fields below never writes into `videos`
  query_row = query_df.iloc[0].copy()

  # The method filter does not depend on the candidate, so resolve it once
  semantic_columns = [
    column for column, method in ranking_methods.items() if method == "semantic"
  ]

  # Precompute the query embeddings once; semantic_similarity reads them back
  for column in semantic_columns:
    query_row[f'{column}_semantic'] = get_embedding(model, query_row[column])

  position_to_similarity = {}
  # Track the row *position* rather than the index label, so the result is
  # valid for `videos.iloc` even when the index is not 0..n-1.
  for position, (_, candidate_row) in enumerate(videos.iterrows()):
    if query_row["video_id"] == candidate_row["video_id"]:
      continue

    similarities = [
      semantic_similarity(column, query_row, candidate_row)
      for column in semantic_columns
    ]

    # The mean of an empty list is NaN; score 0. when no column is enabled
    position_to_similarity[position] = numpy.mean(similarities) if similarities else 0.

  ranked = sorted(position_to_similarity.items(), key=itemgetter(1), reverse=True)
  return ranked[:top_n]

# Load Model: word vectors stored in the binary word2vec format
start = time.time()
model = gensim.models.KeyedVectors.load_word2vec_format(
  './data/vectors.bin.gz',
  binary=True,
)
end = time.time()
print(f"Loaded model in {end - start} seconds")

# Load Data: keep one row per video_id and renumber the rows from 0
start = time.time()
videos = (
  pandas.read_csv('./data/USvideos.csv')
  .drop_duplicates(subset=['video_id'])
  .reset_index(drop=True)
)
end = time.time()
print(f"Loaded YouTube data in {end - start} seconds")

In [None]:
## We can define our model here in the format `data: similarity measure`
## Each key is a column of the video data and each value names the measure
## used to compare that column. Only "semantic" is implemented by
## suggest_similar; the final score is the mean over the enabled columns.
## Uncomment a line below to include that column in the ranking.
similarity_features = {
  "title": "semantic",
  #"tags": "semantic",
  #"channel_title": "semantic",
  #"description": "semantic",
  #"category": "semantic",
}

## By default, the query is a video picked at random from the loaded data
input_youtube_id = get_random_video_id(videos)

## We can also suggest similar videos for any YouTube ID
## A YouTube ID is the last part of the video URL
## e.g. the ID for https://www.youtube.com/watch?v=dQw4w9WgXcQ would be dQw4w9WgXcQ
## Uncomment this line below to specify an ID
# input_youtube_id = "dQw4w9WgXcQ"

In [None]:
## Get suggestions for most similar videos
start = time.time()
top_id_similarities = suggest_similar(input_youtube_id, videos, similarity_features)
end = time.time()
print(f"Computed suggestions in {end - start} seconds")

## Look the query video up by value. `x in series` tests the Series *index*,
## not its values, so the previous membership check never matched a string ID
## and the video was always re-scraped, even when it was in the data.
input_df = find_by_id(input_youtube_id, videos)
if input_df is None:
  input_df = scrape_youtube_info(input_youtube_id)
else:
  input_df = input_df.copy()
## suggest_similar returns (row position, score) pairs
output_df = videos.iloc[[t[0] for t in top_id_similarities]].copy()

input_df["url"] = input_df["video_id"].apply(youtube_url_from_id)
output_df["url"] = output_df["video_id"].apply(youtube_url_from_id)
output_df.insert(0, "score", [round(t[1], 3) for t in top_id_similarities])

## Drop the columns that would clutter the rendered tables
input_df = input_df.drop(columns=["video_id"])
output_df = output_df.drop(columns=["video_id", "tags", "likes", "dislikes", "comment_count", "description"])

display(HTML(input_df.to_html(index=False, notebook=True, render_links=True, justify="left")))
with pandas.option_context('display.max_colwidth', 75):
  display(HTML(output_df.to_html(index=False, notebook=True, render_links=True, justify="left")))

In [None]:
# Look up the vector of a single word
dog = model['dog']
print(dog.shape)
print(dog[:10])

# Deal with an out of dictionary word: CaseyNeistat
if 'Casey' in model:
    print(model['Casey'].shape)
else:
    print('{0} is an out of dictionary word'.format('Casey'))


# Some predefined functions that show content related information for given words
print(model.most_similar(positive=['woman', 'king'], negative=['man']))

print(model.doesnt_match("breakfast cereal dinner lunch".split()))

# Check membership first so an out of dictionary word is reported the same
# way as above instead of failing on the lookup
if 'CaseyNeistat' in model:
    print(model.similarity('CaseyNeistat', 'CaseyNeistat'))
else:
    print('{0} is an out of dictionary word'.format('CaseyNeistat'))