# Import relevant libraries

In [1]:
import pandas as pd
import re, requests
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
import timeit
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.models import KeyedVectors, Word2Vec

SPECIAL_CHARS = '[^A-Za-z0-9 ]+'

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hsichengyun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hsichengyun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# create crawler class

In [2]:
class crawler:
    def __init__(self):
        """Initialise the crawler"""
        self.driver = webdriver.Chrome(ChromeDriverManager().install())
        
    def get_page(self,url):
        driver = self.driver
        driver.get(url)
    
    def scroll_down(self):
        driver = self.driver
        
        sleep(0.5)
        for i in range(20): 
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            sleep(0.6)
    
    def get_page_source(self):
        driver = self.driver
        return driver.page_source
    
    def get_title_links(self):
        driver = self.driver
        s = driver.page_source
        soup = BeautifulSoup(s, 'html.parser')
        source_titles = soup.findAll('a',{'class':'title'})
        links = [title['href'] for title in source_titles]
        return links

# set parameters

movies first

In [3]:
def flixable(content_type):
    """This function will return a flixable url with parameters set"""
    required_type = ['tv-shows','movies']
    if content_type in required_type:
        headers = {
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"}

        url = f'https://flixable.com/netflix-originals/genre/{content_type}/#filterContainer'
        my_parameters = {'min-rating':0, 'min-year':1920, 'max-year':2019, 'order':'title'}

        re_url = requests.get(url , my_parameters,headers = headers).url
        return re_url
    
    else:
        print('KeyError. Please enter either tv-shows or movies')

In [None]:
crawler = crawler()
url = flixable('tv-shows')
crawler.get_page(url)

In [None]:
pd.set_option('display.max_columns', None)  

# Get title links

In [4]:
def parser(url):
    """Return bs4 object"""
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def get_info(content_type, links):
    DF = pd.DataFrame(columns=['title', 'type', 'release_year', 'rating', 
                               'runtime', 'description', 'genres', 'cast',
                               'director', 'country', 'imdb_link', 'date_added'])
    for ix in range(len(links)):
        full_url = 'https://flixable.com' + links[ix]
        r = requests.get(full_url)
        soup = BeautifulSoup(r.text, 'html.parser')
        
        title = soup.find('h1').text
        release_year = soup.find('div',{'class':'col-lg-8'}).find('h6').findAll('span')[0].text
        rating = soup.find('div',{'class':'col-lg-8'}).find('h6').findAll('span')[1].text
        runtime = soup.find('div',{'class':'col-lg-8'}).find('h6').findAll('span')[2].text
        description = soup.find('p',{'class':'card-description'}).text
        tag_a = soup.find('div',{'class':'col-lg-8'}).findAll('a')
        genres = list()
        cast = list()
        director = list()
        country = list()
        imdb_link = list()
        for a in tag_a:
            if 'genre' in a['href']:
                genres.append(a.text)

            elif 'director' in a['href']:
                director.append(a.text)

            elif 'actor' in a['href']:
                cast.append(a.text)

            elif 'country' in a['href']:
                country.append(a.text)

            elif 'www.imdb.com' in a['href']:
                imdb_link.append(a['href'])
        date_added = soup.find('div',{'class':'col-lg-8'}).findAll('p')[-1].text.lstrip().rstrip()
        
        sep = ','
        
        df = pd.DataFrame(columns=['title', 'type', 'release_year', 'rating', 
                                   'runtime', 'description', 'genres', 'cast',
                                   'director', 'country', 'imdb_link', 'date_added'],
                         data = [{
                             'title': title,
                             'type': content_type,
                             'release_year': release_year,
                             'rating': rating,
                             'runtime': runtime,
                             'description': description,
                             'genres': sep.join(genres),
                             'cast': sep.join(cast),
                             'director': sep.join(director),
                             'country': sep.join(country),
                             'imdb_link': sep.join(imdb_link),
                             'date_added': date_added
                         }])
        DF = DF.append(df, ignore_index=True)
        
    return DF

In [None]:
# movie_links = crawler.get_title_links()
# movie_db = get_info('movies',movie_links)

In [None]:
# dataset = movie_db.append(tvshow_db, ignore_index=True)

In [None]:
# start = timeit.default_timer()
# tvshow_links = crawler.get_title_links()
# tvshow_db = get_info('tvshow', tvshow_links)
# end = timeit.default_timer()
# print(start-end)

In [5]:
def add_rv_votes(dataset):
    start = timeit.default_timer()
    rating_value = list()
    votes = list()
    count = 1
    for link in dataset['imdb_link']:
        if count == 100 or count == 300 or count == 500 or count == 700 or count == 900 or count == 1200 or count == 1400:
            print(count)
        count += 1
        try:
            soup = parser(link)
            ratingValue = soup.find('span', {'itemprop':'ratingValue'}).text
            ratingCount = soup.find('span', {'itemprop':'ratingCount'}).text
            rating_value.append(ratingValue)
            votes.append(ratingCount)
        except:
            rating_value.append('NaN')
            votes.append('NaN')
    
    dataset['rating_value'] = rating_value
    dataset['votes'] = votes
    
    stop = timeit.default_timer()
    t = int((stop-start)/60)
    print(f"It took {t} minutes to scrape rating values and votes.")

In [11]:
dataset = pd.read_csv('dataset.csv')

# NLP stage

In [13]:
def preprocess(text):
    tokenized = [word for sent in [re.sub(SPECIAL_CHARS, '', element).split(' ') for 
                                   element in nltk.sent_tokenize(text)] for word in sent]
    lowered = [word.lower() for word in tokenized]
    return lowered

In [14]:
description_texts = [preprocess(text) for text in dataset['description'].tolist()]

In [15]:
# Take out the stopwords
for i in range(len(description_texts)):
    description_texts[i] = [word for word in description_texts[i] if word not in stopwords.words('english')]

In [16]:
# I use movie title as unique key. So I map out the title and the tokenised sentences

title_text = dict(zip(dataset['title'].tolist(), description_texts))

In [17]:
model = Word2Vec(description_texts, min_count=2)

In [18]:
words = model.wv.vocab

In [19]:
vector = model.wv['media']

In [20]:
similar = model.wv.most_similar('media')

In [21]:
def get_vectors(first_map, second_map):
    first_vec  = dict()
    for uid, content in first_map.items():
        temp = list()
        for element in content:
            try:
                temp.append(second_map[element])
            except KeyError:
                pass
        first_vec[uid] = np.mean(temp, axis=0)
    
    return first_vec

In [22]:
title_vec = get_vectors(title_text, model)

  temp.append(second_map[element])


In [23]:
def get_most_similar(lookup_id):

    sim = list()
    
    lookup_map = title_vec
    subject_map = title_vec
#     else:
#         raise ValueError('Invalid value for parameter kind.')
        
    for uid, vec in lookup_map.items():
        thisSim = cosine_similarity(vec.reshape(1, -1), subject_map[lookup_id].reshape(1, -1))
        sim.append((uid, thisSim[0][0]))

    return sorted(sim, key=lambda x: x[1], reverse=True)

In [55]:
def top_10_similar(title):
    
    x = get_most_similar(title)[1:11]
    for e in x:
        rating = dataset.loc[dataset['title']==e[0]]['rating_value'].values.tolist()[0]
        votes = dataset.loc[dataset['title']==e[0]]['votes'].values.tolist()[0]
        print(f"Movie title: {e[0]}\nScores: {rating}\nVotes: {votes}\nSimilarity: {e[1]}\n")
        

In [56]:
top_10_similar('Black Mirror')

Movie title: Master of None
Scores: 8.3
Votes: 62,701
Similarity: 0.7177394032478333

Movie title: Terrace House: Aloha State
Scores: 7.4
Votes: 749
Similarity: 0.705254077911377

Movie title: Real Rob
Scores: 6.4
Votes: 2,297
Similarity: 0.6997315883636475

Movie title: Dark Tourist
Scores: 7.6
Votes: 5,353
Similarity: 0.6988909244537354

Movie title: See You Yesterday
Scores: 5.1
Votes: 8,154
Similarity: 0.6935927867889404

Movie title: The Confession Tapes
Scores: 7.5
Votes: 4,404
Similarity: 0.6907021403312683

Movie title: Motown Magic
Scores: 7.9
Votes: 234
Similarity: 0.6896839141845703

Movie title: COMEDIANS of the world
Scores: 6.5
Votes: 382
Similarity: 0.6853635311126709

Movie title: Felipe Neto: My Life Makes No Sense
Scores: 4.4
Votes: 181
Similarity: 0.6822994947433472

Movie title: The Sound of Your Heart
Scores: 8.3
Votes: 654
Similarity: 0.6809266805648804



In [54]:
filt = dataset['title'] == 'Stranger Things'
dataset.loc[filt]['votes'].values.tolist()[0]

'746,944'