# Inside Movies

Analysing movie data from IMDB.

In [None]:
# Import necessary modules

import re
import requests
from bs4 import BeautifulSoup

from ipywidgets import widgets
from IPython.display import display, Javascript
import json

## Parse HTML from IMDB and generate Network

Use BeautifulSoup to pull data out of IMDB title and person pages and generate a network visualization.

### Helpers

In [None]:
# Based on URL extract the IMDB id
def get_imdb_id(href_text):
    """Extract an IMDB identifier such as ``tt0110912`` from a URL or href.

    The pattern matches a run of word characters that ends in seven digits,
    so it picks up the id together with its letter prefix (``tt``, ``nm``).

    Args:
        href_text: URL or href string to search. ``None`` or an empty
            string is accepted and yields ``None``.

    Returns:
        The first matching identifier in ``href_text``, or ``None`` when
        there is none.
    """
    if not href_text:
        return None

    # Raw string: '\w' and '\d' are invalid escapes in a plain literal and
    # trigger a warning on recent Python versions.
    m = re.search(r'(?:[\w]*[\d]{7})', href_text)
    return m.group(0) if m else None

# Generate soup from URL
def get_soup(url):
    """Download ``url`` and return its body parsed by BeautifulSoup.

    The raw response bytes are parsed with the ``html.parser`` backend.
    """
    page = requests.get(url)
    markup = page.content
    return BeautifulSoup(markup, 'html.parser')

### Network base

In [None]:
# Initialize network data
nodes = []
edges = []


def add_node(id_, label, group, image = None):
    """Return the node whose id is ``id_``, registering a new one if needed.

    A new node is a dict with ``id``, ``label`` and ``group``; when ``image``
    is given it also gets ``shape``, ``image`` and ``size`` entries. An
    already registered node is returned untouched, so the other arguments
    are ignored in that case.
    """
    for existing in nodes:
        if existing['id'] == id_:
            return existing

    created = {'id': id_, 'label': label, 'group': group}
    if image is not None:
        created.update(shape='circularImage', image=image, size=20)
    nodes.append(created)
    return created


def add_edge(node_from, node_to, label):
    """Register a labelled edge between two nodes unless it already exists.

    Two edges are considered the same when their ``from`` id, ``to`` id and
    ``label`` all match.
    """
    source_id = node_from['id']
    target_id = node_to['id']

    already_linked = any(
        edge['from'] == source_id
        and edge['to'] == target_id
        and edge['label'] == label
        for edge in edges
    )
    if not already_linked:
        edges.append({'from': source_id, 'to': target_id, 'label': label})

### Parse Person page

In [None]:
def parse_person(imdb_id, max_depth, base_node = None, current_depth = 1):
    """Add a person's "known for" movies to the network and recurse into them.

    Args:
        imdb_id: IMDB person id (``nm...``) whose page is fetched.
        max_depth: Recursion limit; nothing is fetched once
            ``current_depth`` exceeds it.
        base_node: Network node that represents this person. When omitted
            (the person is the starting point of the graph) a ``Person``
            node is created from the fetched page.
        current_depth: Depth of this call in the person/title recursion.

    Returns:
        None. Results are recorded through ``add_node`` / ``add_edge``.
    """
    if current_depth > max_depth:
        return

    base_url = 'http://www.imdb.com/name/'
    soup = get_soup(base_url + imdb_id)

    if base_node is None:
        # No caller-supplied node: build one so the image and edges below
        # have something to attach to.
        # NOTE(review): assumes the page's first <h1> holds the person's
        # name -- confirm against the current page markup. Falls back to
        # the id when no heading text is found.
        heading = soup.find('h1')
        label = heading.get_text(strip=True) if heading is not None else ''
        base_node = add_node(imdb_id, label or imdb_id, 'Person')

    # The portrait cell is optional; only decorate the node when an <img>
    # is actually present.
    portrait_cell = soup.find('td', id='img_primary')
    image = portrait_cell.find('img') if portrait_cell is not None else None
    if image is not None:
        base_node['shape'] = 'circularImage'
        base_node['image'] = image.get('src')
        base_node['size']  = 20

    for movie_tag in soup.select('div#knownfor div.knownfor-title'):
        # Look the link up once instead of once per extracted field.
        role = movie_tag.find('div', class_ = 'knownfor-title-role')
        link = role.find('a') if role is not None else None
        href = link.get('href') if link is not None else None
        if not href:
            continue

        movie_id = get_imdb_id(href)
        if movie_id is None:
            # Without an id there is no node key and no page to fetch.
            continue

        poster = movie_tag.find('img')
        poster_src = poster.get('src') if poster is not None else None

        node = add_node(movie_id, link.text.strip(), 'Movie', poster_src)
        add_edge(node, base_node, 'KF')
        parse_title(imdb_id = movie_id, max_depth = max_depth, base_node = node, current_depth = current_depth + 1)

### Parse Title page

In [None]:
def parse_title(imdb_id, max_depth, base_node = None, current_depth = 1):
    """Add a title's genres, directors, writers and stars to the network.

    Args:
        imdb_id: IMDB title id (``tt...``) whose page is fetched.
        max_depth: Recursion limit; nothing is fetched once
            ``current_depth`` exceeds it.
        base_node: Network node that represents this title. When omitted a
            ``Movie`` node is created from the page heading and poster.
        current_depth: Depth of this call in the person/title recursion.

    Returns:
        None. Results are recorded through ``add_node`` / ``add_edge``; each
        person found is expanded further with ``parse_person``.
    """
    if current_depth > max_depth:
        return

    base_url = 'http://www.imdb.com/title/'
    soup = get_soup(base_url + imdb_id)

    if not base_node:
        heading = soup.find('h1', itemprop='name')
        if heading is None:
            # No recognisable heading: fall back to the id as the label
            # rather than failing on a missing element.
            movie = imdb_id
        else:
            movie = heading.text
            # The year span is optional; strip it from the label only
            # when it is present.
            year_tags = heading.select('span#titleYear')
            if year_tags:
                movie = movie.replace(year_tags[0].text, '')
            movie = movie.strip()

        poster = soup.find('div', class_ = 'poster')
        poster_img = poster.find('img') if poster is not None else None
        movie_image = poster_img.get('src') if poster_img is not None else None

        base_node = add_node(imdb_id, movie, 'Movie', movie_image)

    def get_plot_data(key):
        """Collect id/name pairs of the people tagged with ``itemprop=key``."""
        people = []
        for tag in soup.find_all('span', itemprop = key, itemtype = 'http://schema.org/Person'):
            link = tag.find('a')
            name_tag = tag.find('span', itemprop='name')
            href = link.get('href') if link is not None else None
            person_id = get_imdb_id(href) if href else None
            # Skip entries that lack a usable link or name instead of
            # aborting the whole page.
            if person_id is None or name_tag is None:
                continue
            people.append({'id': person_id, 'name': name_tag.text})
        return people

    directors = get_plot_data('director')
    writers = get_plot_data('creator')
    stars = get_plot_data('actors')

    # Genres are linked from the title header block, when there is one.
    wrappers = soup.select('.title_wrapper')
    genre_links = wrappers[0].find_all('a', href = re.compile('/genre/')) if wrappers else []

    for genre_link in genre_links:
        # The href without its '?ref...' suffix serves as the node id.
        genre_id = genre_link.get('href').split('?ref')[0]
        add_edge(add_node(genre_id, genre_link.text, 'Genre'), base_node, 'G')

    def add_person_items(items, label):
        """Link each person to this title and expand their own page."""
        for item in items:
            node = add_node(item['id'], item['name'], 'Person')
            add_edge(node, base_node, label)

            parse_person(
                imdb_id = item['id'],
                max_depth = max_depth,
                base_node = node,
                current_depth = current_depth + 1
            )

    add_person_items(directors, 'D')
    add_person_items(writers, 'W')
    add_person_items(stars, 'S')

### Parse the data and populate network

In [None]:
def populate_network_data(imdb_url, max_depth = 3):
    """Fill the network starting from an IMDB title or person URL.

    Args:
        imdb_url: URL (or href) containing an IMDB id.
        max_depth: How many title/person levels to follow from the start.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    imdb_id = get_imdb_id(imdb_url)

    if imdb_id is None:
        print('Could not extract IMDB id from the URL. Insert a valid IMDB url entry.')
        return

    if 'tt' in imdb_id:
        parse_title(imdb_id, max_depth = max_depth)
    elif 'nm' in imdb_id:
        parse_person(imdb_id, max_depth = max_depth)
    else:
        print('This type of entry is not parseable.')
        # Nothing was loaded, so do not announce success below.
        return

    print('Data is loaded! Run the next cell.')

## Select Movie input

In [None]:
# Text box for the movie query, text box for the chosen result index, and an
# HTML label that shows the status / result list.
search_input = widgets.Text(placeholder='Pulp Fiction...')
index_input = widgets.Text(placeholder='1...')
label_results = widgets.HTML(value='')

# Stack the caption, the query box and the results label vertically.
box_search = widgets.VBox([
    widgets.Label(value='Search movie:'),
    search_input,
    label_results
])

display(box_search)

# Search results; filled by handle_search_submit and indexed by
# handle_index_submit.
titles = []
# NOTE: the handlers below assign to local variables named `title` and
# `movie`, so these two module-level values are not updated by them.
title = None
movie = ''

# Find IMDB titles results for the query
def handle_search_submit(sender):
    """Search IMDB for the typed query and list the matching titles.

    Replaces the contents of the module-level ``titles`` list with the
    results of this search, shows them numbered in ``label_results`` and
    displays the index input box.

    Args:
        sender: The widget that triggered the submit (unused).
    """
    label_results.value = 'Loading...'

    # Let requests build the query string so that spaces and special
    # characters in the search text are URL-encoded correctly.
    page = requests.get(
        'http://www.imdb.com/find',
        params={'q': search_input.value, 's': 'all'}
    )
    soup = BeautifulSoup(page.content, 'html.parser')

    def has_titles(tag):
        """Tell whether a result section is the 'Titles' one."""
        header = tag.find('h3', class_='findSectionHeader')
        return header is not None and 'Titles' in header.text

    title_sections = [
        section
        for section in soup.find_all('div', class_='findSection')
        if has_titles(section)
    ]

    # Default to no results so a page without a Titles section is handled
    # instead of leaving result_list unbound.
    result_list = []
    if title_sections:
        table = title_sections[0].find('table', class_='findList')
        if table is not None:
            result_list = table.find_all('td', class_='result_text')

    # Drop results of any previous search so the displayed numbers match
    # the positions used by handle_index_submit.
    titles.clear()
    rows_html = []

    for item in result_list:
        link = item.find('a')
        if link is None:
            continue
        titles.append({
            'url': link.get('href'),
            'text': item.text
        })
        rows_html.append('<b>{0}</b> - {1}<br/>'.format(len(titles), item.text))

    if not titles:
        label_results.value = 'No titles found.'
        return

    label_results.value = ''.join(rows_html)

    box_index = widgets.VBox([
        widgets.Label(value="Type the index of the desired movie to analyze:"),
        index_input
    ])
    display(box_index)

# Select the item to generate network
def handle_index_submit(sender):
    """Load the network data for the search result the user picked.

    The typed value is the 1-based number shown next to each result.
    Invalid or out-of-range input prints a hint and loads nothing.

    Args:
        sender: The widget that triggered the submit (unused).
    """
    try:
        index = int(index_input.value) - 1
    except ValueError:
        print('Type the index of one of the listed movies.')
        return

    # Reject 0 and negatives explicitly: a negative list index would
    # silently select a result counted from the end.
    if not 0 <= index < len(titles):
        print('Type the index of one of the listed movies.')
        return

    title = titles[index]
    print('Loading data for: ' + title['text'] + '\nWait...')
    populate_network_data(title['url'])

# Run the handlers when the user submits the corresponding text box.
# NOTE(review): newer ipywidgets releases mark Text.on_submit as deprecated --
# confirm against the installed version.
search_input.on_submit(handle_search_submit)
index_input.on_submit(handle_index_submit)

## Show Movie network visualization

> See the README for how to install the vis.js library.

In [None]:
%%html
<div id="imdb-movie-network">HTML element to be a container for the network</div>

In [None]:
# Export the collected network to the browser as `window.jsonGraph`.
if not nodes:
    # Nothing has been loaded (or this cell was already run and reset the
    # data): report it instead of failing on nodes[0].
    print('No network data loaded. Search and select a movie first.')
else:
    # Increase the size of the first node
    if 'size' in nodes[0]:
        nodes[0]['size'] *= 2

    # Transform the graph into a JSON graph
    data = { 'nodes': nodes, 'edges': edges }
    jsonGraph = json.dumps(data, indent = 4)

    # Clear nodes and edges so the next search starts from an empty graph
    nodes = []
    edges = []

    # Send to Javascript; display() is needed because the call is no longer
    # the last expression of the cell.
    display(Javascript("""window.jsonGraph={};""".format(jsonGraph)))

### Load vis and generate network visualization inside the container

In [None]:
%%javascript
// Map the module name `vis` to the path `vis` for RequireJS.
requirejs.config({
    paths: {
        vis: 'vis'
    }
});

require(['vis'], function (vis) {
    // Default appearance of nodes and edge labels.
    var nodeDefaults = { shape: 'dot', size: 10, borderWidth: 2 };
    var edgeDefaults = { font: { size: 12 } };

    // Draw the JSON graph exported from the Python cell inside the
    // container element created by the %%html cell.
    var network = new vis.Network(
        document.getElementById('imdb-movie-network'),
        window.jsonGraph,
        {
            width: '900px',
            height: '500px',
            nodes: nodeDefaults,
            edges: edgeDefaults
        }
    );
});

> View the network inside the HTML container. (4 cells above)

# References


- [https://www.codementor.io/isaib.cicourel/visjs-visualization-in-jupyter-notebook-phgb3fjv0][1]
- [https://ipywidgets.readthedocs.io/en/latest/][2]
- [http://visjs.org/docs/network/][3]

[1]: https://www.codementor.io/isaib.cicourel/visjs-visualization-in-jupyter-notebook-phgb3fjv0
[2]: https://ipywidgets.readthedocs.io/en/latest/
[3]: http://visjs.org/docs/network/