# Inside Movies

Analysing movie data scraped from IMDb.

In [None]:
import re
import requests
from bs4 import BeautifulSoup

In [None]:
# Graph state shared by the helpers below: lists of node dicts and edge dicts.
nodes, edges = [], []

def get_imdb_id(href_text):
    """Extract an IMDb identifier (e.g. ``tt0110912``) from a URL or href.

    Returns the leftmost run of word characters that ends in seven digits,
    or ``None`` when ``href_text`` is empty/``None`` or holds no such run.
    """
    if not href_text:
        return None

    # Raw string: ``\w`` and ``\d`` are invalid escapes in a plain literal.
    match = re.search(r'\w*\d{7}', href_text)
    return match.group(0) if match else None
    
def get_imdb_soup(url, timeout=10):
    """Download ``url`` and return its body parsed with ``html.parser``.

    ``timeout`` (seconds) is forwarded to ``requests.get`` so a stalled
    connection raises instead of blocking the notebook indefinitely.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

def get_plot_data(soup, url_base, key):
    """Collect the people tagged with ``itemprop=key`` on a parsed page.

    Looks for ``<span itemprop=key itemtype="http://schema.org/Person">``
    elements and returns one dict per element with:

    * ``id``   - identifier extracted from the first link's ``href``;
    * ``name`` - text of the nested ``<span itemprop="name">``;
    * ``url``  - the link's ``href`` resolved against ``url_base``.

    Elements lacking the link (or its ``href``) or the name span are skipped
    instead of raising.
    """
    from urllib.parse import urljoin

    people = []
    for tag in soup.find_all('span', itemprop=key, itemtype='http://schema.org/Person'):
        link = tag.find('a')
        name_tag = tag.find('span', itemprop='name')
        href = link.get('href') if link is not None else None
        if not href or name_tag is None:
            continue

        people.append({
            'id': get_imdb_id(href),
            'name': name_tag.text,
            # urljoin copes with root-relative hrefs, where plain string
            # concatenation would produce a malformed URL.
            'url': urljoin(url_base, href),
        })
    return people

def add_node(id_, label, group, image = None):
    """Return the node whose id is ``id_``, registering a new one if absent.

    A new node carries ``id``, ``label`` and ``group``; when ``image`` is
    given it is additionally drawn as a size-20 ``circularImage``. A node that
    is already registered is returned unchanged.
    """
    existing = next((candidate for candidate in nodes if candidate['id'] == id_), None)
    if existing is not None:
        return existing

    created = {'id': id_, 'label': label, 'group': group}
    if image is not None:
        created.update(shape='circularImage', image=image, size=20)
    nodes.append(created)
    return created

def add_edge(node_from, node_to, label):
    """Register a labelled edge between two nodes unless it already exists.

    Edges are stored as dicts with ``from``, ``to`` (node ids) and ``label``;
    an edge counts as a duplicate only when all three fields match.
    """
    source_id = node_from['id']
    target_id = node_to['id']

    already_known = any(
        known['from'] == source_id and known['to'] == target_id and known['label'] == label
        for known in edges
    )
    if not already_known:
        edges.append({'from': source_id, 'to': target_id, 'label': label})

def parse_person(imdb_id, max_depth, base_node = None, current_depth = 1):
    """Scrape a person's IMDb page and link their "known for" titles.

    Every title found in the page's ``div#knownfor`` block becomes a ``Movie``
    node joined to the person by a ``KF`` edge and is then expanded through
    ``parse_title`` one level deeper. Nothing is fetched once
    ``current_depth`` exceeds ``max_depth``.

    ``base_node`` is the person's graph node. When it is omitted (the person
    is the entry point of the crawl) a ``Person`` node is created here.
    """
    if current_depth > max_depth:
        return

    base_url = 'http://www.imdb.com/name/'
    soup = get_imdb_soup(base_url + imdb_id)

    if base_node is None:
        # Without this the code below would subscript ``None``.
        # NOTE(review): the name selector is a best guess at the page markup;
        # the id is used as the label when it finds nothing - confirm.
        name_tag = soup.find(itemprop='name')
        label = name_tag.text.strip() if name_tag is not None else imdb_id
        base_node = add_node(imdb_id, label, 'Person')

    # Decorate the person's node with their portrait when the page has one.
    portrait_cell = soup.find('td', id='img_primary')
    portrait = portrait_cell.find('img') if portrait_cell is not None else None
    if portrait is not None:
        base_node['shape'] = 'circularImage'
        base_node['image'] = portrait.get('src')
        base_node['size'] = 20

    for movie_tag in soup.select('div#knownfor div.knownfor-title'):
        role = movie_tag.find('div', class_='knownfor-title-role')
        link = role.find('a') if role is not None else None
        href = link.get('href') if link is not None else None
        movie_id = get_imdb_id(href) if href else None
        if movie_id is None:
            # Entry without a usable title link: nothing to attach or expand.
            continue

        poster = movie_tag.find('img')
        poster_src = poster.get('src') if poster is not None else None

        node = add_node(movie_id, link.text.strip(), 'Movie', poster_src)
        add_edge(node, base_node, 'KF')
        parse_title(
            imdb_id=movie_id,
            max_depth=max_depth,
            base_node=node,
            current_depth=current_depth + 1,
        )
    
def parse_title(imdb_id, max_depth, base_node = None, current_depth = 1):
    """Scrape a title's IMDb page and add its genres and people to the graph.

    Genres are linked to the title with ``G`` edges; directors, writers and
    stars with ``D``, ``W`` and ``S`` edges respectively. Each person is then
    expanded through ``parse_person`` one level deeper. Nothing is fetched
    once ``current_depth`` exceeds ``max_depth``.

    ``base_node`` is the title's graph node. When it is omitted (the title is
    the entry point of the crawl) a ``Movie`` node is built from the page
    heading and poster.
    """
    if current_depth > max_depth:
        return

    base_url = 'http://www.imdb.com/title/'
    soup = get_imdb_soup(base_url + imdb_id)

    if base_node is None:
        heading = soup.find('h1', itemprop='name')
        if heading is not None:
            label = heading.text
            # The heading text includes the release-year span; drop it.
            year_tags = heading.select('span#titleYear')
            if year_tags:
                label = label.replace(year_tags[0].text, '')
            label = label.strip()
        else:
            # No recognisable heading: fall back to the id rather than crash.
            label = imdb_id

        poster = soup.find('div', class_='poster')
        poster_img = poster.find('img') if poster is not None else None
        movie_image = poster_img.get('src') if poster_img is not None else None

        base_node = add_node(imdb_id, label, 'Movie', movie_image)

    directors = get_plot_data(soup, base_url, 'director')
    writers = get_plot_data(soup, base_url, 'creator')
    stars = get_plot_data(soup, base_url, 'actors')

    # Genre links live inside the title wrapper; the href minus its
    # ``?ref...`` suffix serves as the genre's node id.
    wrappers = soup.select('.title_wrapper')
    genre_links = wrappers[0].find_all('a', href=re.compile('/genre/')) if wrappers else []
    for genre in genre_links:
        genre_id = genre.get('href').split('?ref')[0]
        add_edge(add_node(genre_id, genre.text, 'Genre'), base_node, 'G')

    for people, label in ((directors, 'D'), (writers, 'W'), (stars, 'S')):
        for person in people:
            if person['id'] is None:
                # No identifier could be extracted, so there is no page to visit.
                continue

            node = add_node(person['id'], person['name'], 'Person')
            add_edge(node, base_node, label)

            parse_person(
                imdb_id=person['id'],
                max_depth=max_depth,
                base_node=node,
                current_depth=current_depth + 1,
            )
        
def parse_entry(imdb_url, max_depth = 3):
    """Start a crawl from an IMDb title (``tt...``) or person (``nm...``) URL.

    ``max_depth`` bounds how many title/person hops are followed from the
    entry page.

    Raises:
        ValueError: if no IMDb identifier can be found in ``imdb_url``.
    """
    imdb_id = get_imdb_id(imdb_url) if imdb_url else None
    if imdb_id is None:
        raise ValueError('No IMDb id found in: {!r}'.format(imdb_url))

    # Prefix test: a substring test would also accept ids that merely
    # contain the letters somewhere.
    if imdb_id.startswith('tt'):
        parse_title(imdb_id, max_depth=max_depth)
    elif imdb_id.startswith('nm'):
        parse_person(imdb_id, max_depth=max_depth)
    else:
        # Neither a title nor a person: nothing was loaded, so say so.
        print('Unsupported IMDb id: ' + imdb_id)
        return

    print('Data is loaded!')

In [None]:
from ipywidgets import widgets
from IPython.display import display

# Text boxes for the search query and for the 1-based index of the chosen
# result, plus an HTML area that the search handler fills with the results.
search_input = widgets.Text(placeholder='Pulp Fiction...')
index_input = widgets.Text(placeholder='1...')
label_results = widgets.HTML(value='')

# Only the search box is shown up front; the index box is displayed by
# handle_search_submit once there are results to choose from.
box_search = widgets.VBox([
    widgets.Label(value="Search movie:"),
    search_input,
    label_results
])

display(box_search)

# Search results as {'url', 'text'} dicts, shared between the two handlers.
titles = []
# Not read anywhere in this cell; handle_index_submit binds a local of the
# same name instead of assigning this one.
title = None

def handle_search_submit(sender):
    """Search IMDb for the typed text and list the matching titles.

    The shared ``titles`` list is replaced with the new results (``url`` and
    ``text`` per entry), ``label_results`` shows them numbered from 1, and
    the index box is displayed so one of them can be picked. When the page
    has no "Titles" section a short notice is shown instead.
    """
    from html import escape
    from urllib.parse import quote_plus

    label_results.value = 'Loading...'

    # quote_plus turns spaces into '+' and also escapes characters such as
    # '&' or '#' that would otherwise corrupt the query string.
    search_url = 'http://www.imdb.com/find?q={}&s=all'.format(quote_plus(search_input.value))
    page = requests.get(search_url, timeout=10)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Take the result cells of the first section whose header mentions "Titles".
    result_list = []
    for section in soup.find_all('div', class_='findSection'):
        header = section.find('h3', class_='findSectionHeader')
        if header is None or 'Titles' not in header.text:
            continue
        table = section.find('table', class_='findList')
        if table is not None:
            result_list = table.find_all('td', class_='result_text')
        break

    # Drop results of earlier searches so the displayed numbers start at 1
    # and always index into the current result set.
    titles.clear()

    rows = []
    for item in result_list:
        link = item.find('a')
        if link is None:
            continue
        titles.append({
            'url': link.get('href'),
            'text': item.text
        })
        # The text comes from a remote page; escape it before embedding in HTML.
        rows.append('<b>{0}</b> - {1}<br/>'.format(len(titles), escape(item.text)))

    if not rows:
        label_results.value = 'No titles found.'
        return

    label_results.value = ''.join(rows)

    box_index = widgets.VBox([
        widgets.Label(value="Type the index of the desired movie to analyze:"),
        index_input
    ])
    display(box_index)

    
def handle_index_submit(sender):
    """Load graph data for the search result whose 1-based index was typed.

    Invalid input (not a number, or outside ``1..len(titles)``) prints a hint
    and loads nothing.
    """
    if not titles:
        print('Search for a movie first.')
        return

    hint = 'Please type a number between 1 and {}.'.format(len(titles))

    try:
        position = int(index_input.value)
    except ValueError:
        print(hint)
        return

    # Reject 0 and negatives explicitly: a negative list index would
    # otherwise silently select an entry counted from the end.
    if not 1 <= position <= len(titles):
        print(hint)
        return

    chosen = titles[position - 1]
    print('Loading data for: ' + chosen['text'])
    parse_entry(chosen['url'])

# Register the handlers to run when the matching text box is submitted.
# NOTE(review): `on_submit` is reportedly deprecated in newer ipywidgets
# releases - confirm against the installed version before upgrading.
search_input.on_submit(handle_search_submit)
index_input.on_submit(handle_index_submit)

In [None]:
%%html
<div id="imdb-movie-network"></div>

In [None]:
from IPython.display import Javascript
import json

# Double the size of the first registered node so it stands out.
# Guarded with `nodes and ...` because the list is empty before any data has
# been loaded and again after this cell has run once (it is reset below).
if nodes and 'size' in nodes[0]:
    nodes[0]['size'] *= 2

# Serialise the graph as JSON
data = { 'nodes': nodes, 'edges': edges }
jsonGraph = json.dumps(data, indent = 4)

# Reset the shared state so the next crawl starts from an empty graph
nodes = []
edges = []

# Expose the graph to the browser as `window.jsonGraph`; kept as the cell's
# final expression so the notebook renders the Javascript object.
Javascript("""window.jsonGraph={};""".format(jsonGraph))

In [None]:
%%javascript
// Register the module path for 'vis' with RequireJS.
requirejs.config({ paths: { vis: 'vis' } });

require(['vis'], function (vis) {
    // Canvas size plus the default look of nodes and edge labels.
    var networkOptions = {
        width: '900px',
        height: '500px',
        nodes: { shape: 'dot', size: 10, borderWidth: 2 },
        edges: { font: { size: 12 } }
    };

    // The graph the Python cell stored on `window`, and the <div> to draw it in.
    var graphData = window.jsonGraph;
    var target = document.getElementById('imdb-movie-network');

    // Render the network.
    var network = new vis.Network(target, graphData, networkOptions);
});

# Twitter - Sentiment Analysis

Dependencies

In [None]:
!pip install tweepy
!pip install textblob

In [None]:
import tweepy
from textblob import TextBlob
import numpy as np

In [None]:
# To obtain Twitter API keys, go to https://apps.twitter.com/ and create an app.
#
# The credentials are read from environment variables instead of being
# hard-coded, so they never end up in the notebook or in version control.
# A missing variable raises KeyError naming it. Any keys that were previously
# committed should be treated as leaked and regenerated.
import os

consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']

access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_token_secret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']

# Authentication
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

In [None]:
# Define what will be searched for on Twitter.
# NOTE(review): the search endpoint presumably caps `count` well below 10000
# per request - confirm against the Twitter/Tweepy documentation.
public_tweets = api.search(q='Wonder Woman', count = 10000)

# Sentiment polarity of each tweet's text, collected into a NumPy array.
polarity = np.array([TextBlob(tweet.text).sentiment.polarity for tweet in public_tweets])

# Mean polarity; NaN (without a NumPy warning) when no tweets came back.
mean = np.mean(polarity) if polarity.size else float('nan')
print('Mean: {0}'.format(mean))

# Mean ignoring the zero values; NaN when every value is zero.
nonzero_indexs = np.nonzero(polarity)
nonzero_polarity = polarity[nonzero_indexs]
mean_nonzero = np.mean(nonzero_polarity) if nonzero_polarity.size else float('nan')
print('Mean (disregarding zero values): {0}'.format(mean_nonzero))

# Report whether the movie is received favourably, unfavourably or neutrally.
# With no tweets there is nothing to judge, so say that instead of "neutral".
if polarity.size == 0:
    print('Nenhum tweet encontrado!')
elif mean > 0 :
    print('O filme é favorável!')
elif mean < 0 :
    print('O filme não é favorável!')
else :
    print('O filme é neutro!')