In [57]:
import re
from operator import itemgetter
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests
import networkx as nx

from plotly.offline import download_plotlyjs, init_notebook_mode,  iplot, plot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

In [58]:
def get_imdb_data(url = 'https://www.imdb.com/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=A0PEGTD2F2V8E4EVT1CC&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_4', timeout = 10):
    '''
    Scrape the title, the recommended-movie links and the plot summary
    from a single IMDb title page.
    
    Parameters:
    url (string): url of the movie page to collect information from
    timeout (float): seconds to wait for the HTTP response before giving up
    
    Returns:
    tuple (title(string) , movies_like_this(list) , summary(string))
        title: text of the first <h1>, with one trailing parenthesised
            group (presumably the release year) and surrounding whitespace removed
        movies_like_this: absolute urls of every link with an href found
            inside the 'rec_page' divs, in page order
        summary: text of the 'summary_text' div with newlines and runs of
            spaces collapsed, or '' when the page has no such div
    
    Raises:
    requests.HTTPError: the server answered with an error status code
    ValueError: the page contains no <h1> to take the title from
    
    '''
    
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    heading = soup.find('h1')
    if heading is None:
        raise ValueError('No <h1> title found at ' + url)
    # Drop only a trailing "(...)" group: headings without one are kept whole,
    # and titles that themselves start with a parenthesis are not truncated.
    title = re.sub(r'\s*\([^()]*\)\s*$', '', heading.text).strip()

    summary_div = soup.find('div', {'class':'summary_text'})
    if summary_div is None:
        summary = ''
    else:
        summary = summary_div.text.replace('\n', ' ').replace('\r', '')
        summary = re.sub(' +', ' ', summary)

    # urljoin copes with root-relative ("/title/..."), relative and absolute
    # hrefs alike; href=True skips anchors that carry no href attribute.
    movies_like_this = [
        urljoin('https://www.imdb.com/', a['href'])
        for div in soup.find_all('div', {'class':'rec_page'})
        for a in div.find_all('a', href=True)
    ]
            
    return title, movies_like_this, summary

In [59]:
# Seed of the crawl: the movie's title, summary and url
START = "The Dark Knight"
Summary = 'When the menace known as The Joker emerges from his mysterious past, he wreaks havoc and chaos on the people of Gotham. The Dark Knight must accept one of the greatest psychological and physical tests of his ability to fight injustice.'
url = 'https://www.imdb.com/title/tt0468569/'

# Movies whose layer is below this value get their recommendations expanded
MAX_LAYER = 3

# Directed graph with one edge per (movie -> recommended movie) pair
F = nx.DiGraph()

# Snowball sampling state:
#   to_scrape_list - FIFO queue of (layer, title, url) entries still to expand
#   to_scrape_set  - titles that have already been queued (titles only, so the
#                    membership test below compares like with like)
to_scrape_list = [(0, START, url)]
to_scrape_set = {START}

# Collect the summary of each movie for the TF-IDF statistic
Summary_list = [(START , Summary)]

# Pages already downloaded, keyed by url, so that a movie first fetched as a
# recommendation is not downloaded a second time when it is expanded.
scraped_pages = {}

# Breadth-first expansion; stops when the next queued movie is on the last
# layer or when the queue is empty (e.g. a page without recommendations).
while to_scrape_list and to_scrape_list[0][0] < MAX_LAYER:
    layer, title, url = to_scrape_list.pop(0)
    
    if url not in scraped_pages:
        scraped_pages[url] = get_imdb_data(url)
    movie = scraped_pages[url]
    
    for link in movie[1]:
        if link not in scraped_pages:
            scraped_pages[link] = get_imdb_data(link)
        movie_like_parent = scraped_pages[link]
        
        sub_movie_title = movie_like_parent[0]
        sub_movie_summary = movie_like_parent[2]
        
        # The edge is recorded even for an already-known movie, so every
        # recommendation relation still ends up in the graph.
        F.add_edge(title, sub_movie_title)
        
        # Graph nodes are keyed by title, so a title is queued only once.
        if sub_movie_title in to_scrape_set:
            continue
        to_scrape_set.add(sub_movie_title)
        
        Summary_list.append((sub_movie_title, sub_movie_summary))
        to_scrape_list.append((layer + 1, sub_movie_title, link))

In [132]:
# Save the crawled recommendation graph F to disk as "movies.graphml" (GraphML format)
nx.write_graphml(F, "movies.graphml")


In [61]:
# Collapse repeated (title, summary) pairs gathered during the crawl
Summary_set = set()
Summary_set.update(Summary_list)

In [77]:
# Build the (Title, Summary) table. A set has no defined iteration order and
# string hashing is randomised per interpreter run, so the pairs are sorted to
# give the same row order every time the notebook is executed.
Summary_df = pd.DataFrame(sorted(Summary_set), columns=['Title', 'Summary'])

In [None]:
# Export the summaries table to 'Summary.csv' without the row index column
Summary_df.to_csv(
    path_or_buf='Summary.csv',
    index=False,
)