In [6]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from collections import defaultdict

In [7]:
def readPage(year_page_url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    year_page_url : str
        URL of the page to fetch.
    timeout : float or tuple, optional
        Timeout forwarded to ``requests.get``. Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``'lxml'`` parser.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failures, timeouts, or a non-success HTTP status.
    """
    # Without a timeout, requests can block indefinitely on a stalled connection.
    response = requests.get(year_page_url, timeout=timeout)
    # Surface HTTP errors here instead of handing an error page to the parsers.
    response.raise_for_status()

    return BeautifulSoup(response.text, 'lxml')

In [8]:
## create function to scrape page
## use function to scrape first page
## then go to the second page (find link for "Next") and use
## the same function to scrape the second page

def scrapeYearPage(page_url):
    """Scrape one IMDb listing page for movie titles and links.

    Parameters
    ----------
    page_url : str
        URL of the listing page to scrape.

    Returns
    -------
    tuple
        ``(movies, next_link)``. ``movies`` is a list of
        ``(title_text, absolute_url)`` tuples, one per ``<a>`` element
        returned by the ``find_all`` query below (href starting with
        ``/title``, filtered with ``title=''``). ``next_link`` is the
        absolute URL taken from the element containing the text "Next",
        or ``None`` when the page has no such link.
    """
    imdb_root = 'http://www.imdb.com'
    page_data = readPage(page_url)

    title_anchors = page_data.find_all('a', href=re.compile('^/title'), title='')
    movies = [(anchor.text, imdb_root + anchor['href']) for anchor in title_anchors]

    # A page without a "Next" link (e.g. the last one) makes find() return
    # None; report that as next_link=None rather than crashing on None.parent.
    next_text = page_data.find(text=re.compile('Next'))
    next_parent = next_text.parent if next_text is not None else None
    next_href = next_parent.get('href') if next_parent is not None else None
    next_link = imdb_root + next_href if next_href else None

    return movies, next_link
    #return data and next link

#print(scrapeYearPage('http://www.imdb.com/search/title?at=0&sort=boxoffice_gross_us,desc&title_type=feature&year=2015,2015'))

    


In [9]:
def scrapeYearMultiPages(first_page_url, num_pages=2):
    """Scrape up to ``num_pages`` consecutive listing pages.

    Starts at ``first_page_url`` and follows the next-page link returned by
    ``scrapeYearPage`` after each page.

    Parameters
    ----------
    first_page_url : str
        URL of the first listing page.
    num_pages : int, optional
        Maximum number of pages to scrape. Defaults to 2.

    Returns
    -------
    list
        The movie entries returned by ``scrapeYearPage`` for every scraped
        page, concatenated in page order.
    """
    movie_list = []
    next_page_url = first_page_url

    for _ in range(num_pages):
        # No further page to follow: stop instead of requesting a None URL.
        if next_page_url is None:
            break
        movies, next_page_url = scrapeYearPage(next_page_url)
        movie_list.extend(movies)

    return movie_list

#print(scrapeYearMultiPages('http://www.imdb.com/search/title?at=0&sort=boxoffice_gross_us,desc&title_type=feature&year=2015,2015'))
    

In [10]:
def getMovieValue(soup, itemprop_value):
    """Return the text of the first ``<span>`` with the given ``itemprop``.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find`` (e.g. the result of ``readPage``).
    itemprop_value : str
        Value the span's ``itemprop`` attribute must have.

    Returns
    -------
    str
        The ``.text`` of the matching span.

    Raises
    ------
    AttributeError
        If ``find`` returns ``None`` because no span matches.
    """
    return soup.find('span', itemprop=itemprop_value).text
    

In [11]:
#scrape each movie page for ratings

def scrapeMoviePage(page_link):
    """Fetch a movie page and extract its rating and rating count.

    Parameters
    ----------
    page_link : str
        URL of the movie page.

    Returns
    -------
    list
        ``[rating, votes]``: the ``ratingValue`` span converted to ``float``
        and the ``ratingCount`` span converted to ``int`` after removing
        thousands-separator commas.
    """
    soup = readPage(page_link)

    rating = float(getMovieValue(soup, 'ratingValue'))
    count_text = getMovieValue(soup, 'ratingCount')
    votes = int(count_text.replace(',', ''))

    return [rating, votes]
    
    
    



In [82]:
def scrapeAllMoviesPagesByYear(year_page_url, num_pages=2):
    """Collect title, rating and rating count for movies on year listing pages.

    The listing pages are scraped with ``scrapeYearMultiPages``; each movie's
    own page is then scraped with ``scrapeMoviePage``.

    Parameters
    ----------
    year_page_url : str
        URL of the first listing page.
    num_pages : int, optional
        Number of listing pages to scrape. Defaults to 2.

    Returns
    -------
    list
        Three parallel lists: ``[names, ratings, rating_counts]``.
    """
    rows = []
    for entry in scrapeYearMultiPages(year_page_url, num_pages):
        # entry is (name, link); scraped is [rating, rating_count].
        scraped = scrapeMoviePage(entry[1])
        rows.append((entry[0], scraped[0], scraped[1]))

    # Transpose the per-movie rows into three column lists.
    return [[row[column] for row in rows] for column in range(3)]
        
    
#print(scrapeAllMoviesPagesByYear('http://www.imdb.com/search/title?at=0&sort=boxoffice_gross_us,desc&title_type=feature&year=2015,2015'))
    

In [13]:
def generateIMDBDataFrame(year_page_url):
    """Build a DataFrame of titles, ratings and rating counts for a year listing.

    Parameters
    ----------
    year_page_url : str
        URL of the first year listing page, passed to
        ``scrapeAllMoviesPagesByYear``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Rating``, ``RatingCount`` (in that order), one
        row per scraped movie.
    """
    header = ['Title', 'Rating', 'RatingCount']

    # Must be scrapeAllMoviesPagesByYear: the previously referenced name
    # `scrapeAllMoviesByPages` is not defined in this notebook, so the call
    # raised NameError.
    movie_data = scrapeAllMoviesPagesByYear(year_page_url)

    # movie_data holds three parallel lists in the same order as `header`.
    movie_data_dict = {column: movie_data[index] for index, column in enumerate(header)}

    return pd.DataFrame(movie_data_dict, columns=header)

In [14]:
def writeIMDBYearToCSV(year_page_url, outfilename):
    """Scrape a year listing and save the resulting table as CSV.

    Parameters
    ----------
    year_page_url : str
        URL of the first year listing page, passed to
        ``generateIMDBDataFrame``.
    outfilename : str
        Destination passed to ``DataFrame.to_csv`` (called with its default
        options).

    Returns
    -------
    None
    """
    generateIMDBDataFrame(year_page_url).to_csv(outfilename)
    
#writeIMDBYearToCSV('http://www.imdb.com/search/title?at=0&sort=boxoffice_gross_us,desc&title_type=feature&year=2015,2015', '2015_movies_user_ratings_imdb.csv')




## TESTS FOR SCRAPING SEARCH PAGES


In [50]:
# First page of the IMDb keyword search for "based-on-novel" movies, with the
# query string asking for user_rating,desc ordering.
# Built from adjacent string literals: a backslash continuation inside one
# literal would embed the following line's leading spaces in the URL.
movies_based_on_books = (
    'http://www.imdb.com/search/keyword?keywords=based-on-novel'
    '&mode=advanced&page=1&title_type=movie'
    '&ref_=kw_vw_adv&sort=user_rating,desc'
)

In [77]:
def scrapeSearchPage(page_url):
    """Scrape one IMDb keyword-search page for movie titles and links.

    Parameters
    ----------
    page_url : str
        URL of the search result page to scrape.

    Returns
    -------
    tuple
        ``(movies, next_link)``. ``movies`` is a list of
        ``(title_text, absolute_url)`` tuples, one per ``<a>`` element
        returned by the ``find_all`` query below, excluding anchors whose
        text is one of the "See full ..." plot links. ``next_link`` is the
        keyword-search URL built from the "Next »" element's href, or
        ``None`` when the page has no such link.
    """
    # Anchors with these texts point at plot pages, not at movies.
    bad_strings = ['See full summary', 'See full synopsis']
    page_data = readPage(page_url)

    movie_data = page_data.find_all('a', href=re.compile('^/title'), title='', text=True)
    movies = [
        (movie.text, 'http://www.imdb.com' + movie['href'])
        for movie in movie_data
        if movie.text not in bad_strings
    ]

    # A page without a "Next »" link (e.g. the last one) makes find() return
    # None; report that as next_link=None rather than crashing on None.parent.
    next_text = page_data.find(text=re.compile('Next »'))
    next_parent = next_text.parent if next_text is not None else None
    next_href = next_parent.get('href') if next_parent is not None else None
    next_link = 'http://www.imdb.com/search/keyword' + next_href if next_href else None

    return movies, next_link

In [80]:
def scrapeSearchMultiPages(first_page_url, num_pages=10):
    """Scrape up to ``num_pages`` consecutive search result pages.

    Starts at ``first_page_url`` and follows the next-page link returned by
    ``scrapeSearchPage`` after each page.

    Parameters
    ----------
    first_page_url : str
        URL of the first search result page.
    num_pages : int, optional
        Maximum number of pages to scrape. Defaults to 10.

    Returns
    -------
    list
        The movie entries returned by ``scrapeSearchPage`` for every scraped
        page, concatenated in page order.
    """
    movie_list = []
    next_page_url = first_page_url

    for _ in range(num_pages):
        # No further page to follow: stop instead of requesting a None URL.
        if next_page_url is None:
            break
        movies, next_page_url = scrapeSearchPage(next_page_url)
        movie_list.extend(movies)

    return movie_list

In [72]:
# Collect the movie entries from the based-on-novel keyword search using the
# default num_pages=10; each result page is fetched over the network.
p = scrapeSearchMultiPages(movies_based_on_books)

In [73]:
# Sanity check: how many movie entries were collected across the scraped pages.
print(len(p))

100


In [86]:
def scrapeAllMoviesPagesFromSearch(search_page_url, num_pages=10):
    """Collect title, rating and rating count for movies on search result pages.

    The result pages are scraped with ``scrapeSearchMultiPages``; each movie's
    own page is then scraped with ``scrapeMoviePage``.

    Parameters
    ----------
    search_page_url : str
        URL of the first search result page.
    num_pages : int, optional
        Number of result pages to scrape. Defaults to 10.

    Returns
    -------
    list
        Three parallel lists: ``[names, ratings, rating_counts]``.
    """
    rows = []
    for entry in scrapeSearchMultiPages(search_page_url, num_pages):
        # entry is (name, link); scraped is [rating, rating_count].
        scraped = scrapeMoviePage(entry[1])
        rows.append((entry[0], scraped[0], scraped[1]))

    # Transpose the per-movie rows into three column lists.
    return [[row[column] for row in rows] for column in range(3)]

In [84]:
def generateIMDBDataFrameSearches(year_page_url):
    """Build a DataFrame of titles, ratings and rating counts for a search.

    Parameters
    ----------
    year_page_url : str
        URL of the first search result page (despite the parameter name),
        passed to ``scrapeAllMoviesPagesFromSearch``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``IMDBRating``, ``IMDBRatingCount`` (in that
        order), one row per scraped movie.
    """
    scraped_columns = scrapeAllMoviesPagesFromSearch(year_page_url)
    header = ['Title', 'IMDBRating', 'IMDBRatingCount']

    # scraped_columns holds three parallel lists in the same order as `header`.
    column_data = {name: scraped_columns[position] for position, name in enumerate(header)}

    # Index by `header` to pin the column order.
    return pd.DataFrame(column_data)[header]

In [85]:
def writeIMDBSearchesToCSV(year_page_url, outfilename):
    """Scrape a search listing and save the resulting table as CSV.

    Parameters
    ----------
    year_page_url : str
        URL of the first search result page (despite the parameter name),
        passed to ``generateIMDBDataFrameSearches``.
    outfilename : str
        Destination passed to ``DataFrame.to_csv`` (called with its default
        options).

    Returns
    -------
    None
    """
    generateIMDBDataFrameSearches(year_page_url).to_csv(outfilename)