### import packages

In [None]:
import re
import requests
import time
import csv
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

### Generate the URL of a search-result page that lists movies

Release-date window: roughly the most recent ten years, from 2009-01-01 to 2019-10-31.

In [None]:
def generate_movie_list_link(i):
    """Build the IMDb advanced-search URL for result page ``i``.

    The query asks for ``tv_movie`` titles released between 2009-01-01 and
    2019-10-31, sorted by number of votes in descending order.

    Args:
        i: Page number. Page 1 starts at result 1, page 2 at result 51,
            and so on in steps of 50.

    Returns:
        The URL as a string.
    """
    # Each page holds 50 results and the ``start`` parameter is 1-based.
    first_result = (i - 1) * 50 + 1
    search_prefix = (
        "https://www.imdb.com/search/title/?title_type=tv_movie"
        "&release_date=2009-01-01,2019-10-31&sort=num_votes,desc&start="
    )
    return "".join((search_prefix, str(first_result), "&ref_=adv_nxt"))

### Get the id of each movie in the list

In [None]:
def get_movie_id(url):
    """Fetch one IMDb search-result page and return the movie ids it lists.

    Args:
        url: Address of the search-result page to download.

    Returns:
        A list with one integer per result found on the page, in page order.
        Each integer is the run of digits that follows ``tt`` in the result's
        title link. Result items without a usable link are skipped, so the
        list may be shorter than a full page. Returns ``None`` when the
        request fails or the server answers with a status other than 200.
    """
    try:
        # Without a timeout a stalled connection would block the scrape
        # forever; a timeout surfaces as a RequestException instead.
        response = requests.get(url, timeout=30)
    except RequestException:
        print("Request Failed")
        return None

    if response.status_code != 200:
        print("Error when request URL")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    # Compiled once per page rather than once per result.
    id_pattern = re.compile(r'(?<=tt)\d+(?=/?)')

    movie_id_list = []
    for movie in soup.select('.lister-item-content'):
        header = movie.select_one('.lister-item-header')
        link = header.select_one('a') if header is not None else None
        if link is None:
            continue
        match = id_pattern.search(link.get('href', ''))
        if match is None:
            continue
        movie_id_list.append(int(match.group()))           # imdb id
    return movie_id_list

### Get information for each movie

In [None]:
from imdb import IMDb
def _or_na(value):
    """Return ``value`` unchanged, or ``"NA"`` when it is ``None``."""
    return "NA" if value is None else value


def _joined_or_na(values):
    """Join ``values`` with ``"|"``; return ``"NA"`` when there are none."""
    if not values:
        return "NA"
    return "|".join(str(value) for value in values)


def get_movie_info(movie_id):
    """Look up one movie through the ``imdb`` package and return a CSV row.

    Args:
        movie_id: Id of the movie, passed straight to ``IMDb.get_movie``.

    Returns:
        An eight-element list in the order ``[title, release year, genre,
        key words, plot, run time, votes, rating]``. Genres, key words and
        run times are each joined with ``"|"`` so that every field occupies
        exactly one column. The plot is the first plot entry with anything
        after ``"::"`` removed. Any field the record lacks is ``"NA"``.
        Returns ``None`` when the lookup raises or the record has no title.
    """
    try:
        # create an instance of the IMDb class
        ia = IMDb()
        # get a movie, then the same movie again with its keyword data
        movie = ia.get_movie(movie_id)
        movie_key_words = ia.get_movie(movie_id, info='keywords')
    except Exception:
        # Boundary of the scrape: report the failure and let the caller skip
        # this movie. KeyboardInterrupt is deliberately not caught.
        print("Request Failed")
        return None

    # A record without a title carries nothing worth writing out.
    title = movie.get('title')
    if not title:
        print("Request Failed")
        return None

    release_year = _or_na(movie.get('year'))
    genre = _joined_or_na(movie.get('genres'))
    key_words = _joined_or_na(movie_key_words.get('keywords'))

    # Keep only the first plot entry, dropping whatever follows "::".
    plots = movie.get('plot') or []
    plot = plots[0].split("::", 1)[0] if plots else "NA"

    # Several run times are joined into one field; emitting them as separate
    # list items would shift the votes and rating columns.
    runtime = _joined_or_na(movie.get('runtimes'))
    vote = _or_na(movie.get('votes'))
    rating = _or_na(movie.get('rating'))

    return [title, release_year, genre, key_words, plot, runtime, vote, rating]

### Scrape the movies and write the results to a CSV file


In [None]:
if __name__ == '__main__':
    # Walk 200 search-result pages and write one CSV row per movie whose
    # details could be retrieved.
    with open('movie_info.csv', 'w', newline="", encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile, dialect=("excel"))
        csvwriter.writerow(["title", "release year", "genre", "key words" , "plot", 'run time /min', "number of votes" , "rating"])
        for i in range(200):
            print("Page" + str(i+1) + " of movies")
            movie_list_url = generate_movie_list_link(i + 1)
            movie_url_list = get_movie_id(movie_list_url)
            if movie_url_list is None:
                # The page could not be fetched; carry on with the next one
                # instead of failing when the result is indexed.
                continue

            for j, movie_id in enumerate(movie_url_list):
                if not isinstance(movie_id, int):
                    # Not a usable id (e.g. an unfilled slot); nothing to look up.
                    continue
                print("Movie" + str(i * 50 + j + 1) + " information")
                row = get_movie_info(movie_id)
                if row is not None:
                    csvwriter.writerow(row)