### Extract reviews for movies from IMDb

#### Libraries

In [92]:
import numpy as np
import pandas as pd
from scrapy.selector import Selector
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from tqdm import tqdm
import warnings
import os.path
warnings.filterwarnings("ignore")

#### Movies with sequels (with IMDb URLs)

In [88]:
# Every review page shares one URL shape; only the IMDb title id differs.
_REVIEWS_URL = 'https://www.imdb.com/title/{}/reviews?ref_=tt_urv'

# Harry Potter series
hp0_url = _REVIEWS_URL.format('tt0241527')
hp1_url = _REVIEWS_URL.format('tt0295297')
hp2_url = _REVIEWS_URL.format('tt0304141')
hp3_url = _REVIEWS_URL.format('tt0330373')
hp4_url = _REVIEWS_URL.format('tt0373889')
hp5_url = _REVIEWS_URL.format('tt0417741')
hp6_url = _REVIEWS_URL.format('tt0926084')
hp7_url = _REVIEWS_URL.format('tt1201607')

# Star Wars series
sw0_url = _REVIEWS_URL.format('tt0120915')
sw1_url = _REVIEWS_URL.format('tt0121765')
sw2_url = _REVIEWS_URL.format('tt0121766')
sw3_url = _REVIEWS_URL.format('tt0076759')
sw4_url = _REVIEWS_URL.format('tt0080684')
sw5_url = _REVIEWS_URL.format('tt0086190')
sw6_url = _REVIEWS_URL.format('tt2488496')
sw7_url = _REVIEWS_URL.format('tt2527336')
sw8_url = _REVIEWS_URL.format('tt2527338')

# Series name -> review-page URLs; the position in each list is later used
# as the output file name, so the order must stay stable.
url_dict = {
    'harry_potter': [
        hp0_url, hp1_url, hp2_url, hp3_url,
        hp4_url, hp5_url, hp6_url, hp7_url,
    ],
    'star_wars': [
        sw0_url, sw1_url, sw2_url, sw3_url, sw4_url,
        sw5_url, sw6_url, sw7_url, sw8_url,
    ],
}

#### Extract and save reviews

In [113]:
def _parse_review_count(header_text):
    """Return the review total announced in the page header, or 0 if unreadable.

    The count is the first space-separated token of the header text once
    thousands separators (commas) have been removed.
    """
    if not header_text:
        return 0
    first_token = header_text.replace(',', '').split(' ')[0]
    try:
        return int(first_token)
    except ValueError:
        return 0


def _clean_review_fields(rating, review_title, review):
    """Normalise the raw fields of one review; missing (None) fields become NaN.

    Returns a ``(rating, review_title, review)`` tuple so a review is always
    stored as one complete row and the columns cannot drift out of alignment.
    """
    if rating is None:
        rating = np.nan
    if review_title is None:
        review_title = np.nan
    else:
        # drop the first and last character of the title
        # (presumably surrounding whitespace/newline -- TODO confirm)
        review_title = review_title[1:-1]
    if review is None:
        review = np.nan
    else:
        review = review.replace('\n', '<br /><br />')
    return rating, review_title, review


def get_df(url, driver_path=r"C:\chromedriver.exe"):
    """Scrape the reviews listed at *url* and return them as a DataFrame.

    Opens the page in Chrome through Selenium, clicks the element with id
    ``load-more-trigger`` once per 25 announced reviews (assumes each click
    loads 25 more reviews), then parses every ``div.review-container``.

    Args:
        url: address of the review page to scrape.
        driver_path: location of the chromedriver executable.

    Returns:
        A DataFrame with the columns ``Rating``, ``Review_Title`` and
        ``Review``, one row per parsed review. A field that is absent from a
        review is stored as NaN.
    """
    # NOTE(review): recent Selenium 4 releases reportedly no longer accept the
    # driver path positionally (they expect a Service object) -- confirm
    # against the installed Selenium version.
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(url)

        # load pages
        sel = Selector(text=driver.page_source)
        header_text = sel.css('.lister .header span::text').extract_first()
        review_count = _parse_review_count(header_text)
        more_review_pages = review_count // 25
        print(f'there are {review_count} reviews')
        for _ in tqdm(range(more_review_pages)):
            try:
                driver.find_element(By.ID, 'load-more-trigger').click()
                # give the page time to append the next batch
                time.sleep(3)
            except Exception:
                # Deliberately best-effort: a failed click must not abort the
                # scrape. Catching Exception (not a bare except) keeps
                # Ctrl-C / KeyboardInterrupt working during long runs.
                pass

        # parse each review into one complete row
        rows = []
        skipped = 0
        for d in tqdm(driver.find_elements(By.CSS_SELECTOR, 'div.review-container')):
            try:
                sel2 = Selector(text=d.get_attribute('innerHTML'))
                # extract_first() yields None (it does not raise) when the
                # selector matches nothing; _clean_review_fields maps that to NaN
                rows.append(_clean_review_fields(
                    sel2.css('.rating-other-user-rating span::text').extract_first(),
                    sel2.css('a.title::text').extract_first(),
                    sel2.css('.text.show-more__control::text').extract_first(),
                ))
            except Exception:
                skipped += 1
        if skipped:
            print(f'skipped {skipped} reviews that could not be parsed')
    finally:
        # always close the browser, even when scraping fails part-way
        driver.quit()

    # convert to df
    return pd.DataFrame(rows, columns=['Rating', 'Review_Title', 'Review'])

# convert df to csv
def save_csv(df, filepath):
    """Write *df* to *filepath* as CSV, creating the parent folder if needed.

    Args:
        df: DataFrame to store.
        filepath: destination of the CSV file.
    """
    # Create the target folder first so a finished scrape is not lost to a
    # missing directory. Only path-like targets have a folder to create.
    if isinstance(filepath, (str, os.PathLike)):
        parent = os.path.dirname(filepath)
        if parent:
            os.makedirs(parent, exist_ok=True)
    df.to_csv(filepath)

### Do the actual scraping

In [None]:
# Scrape every URL of every series into ./<series>/<position>.csv.
# Files that already exist are skipped, so an interrupted run can be resumed.
for series, urls in url_dict.items():
    print(f'Processing {series}...')
    # make sure the output folder exists *before* scraping, so the results
    # of a long scrape are not lost when the CSV is written
    os.makedirs(f'./{series}', exist_ok=True)
    for index, url in enumerate(urls):
        filepath = f'./{series}/{index}.csv'
        if os.path.exists(filepath):
            print(f'{filepath} already exists')
            continue
        print(f'working on {filepath}...')
        review_df = get_df(url=url)
        save_csv(df=review_df, filepath=filepath)