In [1]:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Table of users to scrape, read from a local parquet file. The scraping loop
# further down slices rows out of this frame and passes the first column of
# each row to Web_Scrape as the user id.
df_userIds = pd.read_parquet('IMDB_Unique_UserID.parquet')


In [2]:
def Web_Scrape(userid):
    """Load one IMDb user's review listing in Chrome and return its review blocks.

    The goal is to collect at most 50 reviews (with ratings) per user. A page
    initially shows 25 reviews and the rest is loaded dynamically, so the
    "load more" trigger is clicked once when it becomes clickable.

    Parameters
    ----------
    userid : str
        IMDb user id, interpolated into the reviews URL (e.g. ``ur0092990``).

    Returns
    -------
    The result of ``soup.find_all("div", class_="lister-item-content")`` on the
    final page source: one element per review container, empty if none exist.

    Notes
    -----
    The browser session is always closed, even when navigation or the page
    read fails; such errors still propagate to the caller.
    """
    options = Options()
    b = webdriver.Chrome(options=options)

    try:
        # Reviews sorted by submission date (ascending), spoilers hidden, no rating filter.
        url = f"https://www.imdb.com/user/{userid}/reviews?spoiler=hide&sort=submissionDate&dir=asc&ratingFilter=0"
        b.get(url)

        try:
            btn = WebDriverWait(b, 3).until(EC.element_to_be_clickable((By.ID, "load-more-trigger")))
            btn.click()
            print('Load More button clicked')
            # Give the dynamically loaded reviews time to be inserted into the page.
            time.sleep(2)
        except TimeoutException:
            # The trigger never became clickable within 3 s: nothing more to
            # load for this user, so continue with what the page already shows.
            pass
        except Exception as exc:
            # Clicking is best-effort; keep the reviews already on the page,
            # but report the failure instead of hiding it.
            print(f'Load More click failed for user {userid}: {exc!r}')

        html_content = b.page_source
    finally:
        # Release the Chrome process no matter how the block above ended.
        b.quit()

    soup = BeautifulSoup(html_content, 'html.parser')

    Movie_reviews_div = soup.find_all("div",class_="lister-item-content")

    return Movie_reviews_div


In [3]:
# Main Div: div.lister-item-content
# Movie Name,id : div.lister-item-header , then anchor tag <a href="" >....</a>
# if more than one anchor tag then reject it:

# Review : div.content , then div.text
# Rating : div.ipl-ratings-bar , then span.rating-other-user-rating , then in that first span

def Extract_Movie_Review_Rating(data,userid,Movie_reviews_div):
    """Append (user, movie, review, rating) records parsed from review containers.

    For every element of ``Movie_reviews_div`` the following pieces are read:

    * ``div.lister-item-header`` -> its ``<a href=...>`` links. The movie id is
      the third ``/``-separated piece of the first link's ``href``.
    * ``div.content`` -> ``div.text`` -> the review text (whitespace-stripped).
    * ``div.ipl-ratings-bar`` -> ``span.rating-other-user-rating`` -> first
      ``span`` -> the rating, converted to ``int``.

    An element is skipped, leaving ``data`` untouched, when:

    * the header does not contain exactly one link (more than one link is
      treated as an episode of a web series and discarded),
    * any of the pieces above is missing, or
    * the rating text is not an integer.

    Parameters
    ----------
    data : dict
        Column-wise accumulator with list values under the keys ``'UserID'``,
        ``'MovieID'``, ``'Review'`` and ``'Rating'``. Mutated in place.
    userid : str
        User id recorded for every extracted review.
    Movie_reviews_div : iterable
        Review container elements (as returned by ``Web_Scrape``).

    Returns
    -------
    dict
        The same ``data`` object that was passed in.
    """
    movie_div_class = "lister-item-header"
    review_div_class = "content"
    rating_div_class = "ipl-ratings-bar"

    for movie_review in Movie_reviews_div:

        movie_div = movie_review.find("div",class_=movie_div_class)
        review_div = movie_review.find("div",class_=review_div_class)
        rating_div = movie_review.find("div",class_=rating_div_class)

        # Without all three sections there is no complete record to store.
        if movie_div is None or review_div is None or rating_div is None:
            continue

        anchor_tags = movie_div.find_all('a',href=True)

        # More than one link means an episode of a web series, which is
        # discarded; zero links means there is no movie id to read.
        if len(anchor_tags) != 1:
            continue

        # href is expected to look like "/title/<movieid>/..."
        href_parts = anchor_tags[0]['href'].split('/')
        if len(href_parts) < 3:
            continue
        movieid = href_parts[2]

        rating_wrapper = rating_div.find("span",class_="rating-other-user-rating")
        rating_span = rating_wrapper.find("span") if rating_wrapper is not None else None
        text_div = review_div.find("div",class_="text")
        if rating_span is None or text_div is None:
            continue

        try:
            rating = int(rating_span.get_text(strip=True))
        except ValueError:
            continue

        # strip=True removes leading and trailing whitespace
        review = text_div.get_text(strip=True)

        # Append to all four columns together, only after every value has been
        # parsed, so the lists can never end up with different lengths.
        data['UserID'].append(userid)
        data['MovieID'].append(movieid)
        data['Review'].append(review)
        data['Rating'].append(rating)

    return data

In [4]:
start_index = 5400
rows = 700

# Batch handled by this run: `rows` consecutive rows of df_userIds beginning at
# `start_index`. `data` collects one list per output column across all users.
df = df_userIds[start_index:start_index + rows]
data = {'UserID':[],'MovieID':[],'Review':[],'Rating':[]}

# The first column of the batch holds the user ids.
for userid in df.iloc[:, 0]:
    # start_index doubles as the progress counter shown in the log.
    print('At index :',start_index)
    start_index += 1

    Movie_reviews_div = Web_Scrape(userid)
    data = Extract_Movie_Review_Rating(data,userid,Movie_reviews_div)

    # Rewrite the output file after every user, so everything gathered so far
    # is already on disk if a later user fails.
    df_scrape_data = pd.DataFrame(data)
    df_scrape_data.to_parquet('Imdb_dataset_800_user_24.parquet')

    print(f'Extracted data for user {userid}')



At index : 5400
Extracted data for user ur0092990
At index : 5401
Extracted data for user ur0093005
At index : 5402
Extracted data for user ur0093011
At index : 5403
Extracted data for user ur0093020
At index : 5404
Load More button clicked
Extracted data for user ur0093024
At index : 5405
Extracted data for user ur0093032
At index : 5406
Extracted data for user ur0093035
At index : 5407
Extracted data for user ur0093053
At index : 5408
Extracted data for user ur0093064
At index : 5409
Extracted data for user ur0093086
At index : 5410
Extracted data for user ur0093087
At index : 5411
Extracted data for user ur0093116
At index : 5412
Extracted data for user ur0093135
At index : 5413
Extracted data for user ur0093161
At index : 5414
Load More button clicked
Extracted data for user ur0093184
At index : 5415
Extracted data for user ur0093185
At index : 5416
Extracted data for user ur0093228
At index : 5417
Extracted data for user ur0093240
At index : 5418
Extracted data for user ur0093256


In [5]:
# Read back the parquet file written by the scraping loop and display it as a
# check on what was saved (columns: UserID, MovieID, Review, Rating).
df_dataset = pd.read_parquet('Imdb_dataset_800_user_24.parquet')

df_dataset

Unnamed: 0,UserID,MovieID,Review,Rating
0,ur0092990,tt0171063,This is a comedy more for smiling than laughin...,7
1,ur0093005,tt0203408,"As a Christian, I wanted to see a quality film...",1
2,ur0093011,tt0174852,I saw this movie on a field trip with my class...,10
3,ur0093020,tt0120724,Sure it's bad. If you aren't expecting a bad f...,9
4,ur0093024,tt0073043,one of the funniest mystery science theater 30...,1
...,...,...,...,...
2932,ur0101106,tt0120177,"Especially the scene where Clown ""John Leguiza...",10
2933,ur0101106,tt0081353,"By far, the best ever remake of a popular cart...",10
2934,ur0101106,tt0065547,As much as I like this moviewhat its storyline...,8
2935,ur0101106,tt0070643,While `The Godfather' was not Al's first movie...,10
