In [16]:
# Import the IMDb class from the imdb package
from imdb import IMDb
import requests
import time

import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By  # Import the By class
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import import_ipynb
from user import User
import db


from faker import Faker
import random

from config import TMDB_API_KEY
from imdb import Cinemagoer
from collections import deque

from datetime import datetime
# print(TMDB_API_KEY)  # Just to verify it's imported correctly; remove in production


In [17]:
def fetch_movie_names(api_key=TMDB_API_KEY, total_movies=10):
    
    base_url = "https://api.themoviedb.org/3/discover/movie"
    movie_count = 0
    page = 1
    while movie_count < total_movies:
        params = {
            "api_key": api_key,
            "language": "en-US",
            "sort_by": "revenue.desc",
            "page": page
        }
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            for movie in data['results']:

                movie_count += 1
                yield movie['title']
                if movie_count >= total_movies:
                    break
        else:
            print(f"Failed to fetch movies: {response.status_code}")
            break
        page += 1
        time.sleep(0.5)  # Respect the rate limit
        
    # movie_names[:total_movies]



In [52]:
def scrape_imdb_movie_reviews(movie_id):

    # Specify the path to chromedriver
    service = Service(executable_path='/home/james/.wdm/drivers/chromedriver/linux64/121.0.6167.85/chromedriver-linux64/chromedriver')



    # Create ChromeOptions object
    chrome_options = webdriver.ChromeOptions()
    
    # Disable popups
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')  # Disable GPU accelerati


    
    
    # Initialize the WebDriver with the Service object
    driver = webdriver.Chrome(service=service,options = chrome_options)
    
    # Open a webpage
    driver.get('https://www.imdb.com/title/tt'+movie_id+'/reviews')

    
    # Wait for the page to load (optional, could use more sophisticated waits)
    time.sleep(10)

    # Wait for the cookies consent button to be clickable, and then click it
    wait = WebDriverWait(driver, 30)
    cookies_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Decline"]')))
    cookies_button.click()
        
    # Example: Find an element and retrieve text (update the selector as needed)
    
    # THE PAGE MIGHT HAVE A load more of the reviews getting the webdriver to test press the button.

    try:
        max_count = 10
        count = 0
        while count < max_count:  # Keep clicking the "Load More" button until it's no longer found
            time.sleep(5)
            print('loading content on page' + count)
  
            load_more_button = WebDriverWait(driver, 30).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(@id, 'load-more-trigger')]"))
            )

            load_more_button.click()

            count += 1
    except Exception as e:
        print("All content loaded or button not found.")



    
    
    elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'lister-item-content')]")


    review = {}
    print('reading titles')
    for element in elements:
        #return the individual reviews with their respective ratings
        edivs = element.find_elements(By.TAG_NAME, 'div')
        title_div = element.find_elements(By.TAG_NAME, 'a')
        review = {'ratings':None,'check':''}
        review['title'] = title_div[0].text
        review['movie_id'] = movie_id
        print(title_div[0].text)
        # Step 3: Retrieve and print the text for each child element
        details = {0:'ratings',1:'author',2:'content',3:'check'}
        count = 0
        for div in edivs:
            # print(div.get_attribute('outerHTML'))
            if count in [0,1,2,3]:
                #possible that there is no rating so check forms.
                if count == 0:
                    #"10/10"
                    try:

                        int(div.text.split('/')[0])
                        review[details[count]] = div.text
                    
                    except:
                        print('review has no rating, skipping')
                        break
                        
                elif count == 1:
                    data = div.text.split()
                    
                    # print(data)
                    if len(data) >= 1:
                        review['author'] = data[0]
                    if len(data) >= 2:
                        review['date'] = ' '.join(data[1:])
                else:

                    review[details[count]] = div.text

                count +=1



        if review['check'] == '':
            #no actual review in these cases.
            review['content'] = None
            
        del review['check']
        print(review)
        if review['ratings'] != None:
            yield review

    
    # Close the browser
    driver.quit()



In [53]:

# mix of imdb and tmdb
def search_movie_by_name(movie_name, api_key=TMDB_API_KEY):
    """Searches for a movie by name on TMDb, prints the first result, and fetches reviews including ratings."""
    # Search for the movie

    movie = {'title':None,'release_date':None,'genre':'','director':'','actors':[],'ratings':0,'votes':0,'overview':None,'id':None, 'reviews' : []}
    ia = Cinemagoer()
    # ia = IMDb()
    movies = ia.search_movie(movie_name)
    if movies:
    # Get the first movie from the list
        
        mov = movies[0]
        movkeys = mov.keys()
        # print(movkeys)
        # Retrieve detailed information about the movie

        movie['title'] = movie_name
        ia.update(mov)
        if 'genre' in movkeys:
            movie['genre'] = ', '.join(mov['genres'])
        if 'year' in movkeys:
            movie['release_date'] = datetime(mov['year'],1,1)
        if 'directors' in movkeys:
            movie['director'] = ', '.join([director['name'] for director in mov['directors']])
        #temporary, even if empty string.
        movie['director_id'] = id(movie['director'])
            
        if 'actors' in movkeys:
            movie['actors'] = [cast['name'] for cast in mov['cast']][:5]
        if 'plot' in movkeys:        
            movie['story'] = mov['plot'][0]
        
        if 'ratings' in movkeys and 'votes' in movkeys:
            
            movie['ratings'] = mov.get('rating')
            movie['votes'] = mov.get('votes')

        movie['reviews'] = [fd for fd in scrape_imdb_movie_reviews(mov['imdbID'])]
        movie['id'] = mov['imdbID']
        return movie
    #fill in the details

    else:
        print("No movies foundwhat is colla matching the query.")

# search_movie_by_name('Avengers: Endgame')['reviews'][9]

In [54]:
def generate_users(cc,count):

    for i in range(0,count):
        yield User(Faker(random.choice(cc)).name())


# for us in generate_users(["ar_AA"],1):
#     print(us.name())

In [65]:
def main():

    #create a set of movie_names
    movie_names = {mn for mn in fetch_movie_names(total_movies = 30)}
    print(movie_names)
    movies = [search_movie_by_name(mn) for mn in movie_names]
    # print(movies)
    
    #sort or filter by number of reviews?
    
    country_codes = [
        "ar_AA", "az_AZ", "bg_BG", "bs_BA", "cs_CZ", "de_DE", "dk_DK", "el_GR",
        "en_AU", "en_CA", "en_GB", "en_IN", "en_NZ", "en_US", "es_ES", "es_MX",
        "et_EE", "fa_IR", "fi_FI", "fr_FR", "hi_IN", "hr_HR", "hu_HU", "hy_AM",
        "id_ID", "it_IT", "ja_JP", "ka_GE", "ko_KR", "lt_LT", "lv_LV", "ne_NP", 
        "nl_NL", "no_NO", "pl_PL", "pt_BR", "pt_PT", "ro_RO", "ru_RU", "sk_SK",
        "sl_SI", "sv_SE", "tr_TR", "uk_UA", "zh_CN", "zh_TW"
    ]
    
    users = [user for user in generate_users(country_codes,100)]
    # print([user.name for user in users])
    # #need a queue of reviews
    # from collections import deque

    # # Initialize a deque as a queue
    # # need a q for each set of movie reviews so I can distribute only one to each user. 
    
    review_q = deque([deque(mov['reviews']) for mov in movies])
    # # print(review_q)

    # # hand out one to each user of a movie type..

    user_q = deque(users)
    leave_q = deque([])
    while review_q:
        movier_q = review_q.popleft()
        #shuffle users before handing out
        random.shuffle(user_q)

        while user_q:
            
            # if there is anything to hand out.
            user = user_q.popleft()
            if movier_q:
                review = movier_q.popleft()
                review['user_id'] = user.user_id
                user.add_review(review)
                # print(user.name,'has new review',review)
            leave_q.append(user)
            # print(len(user_q))
            # print(len(leave_q))

        #done for that subqueue
        user_q = leave_q
        leave_q = deque([])
    

    
    #now do the DB
    # db.drop_tables()
    # db.create_db()
    for user in user_q:
        db.add_user(user)

    for movie in movies:
        # print(movie)
        # print(movie['title'])
        # print(movie['director_id'])
        db.add_movie(movie)
        for review in movie['reviews']:
            # print(review.keys())
            
            db.add_movie_review(review)
            
        print(db.display_size())

    
    
    

In [66]:
main()

{'Black Panther', 'Frozen', 'Spider-Man: Far From Home', 'The Avengers', 'Titanic', 'Skyfall', 'Barbie', 'Avengers: Endgame', 'Iron Man 3', 'Captain America: Civil War', 'Aquaman', 'Harry Potter and the Deathly Hallows: Part 2', 'Furious 7', 'Minions', 'Incredibles 2', 'The Super Mario Bros. Movie', 'Jurassic World: Fallen Kingdom', 'The Lion King', 'Frozen II', 'The Fate of the Furious', 'Avengers: Infinity War', 'Beauty and the Beast', 'Top Gun: Maverick', 'Star Wars: The Force Awakens', 'Jurassic World', 'Spider-Man: No Way Home', 'Avatar', 'Avengers: Age of Ultron', 'Avatar: The Way of Water', 'Star Wars: The Last Jedi'}
All content loaded or button not found.
reading titles
I wish I enjoyed it like so many did, it just bored me.
{'ratings': '3/10', 'title': 'I wish I enjoyed it like so many did, it just bored me.', 'movie_id': '1825683', 'author': 'Sleepin_Dragon5', 'date': 'April 2018', 'content': "I feel so bad giving this film a poor rating, because I feel it's important for ma

In [67]:
print(db.display_size())

Users: 100

Movies: 30

Reviews: 713
None


In [61]:
print(db.display_data())

Users:
thisuser (140440115366736, 'Pietra Melo')
thisuser (140440115367216, 'धालीवाल, अभय')
thisuser (140440115367360, 'Նաիրի Քալանթարյան')
thisuser (140440115368176, 'Hannah Francis')
thisuser (140440115368944, 'Rodney Stokes')
thisuser (140440115370336, 'Émilie Le Goff')
thisuser (140440115371392, '최지원')
thisuser (140440115371488, 'Ülle Jaakson')
thisuser (140440115371680, 'Kathleen Wood')
thisuser (140440115372064, 'Gioacchino Morricone-Bonomo')
thisuser (140440115373024, 'Malle Remmel')
thisuser (140440115373120, 'Matias Virtanen')
thisuser (140440115373216, 'Zora Šipek')
thisuser (140440115373456, 'Maarit Luukkonen')
thisuser (140440115373504, 'Colleen Craig-Craig')
thisuser (140440115373984, 'Jill Mitchell')
thisuser (140440115374560, 'Elin Samuelsson')
thisuser (140440115375904, 'Senn Bertho-Lucas')
thisuser (140440115376576, 'Jason Guerra')
thisuser (140440115377152, 'Casey Thompson')
thisuser (140440115377632, 'إخلاص أنمار')
thisuser (140440115377824, 'Божо Младенов')
thisuser