## Import Packages

In [1]:
import pandas as pd
import numpy as np
from imdb import IMDb
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # Progress bar library

## Read in Data

In [2]:
# Only the unsupervised split's URL list is loaded here. The labelled (pos/neg)
# lists and their concatenation are left disabled below. NOTE(review): the
# disabled paths contain an extra `aclImdb/` level compared with the active
# one — confirm the directory layout before re-enabling them.
#pos_urls = pd.read_csv('../data/aclImdb/aclImdb/train/urls_pos.txt', header=None)
#neg_urls = pd.read_csv('../data/aclImdb/aclImdb/train/urls_neg.txt', header=None)
#pd.concat([pos_urls,neg_urls,unsup_urls])
unsup_urls = pd.read_csv(
    filepath_or_buffer='../data/aclImdb/train/urls_unsup.txt',
    header=None,  # no header row is parsed, so the single column is labelled 0
)

In [3]:
# Pair every review URL with the IMDb title id taken from it: the fifth
# '/'-separated piece of the URL, with the 'tt' marker stripped.
# Assumes URLs look like http://www.imdb.com/title/tt0064354/usercomments —
# TODO confirm against the URL file.
ids = [(url.split('/')[4].replace('tt',''), url) for url in unsup_urls[0].to_list()]

# Drop repeated (id, url) pairs while keeping first-seen order. A plain
# list(set(ids)) yields the same elements, but its iteration order depends on
# string hash randomization and therefore changes between kernel restarts,
# which makes the processing order and the exported CSV row order
# irreproducible. dict.fromkeys() preserves insertion order.
distinct_ids = list(dict.fromkeys(ids))

## Get TV/Movie Info

In [None]:
# Build the IMDb access object by calling IMDb() from the `imdb` package.
# get_movie_details() below calls ia.get_movie() on this single shared object
# from every worker thread.
# NOTE(review): whether one shared instance is safe to use concurrently is not
# established in this notebook — confirm against the library's documentation.
ia = IMDb()

# Function to get movie details with retry logic and a delay between attempts
def get_movie_details(id, retries=3, delay=1.0):
    """Look up one title through the module-level ``ia`` client.

    Parameters
    ----------
    id : sequence
        ``(imdb_id, url)`` pair; ``id[0]`` is passed to ``ia.get_movie`` as a
        string and ``id[1]`` is copied into the result unchanged.
    retries : int, default 3
        Maximum number of lookup attempts.
    delay : float, default 1.0
        Seconds to wait after a failed attempt before the next one. No wait
        happens after the final attempt. Pass 0 to retry immediately.

    Returns
    -------
    dict
        On success: ``ID``, ``URL``, ``Title``, ``Year``, ``Rating``
        (``'N/A'`` when the movie has no rating) and ``Genres`` (genre names
        joined with ``", "``; empty string when there are none).
        On failure: ``Error`` with a message, plus ``ID`` and ``URL`` so the
        failed row can be traced back to its input and re-run.

    Notes
    -----
    Never raises: every exception from an attempt (including a missing
    ``title`` or ``year`` on the returned movie) is printed and counted as a
    failed attempt.
    """
    # Local import: the notebook's import cell does not bring `time` into scope.
    import time

    for attempt in range(1, retries + 1):
        try:
            # Get the movie by IMDb ID
            movie = ia.get_movie(str(id[0]))

            # Collect details about the movie
            return {
                "ID": id[0],
                "URL": id[1],
                "Title": movie['title'],
                "Year": movie['year'],
                "Rating": movie.get('rating', 'N/A'),
                "Genres": ", ".join(movie.get('genres', [])),
            }
        except Exception as e:
            print(f"Error for {id}: {str(e)}. Retrying ({attempt}/{retries})...")
            # Pause before the next attempt so a struggling or rate-limiting
            # server is not hit again immediately; skip the pause once no
            # attempts remain.
            if attempt < retries and delay > 0:
                time.sleep(delay)

    failure = {}
    try:
        # Keep the identifying fields so failed rows are not anonymous.
        failure["ID"], failure["URL"] = id[0], id[1]
    except (TypeError, IndexError, KeyError):
        # `id` is not an (id, url) pair; the message below still reports it.
        pass
    failure["Error"] = f"Failed to retrieve {id} after {retries} attempts."
    return failure

# Fetch details for every distinct id on a pool of five worker threads.
# executor.map() yields results in the same order as distinct_ids, and tqdm
# wraps that iterator to draw a progress bar as results are consumed.
with ThreadPoolExecutor(max_workers=5) as executor:
    fetched = executor.map(get_movie_details, distinct_ids)
    progress = tqdm(fetched, total=len(distinct_ids), desc="Processing Movies")
    results = [record for record in progress]

# Fresh list on every run of the cell, so re-running never accumulates rows.
movie_results = [*results]

Processing Movies:  10%|▉         | 694/7091 [04:37<52:43,  2.02it/s]  

In [None]:
# Persist one row per looked-up id; the DataFrame index is not written.
pd.DataFrame(data=movie_results).to_csv(
    path_or_buf='../data/movie_results.csv',
    index=False,
)