In [1]:
import json
import logging
import os
import sys
import threading

import numpy as np
import pandas as pd

In [2]:
class MultiThreading:
    """Run a function over an iteration list split across several threads.

    The iteration list is divided into roughly equal batches and ``function``
    is called once per batch, each call in its own ``threading.Thread``.
    """

    def __init__(self, threads, iteration_list, output=None):
        """
        Args:
            threads (int): Number of batches (and therefore threads) to use.
            iteration_list: Sequence of items to distribute across threads.
            output (list, optional): Container stored as ``self.output``.
                A new empty list is created per instance when omitted.
        """
        self.threads = threads
        # Create the list here rather than in the signature: a mutable default
        # ([]) would be shared by every instance built without ``output``.
        self.output = [] if output is None else output
        self.iteration_list = iteration_list

    def multi_thread_compile(self, thread_count, function):
        """Split the iteration list into batches and build one thread per batch.

        Args:
            thread_count (int): Maximum number of batches to create.
            function (callable): Called as ``function(batch)`` in each thread,
                where ``batch`` is a list of items.

        Returns:
            list: Unstarted ``threading.Thread`` objects, one per non-empty batch.

        Raises:
            ValueError: If ``thread_count`` is smaller than 1.
        """
        if thread_count < 1:
            raise ValueError("thread_count must be at least 1")

        # np.array_split tolerates uneven splits; .tolist() turns each chunk
        # back into a plain Python list. Note that the items pass through a
        # NumPy array on the way.
        batches = [chunk.tolist() for chunk in np.array_split(self.iteration_list, thread_count)]

        # When there are fewer items than threads some batches are empty;
        # starting a thread for them would be pure overhead.
        return [threading.Thread(target=function, args=[batch]) for batch in batches if batch]

    def multi_thread_execute(self, jobs):
        """Start every thread in ``jobs`` and block until all have finished."""
        for job in jobs:
            job.start()

        # Ensure all of the threads have finished before returning.
        for job in jobs:
            job.join()

    def Run(self, function):
        """Build the threads for ``function`` and run them to completion."""
        jobs = self.multi_thread_compile(self.threads, function)
        self.multi_thread_execute(jobs)


In [3]:
import os

import tmdbsimple as tmdb

# Read the TMDB API key from the environment instead of committing it to the
# notebook. NOTE(review): the key that used to be hardcoded here has been
# exposed in the notebook's history and should be revoked/rotated.
_api_key = os.environ.get("TMDB_API_KEY")
if not _api_key:
    raise RuntimeError("Set the TMDB_API_KEY environment variable before running this notebook.")
tmdb.API_KEY = _api_key

# New Class - check

In [17]:
import tmdbsimple as tmdb

import pandas as pd
from tqdm import tqdm


def get_movie_payload():
    """Build a fresh, empty payload describing each per-movie data stream.

    Returns:
        dict: One entry per stream ('keywords', 'reviews', 'info'). Each entry
        holds an empty 'movie_response' list, the 'cols' to keep from the
        response, and an empty 'results_parsed' list.
    """
    # Columns to keep from each stream's response.
    wanted_columns = {
        'keywords': ['id', 'keywords'],
        'reviews': ['id', 'results'],
        'info': ['id', 'budget', 'revenue', 'genres', 'production_countries', 'tagline'],
    }

    return {
        stream: {'movie_response': [], 'cols': columns, 'results_parsed': []}
        for stream, columns in wanted_columns.items()
    }



class TMDBMovieScraper:
    """Scrape movie metadata from TMDB for a list of release years.

    The workflow (see ``get_movies``) is: query the discover endpoint for each
    year, collect the unique movie ids, fetch keywords/reviews/info for every
    id, and merge everything into one DataFrame indexed by movie id.
    """

    # Filters sent with every discover request, including the page-count probe,
    # so that the reported page count matches the pages actually fetched.
    _DISCOVER_FILTERS = {
        'with_original_language': 'en',
        'include_adult': False,
        'vote_count_gte': 100,
    }

    # Columns kept from the raw discover results.
    _DISCOVER_COLUMNS = ['id', 'title', 'overview', 'popularity', 'release_date', 'vote_average', 'poster_path']

    _logger = logging.getLogger(__name__)

    def __init__(self, years_check: list, max_threads: int = 10):
        """
        Initializes the scraper with the years to query.

        Args:
            years_check (list): List of years to iterate over for querying movie data.
            max_threads (int): Number of threads used by ``get_movies`` for each
                download stage. Defaults to 10.
        """
        self.discover_api = tmdb.Discover()  # Instantiate tmdb.Discover module
        self.years_check = years_check  # List of years to iterate over
        self.max_threads = max_threads
        self.discover_results = []  # One list of result dicts per fetched page
        self.movie_payload = get_movie_payload()  # Get an empty movie payload

    def _check_page_counts(self, discover_api, year: int):
        """
        Return how many discover pages exist for ``year``.

        The probe uses the same release-year argument and filters as the real
        page requests; otherwise the reported count would describe a different
        (much larger) result set.

        Args:
            discover_api (tmdb.Discover): Instance of tmdb.Discover module.
            year (int): Year to query.

        Returns:
            total_pages_to_loop (int): Number of response pages to iterate over.
        """
        first_page = discover_api.movie(primary_release_year=year, page=1, **self._DISCOVER_FILTERS)
        return first_page['total_pages']

    def _request_discover_data(self, years_check: list):
        """
        Query the discover API for every page of every year.

        Each page's ``'results'`` list is appended to ``self.discover_results``.
        Failures are logged and skipped so one bad request does not abort the
        remaining years/pages.

        Args:
            years_check (list): List of years to iterate over for querying movie data.
        """
        for year in years_check:
            try:
                total_pages_to_loop = self._check_page_counts(self.discover_api, year)
            except Exception as exc:
                self._logger.warning("Could not get the page count for year %s: %s", year, exc)
                continue

            # Pages are numbered from 1 and the last page is total_pages
            # itself, hence the inclusive upper bound.
            for page in range(1, total_pages_to_loop + 1):
                try:
                    movie_results = self.discover_api.movie(primary_release_year=year,
                                                            page=page,
                                                            **self._DISCOVER_FILTERS)
                    self.discover_results.append(movie_results['results'])
                except Exception as exc:
                    self._logger.warning("Discover request failed for year %s, page %s: %s", year, page, exc)

    def _request_movie_data(self, movie_ids: list):
        """
        Fetch and parse every payload stream for each movie id.

        Parsed rows are appended to ``self.movie_payload[stream]['results_parsed']``.
        If any stream fails for a movie, the failure is logged and the remaining
        streams for that movie are skipped.

        Args:
            movie_ids (list): List of movie IDs.
        """
        for movie_id in movie_ids:
            for stream, payload in self.movie_payload.items():
                try:
                    # Keep the response in a local variable: this method runs in
                    # several threads sharing self.movie_payload, so parsing
                    # from the shared dict could pick up another thread's data.
                    response = self._get_film_responses(movie_id, stream)
                    parsed = self._parse_movie_response(response, payload['cols'])
                except Exception as exc:
                    self._logger.warning("Could not fetch '%s' for movie %s: %s", stream, movie_id, exc)
                    break

                payload['movie_response'] = response  # most recent raw response
                payload['results_parsed'].append(parsed)

    def _transform_discover_results(self, discover_results):
        """
        Transform raw discover pages into a DataFrame indexed by movie id.

        Args:
            discover_results: List of pages, each a list of result dicts.

        Returns:
            discover_df (pd.DataFrame): Selected columns plus 'release_year',
            indexed by 'id'. Empty (with the same columns) when there are no results.
        """
        frames = [pd.DataFrame(page) for page in discover_results if page]
        if not frames:
            # pd.concat raises on an empty list; return an empty, well-formed frame.
            empty = pd.DataFrame(columns=self._DISCOVER_COLUMNS + ['release_year'])
            return empty.set_index('id')

        # .copy() so that adding a column does not write into a slice.
        discover_df = pd.concat(frames)[self._DISCOVER_COLUMNS].copy()
        discover_df['release_year'] = discover_df['release_date'].apply(
            lambda date: date.split("-")[0] if isinstance(date, str) else date
        )

        return discover_df.set_index('id')

    def _transform_movie_results(self):
        """
        Join the parsed rows of every payload stream into one DataFrame.

        Returns:
            movie_df (pd.DataFrame): Streams left-joined on 'id', in payload order.
        """
        frames = []
        for payload in self.movie_payload.values():
            parsed = payload['results_parsed']
            # With no rows there is no 'id' column to index on, so build the
            # empty frame from the expected columns instead.
            frame = pd.DataFrame(parsed) if parsed else pd.DataFrame(columns=payload['cols'])
            frames.append(frame.set_index('id'))

        movie_df = frames[0]
        for frame in frames[1:]:
            movie_df = movie_df.join(frame)
        return movie_df

    def _get_movie_ids(self, discover_dataframe):
        """
        Retrieve the unique movie IDs from the discover DataFrame's index.

        Args:
            discover_dataframe (pd.DataFrame): DataFrame of discover results.

        Returns:
            movie_ids (list): Unique movie IDs in first-seen order.
        """
        return list(dict.fromkeys(discover_dataframe.index))

    def _get_film_response(self, movie_id):
        """
        Create the tmdb.Movies handle for a movie.

        Args:
            movie_id: ID of the movie.

        Returns:
            film_response: ``tmdb.Movies`` object for ``movie_id``.
        """
        return tmdb.Movies(movie_id)

    def _get_film_responses(self, movie_id, data_stream='info'):
        """
        Fetch one data stream for a movie.

        Args:
            movie_id: ID of the movie.
            data_stream (str): One of 'reviews', 'info' or 'keywords'.

        Returns:
            film_response: Response for the requested data stream.

        Raises:
            ValueError: If ``data_stream`` is not a supported stream name.
        """
        film_response = self._get_film_response(movie_id)
        if data_stream == 'reviews':
            return film_response.reviews()
        elif data_stream == 'info':
            return film_response.info()
        elif data_stream == 'keywords':
            return film_response.keywords()
        raise ValueError(f"Unknown data stream: {data_stream!r}")

    def _parse_movie_response(self, response_dic, cols_needed):
        """
        Keep only the required keys of a movie response.

        Args:
            response_dic: Movie response dictionary.
            cols_needed (list): Required columns.

        Returns:
            parsed_response (dict): Entries of ``response_dic`` whose key is in ``cols_needed``.
        """
        return {k: v for k, v in response_dic.items() if k in cols_needed}

    def _dict_to_list(self, x, key_name):
        """
        Pull ``key_name`` out of every dict in a list of dicts.

        Args:
            x: Iterable of dictionaries (e.g. a list of genre dicts).
            key_name (str): Key to extract from each dictionary.

        Returns:
            list of extracted values, or ``x`` unchanged when it is not an
            iterable of dicts containing ``key_name`` (e.g. a missing value).
        """
        try:
            return [item[key_name] for item in x]
        except Exception:
            return x

    def _merge_clean_and_filter(self):
        """
        Merge discover and movie data and flatten the nested columns.

        Returns:
            df (pd.DataFrame): Merged DataFrame with nested dict lists reduced
            to lists of values and 'title' renamed to 'movie'.
        """
        self.df = self.discover_df.join(self.movie_df)

        # column name -> key to extract from each nested dict in that column
        cols = {'results': 'content', 'genres': 'name', 'production_countries': 'name', 'keywords': 'name'}

        for column, key_name in cols.items():
            self.df[column] = self.df[column].apply(
                lambda x, key_name=key_name: self._dict_to_list(x, key_name)
            )

        # NOTE(review): 'id' is the index at this point, so only the
        # 'title' -> 'movie' rename takes effect. Left as-is to keep the
        # output schema unchanged.
        self.df = self.df.rename(columns={'id': 'tmdb_id', 'title': 'movie'})

        return self.df

    def get_movies(self):
        """
        Run the full retrieval pipeline and return the merged DataFrame.

        Returns:
            df_final (pd.DataFrame): One row per discover result with the
            per-movie keywords, reviews and info joined on the movie id.
        """
        print(f'Getting data for {self.years_check}')

        # Start from a clean slate so that calling get_movies() again does not
        # append to (and duplicate) the results of a previous run.
        self.discover_results = []
        self.movie_payload = get_movie_payload()

        mt = MultiThreading(self.max_threads, self.years_check, None)
        mt.Run(self._request_discover_data)

        self.discover_df = self._transform_discover_results(self.discover_results)
        self.movie_ids = self._get_movie_ids(self.discover_df)
        print(f'Getting movie data for {len(self.movie_ids)} movies..')
        mt = MultiThreading(self.max_threads, self.movie_ids, None)
        mt.Run(self._request_movie_data)

        self.movie_df = self._transform_movie_results()
        self.df_final = self._merge_clean_and_filter()
        print('Done')

        return self.df_final
        

        
        
        


In [18]:
# Scrape the movies released in 1999 and 2000 into a single DataFrame.
ts = TMDBMovieScraper(years_check=[1999, 2000])
df = ts.get_movies()

Getting data for [1999, 2000]
Getting movie data for 378 movies..
Done


In [19]:
# Preview the first rows of the merged movie table returned by get_movies().
df.head()

Unnamed: 0_level_0,movie,overview,popularity,release_date,vote_average,poster_path,release_year,keywords,results,budget,genres,production_countries,revenue,tagline
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
497,The Green Mile,A supernatural tale set on death row in a Sout...,77.366,1999-12-10,8.5,/o0lO84GI7qrG6XFvtsPOSV7CTNa.jpg,1999,"[southern usa, mentally disabled, based on nov...",[],60000000.0,"[Fantasy, Drama, Crime]",[United States of America],286801374.0,Miracles do happen.
863,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy...",69.825,1999-10-30,7.6,/2MFIhZAW0CVlEQrFyqwa4U6zqJP.jpg,1999,"[museum, prosecution, identity crisis, airplan...","[When I was a kid, I remember thinking that _T...",90000000.0,"[Animation, Comedy, Family]",[United States of America],497366869.0,The toys are back!
550,Fight Club,A ticking-time-bomb insomniac and a slippery s...,60.129,1999-10-15,8.4,/a26cQPRhJPX6GbWfQbvZdrrp9j9.jpg,1999,"[based on novel or book, support group, dual i...",[Pretty awesome movie. It shows what one craz...,63000000.0,"[Drama, Thriller, Comedy]",[United States of America],100853753.0,Mischief. Mayhem. Soap.
603,The Matrix,"Set in the 22nd century, The Matrix tells the ...",57.101,1999-03-30,8.2,/aOIuZAjPaRIE6CMzbazvcHuHXDc.jpg,1999,"[saving the world, artificial intelligence, ma...",[The Martix is a great example of a movie that...,63000000.0,"[Action, Science Fiction]",[United States of America],463517383.0,Welcome to the Real World.
564,The Mummy,Dashing legionnaire Rick O'Connell stumbles up...,56.262,1999-04-16,6.9,/yhIsVvcUm7QxzLfT6HW2wLf5ajY.jpg,1999,"[library, secret passage, cairo, egypt, pastor...","[Trying to cram action, adventure, fantasy, ro...",80000000.0,"[Adventure, Action, Fantasy]",[United States of America],415885488.0,The legend you know. The adventure you have ye...


In [25]:
import concurrent.futures

class TMDBCastCrewScraper(TMDBMovieScraper):
    """Collect cast members from TMDB for a list of movie ids.

    Only ``_get_film_response`` is used from the parent class here.
    NOTE(review): crew collection is currently disabled; only the 'cast'
    part of each movie's credits is gathered, and ``crew_results`` stays empty.
    """

    def __init__(self, movie_ids: list, popularity_threshold = 1, max_threads=30):
        """
        Args:
            movie_ids (list): Movie IDs whose credits should be fetched.
            popularity_threshold: Minimum 'popularity' a person needs to be kept.
            max_threads (int): Maximum number of worker threads for the requests.
        """
        # The parent initializer is not invoked here, so none of its
        # attributes are created on this instance.
        self.movie_ids = movie_ids
        self.cast_results = []
        self.crew_results = []
        self.popularity_threshold = popularity_threshold
        self.max_threads = max_threads

    def _append_data_to_list(self,list_dic_results, popularity_threshold, results, movie_id):
        """
        Append the sufficiently popular people of one movie to ``results``.

        Each kept dict is tagged in place with ``'tmdb_id' = movie_id``.

        Args:
            list_dic_results: Iterable of person dicts from a credits response.
            popularity_threshold: Minimum 'popularity' required to keep a person.
            results (list): List that receives the kept dicts.
            movie_id: ID of the movie the people belong to.
        """
        for dic in list_dic_results:
            # An entry without a popularity value is skipped on its own rather
            # than aborting the rest of the movie's list.
            popularity = dic.get('popularity')
            if popularity is not None and popularity >= popularity_threshold:
                dic['tmdb_id'] = movie_id
                results.append(dic)

    def _get_cast_crew_for_movie(self, movie_id):
        """
        Fetch the credits of one movie and store its cast in ``self.cast_results``.

        Failures are logged and swallowed so that one bad movie does not stop
        the remaining downloads.

        Args:
            movie_id: ID of the movie.
        """
        try:
            film_response = self._get_film_response(movie_id)
            cast_dic_list = film_response.credits()['cast']
            self._append_data_to_list(cast_dic_list, self.popularity_threshold, self.cast_results, movie_id)
        except Exception as exc:
            logging.getLogger(__name__).warning("Could not fetch cast for movie %s: %s", movie_id, exc)

    def _request_cast_crew_data(self, movie_ids):
        """
        Fetch the credits of every movie id using a thread pool.

        Args:
            movie_ids: Iterable of movie IDs.
        """
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_threads) as executor:
            futures = [executor.submit(self._get_cast_crew_for_movie, movie_id) for movie_id in movie_ids]
            concurrent.futures.wait(futures)

    def get_cast_crew(self):
        """
        Download the cast for all movie ids and build ``self.df``.

        Returns:
            pd.DataFrame: One row per kept cast member (also stored as ``self.df``).
        """
        # Reset so that a second call does not duplicate the rows of the first.
        self.cast_results = []
        self._request_cast_crew_data(self.movie_ids)
        self.df = pd.DataFrame(self.cast_results)
        return self.df
        

# Collect the cast for every movie id in the scraped movie table.
mod = TMDBCastCrewScraper(movie_ids=df.index)
mod.get_cast_crew()

In [26]:
# Inspect the cast table that get_cast_crew() stored on the scraper.
mod.df

Unnamed: 0,adult,gender,id,known_for_department,name,original_name,popularity,profile_path,cast_id,character,credit_id,order,tmdb_id
0,False,2,21593,Acting,Jason Biggs,Jason Biggs,14.776,/3nRgOQylYbNuDiddtNoh8uWTz5P.jpg,16,Jim Levenstein,52fe4336c3a36847f8042cb7,0,2105
1,False,2,21594,Acting,Chris Klein,Chris Klein,9.291,/56kCtPVfvjm5IRPa2GyaQHTPIOy.jpg,17,Chris 'Oz' Ostreicher,52fe4336c3a36847f8042cbb,1,2105
2,False,2,21403,Acting,Thomas Ian Nicholas,Thomas Ian Nicholas,7.300,/hQo2P9vwjJW6A1b327nl71a2JEW.jpg,18,Kevin Meyers,52fe4336c3a36847f8042cbf,2,2105
3,False,1,21595,Acting,Alyson Hannigan,Alyson Hannigan,25.218,/bO16z8rAzZWdjCga8dcbJ2AFAh2.jpg,19,Michelle Flaherty,52fe4336c3a36847f8042cc3,3,2105
4,False,1,21596,Acting,Shannon Elizabeth,Shannon Elizabeth,15.261,/NttGD7i2OgoMBZPDmpaSfoUUAm.jpg,20,Nadia,52fe4336c3a36847f8042cc7,4,2105
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9060,False,1,90131,Acting,Anne Marie DeLuise,Anne Marie DeLuise,4.978,/xc3k6XR3iExPXW0GZlXzRPOkDSH.jpg,3,Drew McDonald,52fe461e9251416c91049b77,3,36968
9061,False,2,46899,Acting,Garwin Sanford,Garwin Sanford,4.269,/uVhyU0G7LQUHdfRdr2mSmEbzR7B.jpg,4,Richie,52fe461e9251416c91049b7b,4,36968
9062,False,2,85922,Acting,Tom Butler,Tom Butler,7.223,/sMDuFSvna90gGV0jkOK7PDzQC8g.jpg,5,Phil,52fe461e9251416c91049b7f,5,36968
9063,False,1,116571,Acting,Jillian Fargey,Jillian Fargey,2.196,,7,Ellen,52fe461e9251416c91049b87,6,36968
