In [29]:
import pandas as pd
import numpy as np
import sys

import os

from tqdm.notebook import tqdm
import json

In [30]:
sys.path.append('/'.join(os.getcwd().split("/")[:-1])+"/backend/")

In [31]:
from data_collection import MultiThreading

In [32]:
import tmdbsimple as tmdb
tmdb.API_KEY = '03ba24122703f9cff69789b9ebbd342e'

### Main Film Data

In [33]:
def get_movie_payload():
    
    movie_payload = {'keywords':{'movie_response':[],'cols':['id','keywords'],'results_parsed':[]},
                     'reviews':{'movie_response':[],'cols':['id','results'],'results_parsed':[]},
                     'info':{'movie_response':[],'cols':['id','budget', 'revenue','genres','production_countries','tagline'],'results_parsed':[]}}
    
    return movie_payload


In [34]:
class TMDBMovieScraper:
    def __init__(self, years_check: list):
        
        """
        This class uses the tmdbsimple library to query film information from tmdb.com
        The discover api first queries movies that were released in a year and then 
        the movies api queries more detail about films using the tmdb ids queried from the discover api
        
        
        """
        self.discover_api = tmdb.Discover() # instantiate tmdb.Discover module
        self.years_check = years_check # the years to ierate through
        self.discover_results = [] # results for
        self.movie_payload = get_movie_payload() # get an empty movie payload
        
    
    def _check_page_counts(self, discover_api, year: int):
        """
        Args:
            discover_api - instance of tmdb.Discover()
            year - a year to query
        Returns
            total_pages_to_loop - The number of response pages to iterate over
        """
        
        total_pages_to_loop = discover_api.movie(year=year,page=1)['total_pages']
        return total_pages_to_loop
    
    def _request_discover_data(self, years_check: list):
        
        """
        Args:
            years_check - list of years to iterate over
        Returns
            nothing
        """
        
        for year in tqdm(years_check): # for each year
            try:
                # check how many pages to loop over for the specific year
                total_pages_to_loop = self._check_page_counts(self.discover_api, year) 

                for page in tqdm(range(1,total_pages_to_loop)): # for each page in a given year
                    try:
                        # get movie results for that year and that page
                        movie_results = self.discover_api.movie(primary_release_year=year, 
                                                                page=page, 
                                                                with_original_language='en', 
                                                                include_adult=False, 
                                                                vote_count_gte=100)
                        
                        self.discover_results.append(movie_results['results']) # append results to a list
                    except:
                        pass
            except:
                pass
    
    def _request_movie_data(self, movie_ids: list):
        """
        Args:
            movie_ids - list of movie ids
        Returns
            Nothing
        """
        
        for movie_id in tqdm(movie_ids): # iterate over each movie id
            try:
                for k,v in self.movie_payload.items(): # iterate over each key and value in payload
                    v['movie_response'] = self._get_film_responses(movie_id,k) # get film response from api instance
                    
                    v['results_parsed'].append(self._parse_movie_response(v['movie_response'],v['cols']))
            except:
                pass

    
            
    def _transform_discover_results(self, discover_results):
        discover_df = pd.concat(pd.DataFrame(i) for i in discover_results)
        discover_df = discover_df[['id','title','overview','popularity', 'release_date','vote_average','poster_path']]
        discover_df['release_year'] = discover_df['release_date'].apply(lambda x: x.split("-")[0])
        
        return discover_df.set_index('id')
    
    def _transform_movie_results(self):
        results = [pd.DataFrame(self.movie_payload[k]['results_parsed']).set_index('id') for k,v in self.movie_payload.items()]
        self.movie_df = results[0].join(results[1]).join(results[2])
        return self.movie_df
    
    def _get_movie_ids(self, discover_dataframe):
        movie_ids = list(discover_dataframe.index)
        return list(set(movie_ids))
    
    def _get_film_response(self, movie_id):
        film_response = tmdb.Movies(movie_id)
        return film_response
    
    def _get_film_responses(self, movie_id, data_stream='info'):
        film_response = self._get_film_response(movie_id)
        if data_stream == 'reviews':
            return film_response.reviews()
        elif data_stream == 'info':
            return film_response.info()
        elif data_stream == 'keywords':
            return film_response.keywords()
        
    def _parse_movie_response(self, response_dic, cols_needed):
        return {k:v for k,v in response_dic.items() if k in cols_needed}
    

    def _dict_to_list(self, x, key_name):
        try:
            return [i[key_name] for i in x]
        except:
            return x
    
    def _merge_clean_and_filter(self):
        
        
        self.df = self.discover_df.join(self.movie_df)
        
        cols = {'results':'content', 'genres':'name', 'production_countries':'name', 'keywords':'name'}
        
        for k,v in cols.items():
            
            self.df[k] = self.df[k].apply(lambda x: self._dict_to_list(x,v))
            print(k,v)
            
        self.df = self.df.rename(columns={'id':'tmdb_id','title':'movie'})
        
        
        return self.df
    
                
    def get_movies(self):
        mt = MultiThreading(10,self.years_check, None)
        mt.Run(self._request_discover_data)
        
        
        self.discover_df = self._transform_discover_results(self.discover_results)
        self.movie_ids = self._get_movie_ids(self.discover_df)
        mt = MultiThreading(10,self.movie_ids,None)
        mt.Run(self._request_movie_data)
        
        self.movie_df = self._transform_movie_results()
        self._merge_clean_and_filter()
        
        
   

In [47]:
x = TMDBMovieScraper(list(range(1980,1984)))

In [48]:
x.get_movies()

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/257 [00:00<?, ?it/s]

  0%|          | 0/249 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

results content
genres name
production_countries name
keywords name


In [9]:
x.df.to_csv('movies.csv')

In [20]:
pd.read_pickle('/Users/saho/Documents/sam/imdb_actor_graph/cached_data/movies.pickle').head(2)

Unnamed: 0,m_tmdb_id,m_movie,m_overview,m_popularity,m_release_date,m_vote_average,m_poster_path,m_release_year,m_keywords,m_results,m_budget,m_genres,m_production_countries,m_revenue,m_tagline
0,12445,Harry Potter and the Deathly Hallows: Part 2,"Harry, Ron and Hermione continue their quest t...",182.818,2011-07-07,8.1,/c54HpQmuwXjHq2C9wmoACjxoom3.jpg,2011,"['saving the world', 'witch', 'self sacrifice'...","[""It is the quality of one's convictions that ...",125000000.0,"['Fantasy', 'Adventure']","['United Kingdom', 'United States of America']",1341511000.0,It all ends here.
1,49013,Cars 2,Star race car Lightning McQueen and his pal Ma...,169.573,2011-06-11,6.1,/okIz1HyxeVOMzYwwHUjH2pHi74I.jpg,2011,"['car race', 'sequel', 'anthropomorphism', 'be...",['Lasseter is smart enough to tell us a comple...,200000000.0,"['Animation', 'Family', 'Adventure', 'Comedy']",['United States of America'],559852400.0,Ka-ciao!


In [67]:
x = pd.read_csv('movies.csv',index_col=0).reset_index()
x.columns = ['m_'+i for i in x.columns]

In [68]:
x = x.rename(columns = {'m_id':'m_tmdb_id'})
x.to_pickle('/Users/saho/Documents/sam/imdb_actor_graph/cached_data/movies_new.pickle')

### Cast Data

In [54]:
import concurrent.futures

class TMDBCastCrewScraper(TMDBMovieScraper):
    
    def __init__(self, movie_ids: list, popularity_threshold = 1, max_threads=30):
        self.movie_ids = movie_ids
        self.cast_results = []
        self.crew_results = []
        self.popularity_threshold = popularity_threshold
        self.max_threads = max_threads


    def _append_data_to_list(self,list_dic_results, popularity_threshold, results, movie_id):
        for dic in list_dic_results:
            if dic['popularity'] >= self.popularity_threshold:
                dic['tmdb_id'] = movie_id
                results.append(dic)


    def _get_cast_crew_for_movie(self, movie_id):
        try:
            film_response = self._get_film_response(movie_id)
#             crew_dic_list = film_response.credits()['crew']
            cast_dic_list = film_response.credits()['cast']
#             self._append_data_to_list(crew_dic_list,self.popularity_threshold,self.crew_results,movie_id)
            self._append_data_to_list(cast_dic_list,self.popularity_threshold,self.cast_results,movie_id)

        except:
            pass

    def _get_cast_crew(self, movie_ids):
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_threads) as executor:
            futures = []
            for movie_id in movie_ids:
                futures.append(executor.submit(self._get_cast_crew_for_movie, movie_id))
            concurrent.futures.wait(futures)
        return


In [55]:
mod = TMDBCastCrewScraper(x.index)

In [56]:
mod._get_cast_crew(x.index)

In [63]:
df_cast = pd.DataFrame(mod.cast_results)

In [13]:
df_cast.to_csv('cast.csv',index=False)

In [64]:
df_cast.columns = ['c_'+i for i in df_cast.columns]

In [66]:
df_cast.to_pickle('/Users/saho/Documents/sam/imdb_actor_graph/cached_data/cast_new.pickle')

In [65]:
df_cast

Unnamed: 0,c_adult,c_gender,c_id,c_known_for_department,c_name,c_original_name,c_popularity,c_profile_path,c_cast_id,c_character,c_credit_id,c_order,c_tmdb_id
0,False,1,49,Acting,Maria Bello,Maria Bello,24.405,/it1f5mxiGIWO0DzTBfwtLTjphZb.jpg,5,Adèlle,52fe420fc3a36847f8000a93,0,17
1,False,2,48,Acting,Sean Bean,Sean Bean,36.837,/kTjiABk3TJ3yI0Cto5RsvyT6V3o.jpg,4,James,52fe420fc3a36847f8000a8f,1,17
2,False,2,51,Acting,Richard Elfyn,Richard Elfyn,1.349,/vaj3r7Wfezb5j2eVv7dIFiJXyyQ.jpg,6,Rowan,52fe420fc3a36847f8000a97,3,17
3,False,2,54,Acting,Maurice Roëves,Maurice Roëves,2.348,/3WT7GhYloKzgiizP5pahruFbBOD.jpg,7,Dafydd,52fe420fc3a36847f8000a9b,4,17
4,False,1,56,Acting,Sophie Stuckey,Sophie Stuckey,8.263,/47JMpGYirQe8C7neteGzaSVCKjc.jpg,9,Sarah,52fe420fc3a36847f8000aa3,5,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
116076,False,2,935612,Acting,Tomasz Sapryk,Tomasz Sapryk,2.260,/9YR9wo2Ri5GQlwwF4bnfoa3JU3j.jpg,26,Sierżant Styś,602e4691db4ed6003e6e6e17,14,10819
116077,False,0,965560,Acting,Jarosław Gruda,Jarosław Gruda,1.788,/zVHgfbE4a3KyYQnA2YWvbw6Ftt1.jpg,31,Wujek Rysiek,602e5111c3bffe004035d23a,19,10819
116078,False,0,136679,Acting,Agata Piotrowska,Agata Piotrowska,1.094,/b08BW9HsDKA6t6frwhuRvpTnDzn.jpg,34,Druhna,602e523a8e2ba6003f16eb2e,22,10819
116079,False,2,2129343,Acting,Zbigniew Paterak,Zbigniew Paterak,1.411,/kOl8rtonHbBAjTmoAfxjL47jP05.jpg,35,Stasiuk,602e525587e63e003f9544bd,23,10819


# Get Tag and Movie Poster
--

In [None]:
def get_film_response(film_id):
    film_response = tmdb.Movies(film_id)
    return film_response

In [None]:
def build_film_dic(film_id,film_response):
    film_dic = {}
    
    try:
        film_dic['tmdb_id'] = film_id
    except:
        film_dic['tmdb_id'] = np.nan
        
    try:
        film_dic['poster_path'] = 'https://image.tmdb.org/t/p/w500'+film_response.info()['poster_path']  
    except:
        film_dic['poster_path'] = np.nan
    
    try:
        film_dic['tagline'] = film_response.info()['tagline']   
    except:
        film_dic['tagline'] = np.nan
    
   
    
    return film_dic
    


In [None]:
all_results = []

In [None]:
def get_film_details(film_ids):
    
    for film_id in tqdm(film_ids):
      

        film_response = get_film_response(film_id)
        film_dic = build_film_dic(film_id,film_response)
        all_results.append(film_dic)

            
    return results

In [None]:
x = pd.read_json('/Users/sam.ho/Documents/sam_personal/streamlit_apps/imdb_network_analysis/data/main/tmdb_data_main.json')

In [None]:
x['tmdb_id'].to_list()

In [None]:
mt = MultiThreading(20,x['tmdb_id'].to_list(),all_results)

In [None]:
mt.Run(get_film_details)

In [None]:
y = pd.DataFrame(all_results)

In [None]:
df_use = pd.merge(x,y,how='left')

In [None]:
df_use.to_json('/Users/sam.ho/Documents/sam_personal/streamlit_apps/imdb_network_analysis/data/main/tmdb_data_main.json')

# New Class - check

In [None]:
import tmdb.simple as tmdb
import pandas as pd
from tqdm import tqdm


class TMDBMovieScraper:
    def __init__(self, years_check: list):
        """
        Initializes the TMDBMovieScraper class with a list of years to iterate over for querying movie data from TMDB.
        
        Args:
            years_check (list): List of years to iterate over for querying movie data.
        """
        self.discover_api = tmdb.Discover()  # Instantiate tmdb.Discover module
        self.years_check = years_check  # List of years to iterate over
        self.discover_results = []  # Results for movie discovery
        self.movie_payload = self.get_movie_payload()  # Get an empty movie payload

    def _check_page_counts(self, discover_api, year: int):
        """
        Helper method to check the number of response pages to iterate over for a specific year.
        
        Args:
            discover_api (tmdb.Discover): Instance of tmdb.Discover module.
            year (int): Year to query.
        
        Returns:
            total_pages_to_loop (int): Number of response pages to iterate over.
        """
        total_pages_to_loop = discover_api.movie(year=year, page=1)['total_pages']
        return total_pages_to_loop

    def _request_discover_data(self, years_check: list):
        """
        Helper method to query movie data using the discover API.
        
        Args:
            years_check (list): List of years to iterate over for querying movie data.
        """
        for year in tqdm(years_check):  # For each year
            try:
                total_pages_to_loop = self._check_page_counts(self.discover_api, year)  # Check number of pages

                for page in tqdm(range(1, total_pages_to_loop)):  # For each page in a given year
                    try:
                        movie_results = self.discover_api.movie(primary_release_year=year,
                                                                page=page,
                                                                with_original_language='en',
                                                                include_adult=False,
                                                                vote_count_gte=100)

                        self.discover_results.append(movie_results['results'])  # Append results to a list
                    except:
                        pass
            except:
                pass

    def _request_movie_data(self, movie_ids: list):
        """
        Helper method to query movie details using movie IDs.
        
        Args:
            movie_ids (list): List of movie IDs.
        """
        for movie_id in tqdm(movie_ids):  # Iterate over each movie ID
            try:
                for k, v in self.movie_payload.items():  # Iterate over each key and value in payload
                    v['movie_response'] = self._get_film_responses(movie_id, k)  # Get film response from API instance

                    v['results_parsed'].append(self._parse_movie_response(v['movie_response'], v['cols']))
            except:
                pass

    def _transform_discover_results(self, discover_results):
        """
        Helper method to transform discover results into a DataFrame.
        
        Args:
            discover_results: Results from movie discovery.
        
        Returns:
            discover_df (pd.DataFrame): Transformed DataFrame of discover results.
        """
        discover_df = pd.concat(pd.DataFrame(i) for i in discover_results)
        discover_df = discover_df[['id', 'title', 'overview', 'popularity', 'release_date', 'vote_average', 'poster_path']]
        discover_df['release_year'] = discover_df['release_date'].apply(lambda x: x.split("-")[0])
        
        return discover_df.set_index('id')

    def _transform_movie_results(self):
        """
        Helper method to transform movie results into a DataFrame.
        
        Returns:
            movie_df (pd.DataFrame): Transformed DataFrame of movie results.
        """
        results = [pd.DataFrame(self.movie_payload[k]['results_parsed']).set_index('id') for k, v in self.movie_payload.items()]
        movie_df = results[0].join(results[1]).join(results[2])
        return movie_df

    def _get_movie_ids(self, discover_dataframe):
        """
        Helper method to retrieve movie IDs from the discover DataFrame.
        
        Args:
            discover_dataframe (pd.DataFrame): DataFrame of discover results.
        
        Returns:
            movie_ids (list): List of movie IDs.
        """
        movie_ids = list(discover_dataframe.index)
        return list(set(movie_ids))

    def _get_film_response(self, movie_id):
        """
        Helper method to get film response from TMDB API.
        
        Args:
            movie_id: ID of the movie.
        
        Returns:
            film_response: Film response from TMDB API.
        """
        film_response = tmdb.Movies(movie_id)
        return film_response

    def _get_film_responses(self, movie_id, data_stream='info'):
        """
        Helper method to get film responses based on data stream type.
        
        Args:
            movie_id: ID of the movie.
            data_stream (str): Data stream type.
        
        Returns:
            film_response: Film response based on the specified data stream.
        """
        film_response = self._get_film_response(movie_id)
        if data_stream == 'reviews':
            return film_response.reviews()
        elif data_stream == 'info':
            return film_response.info()
        elif data_stream == 'keywords':
            return film_response.keywords()

    def _parse_movie_response(self, response_dic, cols_needed):
        """
        Helper method to parse movie response based on the required columns.
        
        Args:
            response_dic: Movie response dictionary.
            cols_needed (list): Required columns.
        
        Returns:
            parsed_response (dict): Parsed movie response containing the required columns.
        """
        parsed_response = {k: v for k, v in response_dic.items() if k in cols_needed}
        return parsed_response

    def _dict_to_list(self, x, key_name):
        """
        Helper method to convert a dictionary to a list based on a specific key.
        
        Args:
            x: Input dictionary.
            key_name (str): Key name.
        
        Returns:
            converted_list: List converted from the dictionary.
        """
        try:
            return [i[key_name] for i in x]
        except:
            return x

    def _merge_clean_and_filter(self):
        """
        Helper method to merge, clean, and filter the movie DataFrame.
        
        Returns:
            df (pd.DataFrame): Merged, cleaned, and filtered DataFrame.
        """
        self.df = self.discover_df.join(self.movie_df)
        
        cols = {'results': 'content', 'genres': 'name', 'production_countries': 'name', 'keywords': 'name'}
        
        for k, v in cols.items():
            self.df[k] = self.df[k].apply(lambda x: self._dict_to_list(x, v))
            print(k, v)
            
        self.df = self.df.rename(columns={'id': 'tmdb_id', 'title': 'movie'})
        
        return self.df

    def get_movies(self):
        """
        Method to initiate the data retrieval process.
        """
        mt = MultiThreading(10, self.years_check, None)
        mt.Run(self._request_discover_data)
        
        self.discover_df = self._
