In [1]:
import pandas as pd
import numpy as np
import sys

import os

from tqdm.notebook import tqdm
import json

In [2]:
sys.path.append('/'.join(os.getcwd().split("/")[:-1])+"/backend/")

In [3]:
from data_collection import MultiThreading

In [4]:
import tmdbsimple as tmdb
tmdb.API_KEY = '03ba24122703f9cff69789b9ebbd342e'

### Main Film Data

In [12]:
def get_movie_payload():
    
    movie_payload = {'keywords':{'movie_response':[],'cols':['id','keywords'],'results_parsed':[]},
                     'reviews':{'movie_response':[],'cols':['id','results'],'results_parsed':[]},
                     'info':{'movie_response':[],'cols':['id','budget', 'revenue','genres','production_countries','tagline'],'results_parsed':[]}}
    
    return movie_payload


In [5]:
class TMDBMovieScraper:
    def __init__(self, years_check: list):
        
        """
        This class uses the tmdbsimple library to query film information from tmdb.com
        The discover api first queries movies that were released in a year and then 
        the movies api queries more detail about films using the tmdb ids queried from the discover api
        
        
        """
        self.discover_api = tmdb.Discover() # instantiate tmdb.Discover module
        self.years_check = years_check # the years to ierate through
        self.discover_results = [] # results for
        self.movie_payload = get_movie_payload() # get an empty movie payload
        
    
    def _check_page_counts(self, discover_api, year: int):
        """
        Args:
            discover_api - instance of tmdb.Discover()
            year - a year to query
        Returns
            total_pages_to_loop - The number of response pages to iterate over
        """
        
        total_pages_to_loop = discover_api.movie(year=year,page=1)['total_pages']
        return total_pages_to_loop
    
    def _request_discover_data(self, years_check: list):
        
        """
        Args:
            years_check - list of years to iterate over
        Returns
            nothing
        """
        
        for year in tqdm(years_check): # for each year
            try:
                # check how many pages to loop over for the specific year
                total_pages_to_loop = self._check_page_counts(self.discover_api, year) 

                for page in tqdm(range(1,total_pages_to_loop)): # for each page in a given year
                    try:
                        # get movie results for that year and that page
                        movie_results = self.discover_api.movie(primary_release_year=year, 
                                                                page=page, 
                                                                with_original_language='en', 
                                                                include_adult=False, 
                                                                vote_count_gte=100)
                        
                        self.discover_results.append(movie_results['results']) # append results to a list
                    except:
                        pass
            except:
                pass
    
    def _request_movie_data(self, movie_ids: list):
        """
        Args:
            movie_ids - list of movie ids
        Returns
            Nothing
        """
        
        for movie_id in tqdm(movie_ids): # iterate over each movie id
            try:
                for k,v in self.movie_payload.items(): # iterate over each key and value in payload
                    v['movie_response'] = self._get_film_responses(movie_id,k) # get film response from api instance
                    
                    v['results_parsed'].append(self._parse_movie_response(v['movie_response'],v['cols']))
            except:
                pass

    
            
    def _transform_discover_results(self, discover_results):
        discover_df = pd.concat(pd.DataFrame(i) for i in discover_results)
        discover_df = discover_df[['id','title','overview','popularity', 'release_date','vote_average','poster_path']]
        discover_df['release_year'] = discover_df['release_date'].apply(lambda x: x.split("-")[0])
        
        return discover_df.set_index('id')
    
    def _transform_movie_results(self):
        results = [pd.DataFrame(self.movie_payload[k]['results_parsed']).set_index('id') for k,v in self.movie_payload.items()]
        self.movie_df = results[0].join(results[1]).join(results[2])
        return self.movie_df
    
    def _get_movie_ids(self, discover_dataframe):
        movie_ids = list(discover_dataframe.index)
        return list(set(movie_ids))
    
    def _get_film_response(self, movie_id):
        film_response = tmdb.Movies(movie_id)
        return film_response
    
    def _get_film_responses(self, movie_id, data_stream='info'):
        film_response = self._get_film_response(movie_id)
        if data_stream == 'reviews':
            return film_response.reviews()
        elif data_stream == 'info':
            return film_response.info()
        elif data_stream == 'keywords':
            return film_response.keywords()
        
    def _parse_movie_response(self, response_dic, cols_needed):
        return {k:v for k,v in response_dic.items() if k in cols_needed}
    

    def _dict_to_list(self, x, key_name):
        try:
            return [i[key_name] for i in x]
        except:
            return x
    
    def _merge_clean_and_filter(self):
        
        
        self.df = self.discover_df.join(self.movie_df)
        
        cols = {'results':'content', 'genres':'name', 'production_countries':'name', 'keywords':'name'}
        
        for k,v in cols.items():
            
            self.df[k] = self.df[k].apply(lambda x: self._dict_to_list(x,v))
            print(k,v)
            
        self.df = self.df.rename(columns={'id':'tmdb_id','title':'movie'})
        
        
        return self.df
    
                
    def get_movies(self):
        mt = MultiThreading(10,self.years_check, None)
        mt.Run(self._request_discover_data)
        
        
        self.discover_df = self._transform_discover_results(self.discover_results)
        self.movie_ids = self._get_movie_ids(self.discover_df)
        mt = MultiThreading(10,self.movie_ids,None)
        mt.Run(self._request_movie_data)
        
        self.movie_df = self._transform_movie_results()
        self._merge_clean_and_filter()
        
        
   

In [27]:
x = TMDBMovieScraper(list(range(1990,1991)))

In [28]:
x.get_movies()

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/311 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

results content
genres name
production_countries name
keywords name


In [12]:
x.df

In [13]:
x.df.to_csv('movies.csv')

In [6]:
x = pd.read_csv('movies.csv',index_col=0)
x

Unnamed: 0_level_0,movie,overview,popularity,release_date,vote_average,poster_path,release_year,keywords,results,budget,genres,production_countries,revenue,tagline
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
12445,Harry Potter and the Deathly Hallows: Part 2,"Harry, Ron and Hermione continue their quest t...",182.818,2011-07-07,8.1,/c54HpQmuwXjHq2C9wmoACjxoom3.jpg,2011,"['saving the world', 'witch', 'self sacrifice'...","[""It is the quality of one's convictions that ...",125000000.0,"['Fantasy', 'Adventure']","['United Kingdom', 'United States of America']",1.341511e+09,It all ends here.
49013,Cars 2,Star race car Lightning McQueen and his pal Ma...,169.573,2011-06-11,6.1,/okIz1HyxeVOMzYwwHUjH2pHi74I.jpg,2011,"['car race', 'sequel', 'anthropomorphism', 'be...",['Lasseter is smart enough to tell us a comple...,200000000.0,"['Animation', 'Family', 'Adventure', 'Comedy']",['United States of America'],5.598524e+08,Ka-ciao!
50014,The Help,Aibileen Clark is a middle-aged African-Americ...,168.317,2011-08-09,8.2,/3kmfoWWEc9Vtyuaf9v5VipRgdjx.jpg,2011,"['mississippi river', 'based on novel or book'...","['Great setting, cast, story and performances....",25000000.0,['Drama'],['United States of America'],2.166391e+08,Change begins with a whisper.
1865,Pirates of the Caribbean: On Stranger Tides,Captain Jack Sparrow crosses paths with a woma...,160.529,2011-05-14,6.5,/keGfSvCmYj7CvdRx36OdVrAEibE.jpg,2011,"['england', 'spain', 'sea', 'captain', 'mutiny...",['More of the same ... but it is not funny any...,380000000.0,"['Adventure', 'Action', 'Fantasy']","['United Kingdom', 'United States of America']",1.045714e+09,Live Forever Or Die Trying.
39254,Real Steel,Charlie Kenton is a washed-up fighter who reti...,130.738,2011-09-28,7.0,/4GIeI5K5YdDUkR3mNQBoScpSFEf.jpg,2011,"['parent child relationship', 'fight', 'future...","['No splitting this Atom, it has got a rock so...",110000000.0,"['Action', 'Science Fiction', 'Drama']","['United States of America', 'India']",2.992685e+08,"If you get one shot, make it real."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
818612,Wedding Season,Pressured by their immigrant parents to find s...,11.387,2022-08-04,6.3,/mFeXAZ1oOECPqEu8c2i4L5LmNyY.jpg,2022,[],[],0.0,"['Romance', 'Comedy']","['Canada', 'United States of America']",0.000000e+00,
973164,Ricky Gervais: SuperNature,"With his signature pitch-black sense of humor,...",10.436,2022-05-24,7.3,/ry2HDP5KyEV3R7htXlFAeQbbXgf.jpg,2022,['stand-up comedy'],[],0.0,['Comedy'],['United Kingdom'],0.000000e+00,
957258,What is a Woman?,Political commentator Matt Walsh explores the ...,10.299,2022-06-01,6.9,/iiP8Sq7yWoYsKx9EVfyRPno8Un9.jpg,2022,"['womanhood', 'gender', 'social documentary', ...",[],0.0,['Documentary'],['United States of America'],0.000000e+00,One man's journey to answer the question of a ...
971961,Gabriel's Rapture: Part III,In the sixth installment of the Gabriel's Infe...,9.950,2022-08-12,8.1,/vyLRyHRrPy9zTf6t0sS4aTbF9gl.jpg,2022,[],[],0.0,['Romance'],[],0.000000e+00,


### Cast Data

In [45]:
import concurrent.futures

class TMDBCastCrewScraper(TMDBMovieScraper):
    
    def __init__(self, movie_ids: list, popularity_threshold = 25, max_threads=30):
        self.movie_ids = movie_ids
        self.cast_results = []
        self.crew_results = []
        self.popularity_threshold = popularity_threshold
        self.max_threads = max_threads


    def _append_data_to_list(self,list_dic_results, popularity_threshold, results, movie_id):
        for dic in list_dic_results:
            if dic['popularity'] >= self.popularity_threshold:
                dic['tmdb_id'] = movie_id
                results.append(dic)


    def _get_cast_crew_for_movie(self, movie_id):
        try:
            film_response = self._get_film_response(movie_id)
#             crew_dic_list = film_response.credits()['crew']
            cast_dic_list = film_response.credits()['cast']
#             self._append_data_to_list(crew_dic_list,self.popularity_threshold,self.crew_results,movie_id)
            self._append_data_to_list(cast_dic_list,self.popularity_threshold,self.cast_results,movie_id)

        except:
            pass

    def _get_cast_crew(self, movie_ids):
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_threads) as executor:
            futures = []
            for movie_id in movie_ids:
                futures.append(executor.submit(self._get_cast_crew_for_movie, movie_id))
            concurrent.futures.wait(futures)
        return


In [46]:
mod = TMDBCastCrewScraper(x.index)

In [47]:
mod._get_cast_crew(x.index)

In [48]:
df_cast = pd.DataFrame(mod.cast_results)

In [55]:
df_cast.to_csv('cast.csv',index=False)

# Get Tag and Movie Poster
--

In [None]:
def get_film_response(film_id):
    film_response = tmdb.Movies(film_id)
    return film_response

In [None]:
def build_film_dic(film_id,film_response):
    film_dic = {}
    
    try:
        film_dic['tmdb_id'] = film_id
    except:
        film_dic['tmdb_id'] = np.nan
        
    try:
        film_dic['poster_path'] = 'https://image.tmdb.org/t/p/w500'+film_response.info()['poster_path']  
    except:
        film_dic['poster_path'] = np.nan
    
    try:
        film_dic['tagline'] = film_response.info()['tagline']   
    except:
        film_dic['tagline'] = np.nan
    
   
    
    return film_dic
    


In [None]:
all_results = []

In [None]:
def get_film_details(film_ids):
    
    for film_id in tqdm(film_ids):
      

        film_response = get_film_response(film_id)
        film_dic = build_film_dic(film_id,film_response)
        all_results.append(film_dic)

            
    return results

In [None]:
x = pd.read_json('/Users/sam.ho/Documents/sam_personal/streamlit_apps/imdb_network_analysis/data/main/tmdb_data_main.json')

In [None]:
x['tmdb_id'].to_list()

In [None]:
mt = MultiThreading(20,x['tmdb_id'].to_list(),all_results)

In [None]:
mt.Run(get_film_details)

In [None]:
y = pd.DataFrame(all_results)

In [None]:
df_use = pd.merge(x,y,how='left')

In [None]:
df_use.to_json('/Users/sam.ho/Documents/sam_personal/streamlit_apps/imdb_network_analysis/data/main/tmdb_data_main.json')