In [None]:
import os
import re
import time

import pandas as pd
import requests
# TMDB API key. Read from the TMDB_API_KEY environment variable so the secret is
# never stored in the notebook; falls back to '' (the previous placeholder value).
api_key = os.environ.get('TMDB_API_KEY', '')

#  **Extracting Movies Data**

In [None]:
def get_movies(url,index):
  """Return the 'Title' and 'Cast and crew' columns of four consecutive tables.

  Parameters
  ----------
  url : str
      Page to read; every HTML table on it is parsed with ``pd.read_html``.
  index : int
      Position of the first table to use. The tables at ``index`` through
      ``index + 3`` are stacked in that order (the four tables together are
      expected to cover January - December).

  Returns
  -------
  pandas.DataFrame
      The four tables concatenated with a fresh 0..n-1 index, restricted to
      the columns ``'Title'`` and ``'Cast and crew'``.

  Raises
  ------
  IndexError
      If the page has fewer than ``index + 4`` tables.
  """
  # Download and parse the page once; the original fetched it four times.
  tables = pd.read_html(url,header=0)

  # Explicit indexing (not slicing) so a page with too few tables still fails loudly.
  selected = [tables[index + offset] for offset in range(4)]

  # concatenating the dataframes (concatenating movies from JAN - DEC)
  return pd.concat(selected,ignore_index=True)[['Title','Cast and crew']]

In [None]:
# American films of 2022: the four tables used start at table index 3 on this page.
movies_2022 = get_movies(
    url='https://en.wikipedia.org/wiki/List_of_American_films_of_2022#',
    index=3,
)

In [None]:
# American films of 2021: the four tables used start at table index 2 on this page.
movies_2021 = get_movies(
    url='https://en.wikipedia.org/wiki/List_of_American_films_of_2021#',
    index=2,
)

In [None]:
# American films of 2020: the four tables used start at table index 2 on this page.
movies_2020 = get_movies(
    url='https://en.wikipedia.org/wiki/List_of_American_films_of_2020#',
    index=2,
)

In [None]:
# American films of 2019: the four tables used start at table index 2 on this page.
movies_2019 = get_movies(
    url='https://en.wikipedia.org/wiki/List_of_American_films_of_2019#',
    index=2,
)

In [None]:
# American films of 2018: the four tables used start at table index 2 on this page.
movies_2018 = get_movies(
    url='https://en.wikipedia.org/wiki/List_of_American_films_of_2018#',
    index=2,
)

In [None]:
# American films of 2017: the four tables used start at table index 2 on this page.
movies_2017 = get_movies(
    url='https://en.wikipedia.org/wiki/List_of_American_films_of_2017#',
    index=2,
)

In [None]:
# American films of 2016: the four tables used start at table index 2 on this page.
movies_2016 = get_movies(
    url='https://en.wikipedia.org/wiki/List_of_American_films_of_2016#',
    index=2,
)

In [None]:
# American films of 2015: the four tables used start at table index 2 on this page.
movies_2015 = get_movies(
    url='https://en.wikipedia.org/wiki/List_of_American_films_of_2015#',
    index=2,
)

In [None]:
# American films of 2014: the four tables used start at table index 2 on this page.
movies_2014 = get_movies(
    url='https://en.wikipedia.org/wiki/List_of_American_films_of_2014#',
    index=2,
)

In [None]:
# American films of 2013: the four tables used start at table index 2 on this page.
movies_2013 = get_movies(
    url='https://en.wikipedia.org/wiki/List_of_American_films_of_2013#',
    index=2,
)

In [None]:
# American films of 2012: the four tables used start at table index 2 on this page.
movies_2012 = get_movies(
    url='https://en.wikipedia.org/wiki/List_of_American_films_of_2012#',
    index=2,
)

In [None]:
# Gather the per-year tables (2022 down to 2012) and stack them into one
# DataFrame with a fresh 0..n-1 index.
lst_movies = [
    movies_2022, movies_2021, movies_2020, movies_2019,
    movies_2018, movies_2017, movies_2016, movies_2015,
    movies_2014, movies_2013, movies_2012,
]
all_movies = pd.concat(lst_movies, ignore_index=True)

In [None]:
# Preview the first rows of the combined table.
all_movies.head()

Unnamed: 0,Title,Cast and crew
0,Morbius,"Daniel Espinosa (director); Matt Sazama, Burk ..."
1,The Bubble,Judd Apatow (director/screenplay); Pam Brady (...
2,The Contractor,Tarik Saleh (director); J. P. Davis (screenpla...
3,Better Nate Than Ever,"Tim Federle (director/screenplay); Rueby Wood,..."
4,Apollo 10 1⁄2: A Space Age Childhood,Richard Linklater (director/screenplay); Glen ...


In [None]:
# Check the size of the combined table as (rows, columns).
all_movies.shape

(2865, 2)

In [None]:
# Count the missing values in each column.
all_movies.isnull().sum()

Title            13
Cast and crew    13
dtype: int64

In [None]:
# Discard every row that has a missing value in any column
# (rebinding the name rather than mutating the frame in place).
all_movies = all_movies.dropna()

In [None]:
# Size of the table after dropping rows with missing values.
all_movies.shape

(2852, 2)

In [None]:
# Count fully duplicated rows. keep=False flags every copy of a repeated row,
# so the total includes the first occurrence as well.
all_movies.duplicated(keep=False).sum()

2

In [None]:
# Collapse exact duplicate rows (same Title and same Cast and crew) down to a
# single copy. keep='first' retains one copy of the film; keep=False would
# delete every copy and drop the film from the dataset altogether.
all_movies = all_movies.drop_duplicates(keep='first').reset_index(drop=True)

In [None]:
# Size of the table after removing duplicate rows.
all_movies.shape

(2850, 2)

In [None]:
# Count rows whose Title occurs more than once (keep=False counts every such row).
all_movies['Title'].duplicated(keep=False).sum()

22

In [None]:
# A title shared by several films is ambiguous when the film is later looked up
# by name, so every row with a repeated title is removed (keep=False drops all copies).
all_movies = (
    all_movies
    .drop_duplicates(subset=['Title'], keep=False)
    .reset_index(drop=True)
)

In [None]:
# Size of the table after removing repeated titles.
all_movies.shape

(2828, 2)

### Now extracting other details of movies through the TMDB API

In [None]:
!pip install tmdbv3api
# install tmdb3api before using it

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tmdbv3api
  Downloading tmdbv3api-1.7.7-py2.py3-none-any.whl (18 kB)
Installing collected packages: tmdbv3api
Successfully installed tmdbv3api-1.7.7


In [None]:
# Set up the tmdbv3api client: register the TMDB API key, then create the
# Movie endpoint object used to look up a film's TMDB id from its name.
from tmdbv3api import Movie, TMDb

tmdb = TMDb()
tmdb.api_key = api_key
movie_details = Movie()

In [None]:
def get_movie_id(movie_name):
    """Return the TMDB id of the first English-language search hit for a title.

    Parameters
    ----------
    movie_name : str
        Title passed to ``movie_details.search``.

    Returns
    -------
    The ``id`` field of the first search result whose ``original_language``
    is ``'en'``, or -1 when the search returns nothing or none of the
    results is English-language.
    """
    # Search once and reuse the results; the original issued the same API
    # request twice per title.
    results = movie_details.search(movie_name)
    if not results:
        return -1
    for result in results:
        if result['original_language'] == 'en':
            return result['id']
    # Results exist but none is English: report "not found" with the same
    # sentinel instead of falling off the end and returning None.
    return -1

In [None]:
# tqdm.pandas() registers progress_apply on pandas objects, so the lookup
# below reports its progress.
from tqdm import tqdm
tqdm.pandas()
# Look up a TMDB id for every title (get_movie_id is called once per row).
all_movies['id'] = all_movies['Title'].progress_apply(get_movie_id)

100%|██████████| 2828/2828 [07:26<00:00,  6.34it/s]


In [None]:
# Keep only rows with a usable TMDB id: drop the -1 "not found" sentinel and
# any missing (None/NaN) id, which the `!= -1` test alone would let through.
has_id = all_movies['id'].notna() & (all_movies['id'] != -1)
all_movies = all_movies[has_id].reset_index(drop=True)
# A missing value turns the column into floats; restore integers so ids are
# formatted as e.g. '603' rather than '603.0' when request URLs are built.
all_movies['id'] = all_movies['id'].astype('int64')

In [None]:
# Number of rows whose TMDB id is shared with another row
# (keep=False counts every row in such a group).
all_movies['id'].duplicated(keep=False).sum()

73

In [None]:
# Remove every row whose TMDB id is shared with another row; keep=False drops
# all copies rather than keeping one of them.
all_movies = (
    all_movies
    .drop_duplicates(subset=['id'], keep=False)
    .reset_index(drop=True)
)

In [None]:
# Size of the table after removing rows with shared TMDB ids.
all_movies.shape

(2743, 3)

#  Get the details of each movie by its TMDB id


In [None]:
def get_other_details(movie_id, timeout=10):
    """Fetch one movie's details from the TMDB API as a one-row DataFrame.

    Parameters
    ----------
    movie_id
        TMDB id of the movie; it is formatted into the request path.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error. Without a timeout a stalled connection would block forever.

    Returns
    -------
    pandas.DataFrame
        The JSON response flattened with ``pd.json_normalize`` (nested objects
        become dotted column names such as ``belongs_to_collection.id``).
    """
    # Pass the query string through `params` so requests does the URL encoding.
    response = requests.get(
        'https://api.themoviedb.org/3/movie/{}'.format(movie_id),
        params={'api_key': api_key, 'language': 'en-US'},
        timeout=timeout,
    )
    return pd.json_normalize(response.json())

In [None]:
# Start from an empty details frame; it is populated from the TMDB responses below.
other_details = pd.DataFrame()

In [None]:
# Fetch every movie's details first, then concatenate once. Calling pd.concat
# inside the loop re-copies the growing frame on each iteration (quadratic time).
detail_frames = [get_other_details(i) for i in tqdm(all_movies['id'])]
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
other_details = pd.concat(detail_frames) if detail_frames else pd.DataFrame()

100%|██████████| 2743/2743 [07:30<00:00,  6.09it/s]


In [None]:
# List the fields returned by TMDB to decide which ones to keep.
other_details.columns

Index(['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'belongs_to_collection.id',
       'belongs_to_collection.name', 'belongs_to_collection.poster_path',
       'belongs_to_collection.backdrop_path'],
      dtype='object')

In [None]:
# TMDB fields that are not needed in the final dataset.
unused_columns = [
    'adult', 'backdrop_path', 'belongs_to_collection',
    'budget', 'homepage', 'production_companies',
    'production_countries', 'revenue', 'spoken_languages', 'status',
    'tagline', 'original_title',
    'video', 'belongs_to_collection.id',
    'belongs_to_collection.name', 'belongs_to_collection.poster_path',
    'belongs_to_collection.backdrop_path',
]
other_details = other_details.drop(columns=unused_columns)

In [None]:
# Size of the trimmed TMDB details table.
other_details.shape

(2743, 12)

In [None]:
# Size of the movies table, for comparison with the details table before merging.
all_movies.shape

(2743, 3)

In [None]:
# Attach the TMDB details to each film by TMDB id (merge's default inner join),
# then discard any row that still has a missing value and renumber the index.
merged = all_movies.merge(other_details, on='id')
all_movies = merged.dropna().reset_index(drop=True)

In [None]:
# Size of the table after merging and dropping incomplete rows.
all_movies.shape

(2732, 14)

## Extracting Director and Cast names from 'Cast and crew' column

In [None]:
# Captures everything that precedes "(director" on a line of a credits string.
# Raw string so that "\(" reaches the regex engine as an escaped parenthesis
# instead of being an invalid string escape.
pattern1 = r"(.+)\(director"

def get_director(text):
   """Return the director part of a 'Cast and crew' string.

   Parameters
   ----------
   text : str
       Credits text such as ``"Jane Doe (director); A, B"``.

   Returns
   -------
   str
       The text found before ``"(director"``, with surrounding whitespace
       removed; multiple matches are joined with a single space. An empty
       string is returned when ``text`` contains no ``"(director"``.
   """
   matches = re.findall(pattern1,text)
   # The capture includes the space before "(director"; strip it so the name is clean.
   return " ".join(match.strip() for match in matches)

In [None]:
# Derive the 'Director' column by applying get_director to each credits string.
all_movies['Director'] = all_movies['Cast and crew'].apply(get_director)

In [None]:
# The cast is taken to be the text after the last ';' of the credits string,
# with surrounding whitespace removed.
credit_parts = all_movies['Cast and crew'].str.split(';')
all_movies['Cast'] = credit_parts.str[-1].str.strip()

In [None]:
# 'Cast and crew' has been split into 'Director' and 'Cast', so the original
# column is no longer needed.
all_movies = all_movies.drop(columns=['Cast and crew'])

In [None]:
# Size of the table after replacing 'Cast and crew' with 'Director' and 'Cast'.
all_movies.shape

(2732, 15)

In [None]:
# To concatenate easily with the Bollywood movies dataset, the "Director" and "Cast" columns need to be moved so that they come right after "Title".

In [None]:
# Take the current column order as a plain list so it can be rearranged.
lst_cls = list(all_movies.columns)

In [None]:
# Show the column order before rearranging.
lst_cls

['Title',
 'id',
 'genres',
 'imdb_id',
 'original_language',
 'overview',
 'popularity',
 'poster_path',
 'release_date',
 'runtime',
 'title',
 'vote_average',
 'vote_count',
 'Director',
 'Cast']

In [None]:
# Move the last column name ('Cast' at this point) to position 1, right after 'Title'.
# NOTE: not idempotent - re-running this cell moves yet another column.
x = lst_cls.pop()
lst_cls.insert(1,x)

In [None]:
# Move the new last column name ('Director') to position 1 as well, so the
# order becomes Title, Director, Cast, followed by the remaining columns.
# NOTE: not idempotent - re-running this cell moves yet another column.
x = lst_cls.pop()
lst_cls.insert(1,x)

In [None]:
# Reorder the DataFrame's columns to match the rearranged list.
all_movies = all_movies[lst_cls]

## There are some missing values which cannot be caught by the .isnull() method

### *That's why we have to check manually whether any object-type feature contains empty strings*

In [None]:
# Inspect the dtype and non-null count of every column.
all_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2848 entries, 0 to 2847
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              2848 non-null   object 
 1   Director           2848 non-null   object 
 2   Cast               2848 non-null   object 
 3   id                 2848 non-null   int64  
 4   genres             2848 non-null   object 
 5   imdb_id            2848 non-null   object 
 6   original_language  2848 non-null   object 
 7   overview           2848 non-null   object 
 8   popularity         2848 non-null   float64
 9   poster_path        2848 non-null   object 
 10  release_date       2848 non-null   object 
 11  runtime            2848 non-null   int64  
 12  title              2848 non-null   object 
 13  vote_average       2848 non-null   float64
 14  vote_count         2848 non-null   int64  
dtypes: float64(2), int64(3), object(10)
memory usage: 333.9+ KB


In [None]:
# Count rows whose Director is an empty string (get_director found no match).
(all_movies['Director']=='').sum()

3

In [None]:
# Keep only the rows that have a non-empty Director, then renumber the index.
has_director = all_movies['Director'].ne('')
all_movies = all_movies.loc[has_director].reset_index(drop=True)

In [None]:
# Count rows whose Cast is an empty string.
(all_movies['Cast']=='').sum()

4

In [None]:
# Keep only the rows that have a non-empty Cast, then renumber the index.
has_cast = all_movies['Cast'].ne('')
all_movies = all_movies.loc[has_cast].reset_index(drop=True)

# ***Hollywood Dataset is Ready***

In [None]:
# Write the finished dataset to a CSV file in the working directory.
# NOTE(review): the row index is written too, as an unnamed first column; pass
# index=False if downstream readers do not need it - confirm before changing.
all_movies.to_csv("HollyWood_Movies1.csv")

In [None]:
# Final size of the exported dataset.
all_movies.shape

(2725, 15)