In [1]:
!pip install -q tmdbv3api

In [2]:
import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

In [3]:
links = ["https://en.wikipedia.org/wiki/List_of_American_films_of_2018",
        'https://en.wikipedia.org/wiki/List_of_American_films_of_2019',
        'https://en.wikipedia.org/wiki/List_of_American_films_of_2020',
        'https://en.wikipedia.org/wiki/List_of_American_films_of_2021',
        'https://en.wikipedia.org/wiki/List_of_American_films_of_2022',
        'https://en.wikipedia.org/wiki/List_of_American_films_of_2023',
        'https://en.wikipedia.org/wiki/List_of_American_films_of_2024']

df_list = [pd.read_html(link, header=0)[i] for i in range(2,6) for link in links]
df = pd.concat(df_list, ignore_index=True)
df = df[['Title', 'Cast and crew']]
df

Unnamed: 0,Title,Cast and crew
0,Insidious: The Last Key,Adam Robitel (director); Leigh Whannell (scree...
1,The Strange Ones,Christopher Radcliff (director/screenplay); La...
2,The Commuter,Jaume Collet-Serra (director); Byron Willinger...
3,Proud Mary,"Babak Najafi (director); John S. Newman, Chris..."
4,Acts of Violence,Brett Donowho (director); Nicolas Aaron Mezzan...
...,...,...
2268,Nosferatu,Robert Eggers (director/screenplay); Bill Skar...
2269,A Complete Unknown,James Mangold (director/screenplay); Jay Cocks...
2270,The Fire Inside,Rachel Morrison (director); Barry Jenkins (scr...
2271,Babygirl,Halina Reijn (director/screenplay); Nicole Kid...


In [6]:
import json
import requests
from tmdbv3api import Movie

api_key='YOUR_API_KEY'
tmdb_movie = Movie()
tmdb_movie.api_key=api_key
def get_genre(x):
    genres = []
    result = tmdb_movie.search(x)
    if len(result['results'])==0:
        return np.NaN

    movie_id = result[0].id
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,api_key))
    data_json = response.json()
    if data_json['genres']:
        for i in range(0,len(data_json['genres'])):
            genres.append(data_json['genres'][i]['name'])
        return genres
    else:
        []

df.loc[:, 'genres'] = df['Title'].progress_apply(lambda x: get_genre(str(x)))
df['genres']

100%|██████████| 2273/2273 [05:55<00:00,  6.39it/s]


Unnamed: 0,genres
0,"[Horror, Thriller]"
1,"[Drama, Mystery]"
2,"[Action, Thriller, Mystery]"
3,"[Thriller, Action, Crime]"
4,"[Action, Crime, Thriller]"
...,...
2268,"[Drama, Fantasy, Horror]"
2269,"[Drama, Music, History]"
2270,[Drama]
2271,[Drama]


In [7]:
df=df.dropna(how='any')

In [8]:
import re

def get_director(x):
    pattern = r'([A-Za-z\s-]+)\s*\(director[^\)]*\)'
    directors = re.findall(pattern, x)
    return directors

df.loc[:, 'director'] = df['Cast and crew'].progress_apply(lambda x: get_director(x))
df['director']

100%|██████████| 2261/2261 [00:00<00:00, 20656.51it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'director'] = df['Cast and crew'].progress_apply(lambda x: get_director(x))


Unnamed: 0,director
0,[Adam Robitel ]
1,"[Christopher Radcliff , Lauren Wolkstein ]"
2,[Jaume Collet-Serra ]
3,[Babak Najafi ]
4,[Brett Donowho ]
...,...
2267,[Ben Smallbone ]
2268,[Robert Eggers ]
2269,[James Mangold ]
2270,[Rachel Morrison ]


In [9]:
def get_actors(x):
     actors = x.split("); ")[-1].split(", ")
     return actors[:3]

df.loc[:, 'actors'] = df['Cast and crew'].progress_apply(lambda x: get_actors(x))
df['actors']

100%|██████████| 2261/2261 [00:00<00:00, 244465.90it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'actors'] = df['Cast and crew'].progress_apply(lambda x: get_actors(x))


Unnamed: 0,actors
0,"[Lin Shaye, Angus Sampson, Leigh Whannell]"
1,"[Alex Pettyfer, James Freedson-Jackson, Emily ..."
2,"[Liam Neeson, Vera Farmiga, Patrick Wilson]"
3,"[Taraji P. Henson, Jahi Di'Allo Winston, Billy..."
4,"[Bruce Willis, Cole Hauser, Shawn Ashmore]"
...,...
2267,"[Neal McDonough, Dawn Olivieri, Currie Graham]"
2268,"[Bill Skarsgård, Nicholas Hoult, Lily-Rose Depp]"
2269,"[Timothée Chalamet, Edward Norton, Elle Fanning]"
2270,"[Ryan Destiny, Brian Tyree Henry, Judy Greer]"


In [10]:
df['Title']=df['Title'].str.lower()
df=df[['Title','genres','director','actors']]
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title']=df['Title'].str.lower()


Unnamed: 0,Title,genres,director,actors
0,insidious: the last key,"[Horror, Thriller]",[Adam Robitel ],"[Lin Shaye, Angus Sampson, Leigh Whannell]"
1,the strange ones,"[Drama, Mystery]","[Christopher Radcliff , Lauren Wolkstein ]","[Alex Pettyfer, James Freedson-Jackson, Emily ..."
2,the commuter,"[Action, Thriller, Mystery]",[Jaume Collet-Serra ],"[Liam Neeson, Vera Farmiga, Patrick Wilson]"
3,proud mary,"[Thriller, Action, Crime]",[Babak Najafi ],"[Taraji P. Henson, Jahi Di'Allo Winston, Billy..."
4,acts of violence,"[Action, Crime, Thriller]",[Brett Donowho ],"[Bruce Willis, Cole Hauser, Shawn Ashmore]"
...,...,...,...,...
2267,homestead,"[Action, Drama, Thriller]",[Ben Smallbone ],"[Neal McDonough, Dawn Olivieri, Currie Graham]"
2268,nosferatu,"[Drama, Fantasy, Horror]",[Robert Eggers ],"[Bill Skarsgård, Nicholas Hoult, Lily-Rose Depp]"
2269,a complete unknown,"[Drama, Music, History]",[James Mangold ],"[Timothée Chalamet, Edward Norton, Elle Fanning]"
2270,the fire inside,[Drama],[Rachel Morrison ],"[Ryan Destiny, Brian Tyree Henry, Judy Greer]"


In [11]:
df.to_csv('processed_wiki.csv',index=False)