In [1]:
import numpy as np
import pandas as pd

In [2]:
# create empty dataframe
main_df = pd.DataFrame()

# we need to calculate from year 2017-present. upto 2016 data is already present
for year in range(2017, 2023):
    link = 'https://en.wikipedia.org/wiki/List_of_American_films_of_' + str(year)
    
    df1 = pd.read_html(link, header=0)[2]
    df2 = pd.read_html(link, header=0)[3]
    df3 = pd.read_html(link, header=0)[4]
    df4 = pd.read_html(link, header=0)[5]
    
    # append all these
    df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)
    
    # append the final dataframe to the main dataframe
    main_df = main_df.append(df, ignore_index=True)

In [3]:
# let's keep only the title and cast-crew columns
main_df = main_df[['Title', 'Cast and crew']]
main_df = main_df.dropna(how='any')
main_df

Unnamed: 0,Title,Cast and crew
0,Underworld: Blood Wars,Anna Foerster (director); Cory Goodman (screen...
1,Arsenal,Steven C. Miller (director); Jason Mosberg (sc...
2,Between Us,Rafael Palacio Illingworth (director/screenpla...
3,Monster Trucks,Chris Wedge (director); Derek Connolly (screen...
4,The Bye Bye Man,Stacy Title (director); Jonathan Penner (scree...
...,...,...
1434,"Are You There God? It's Me, Margaret",Kelly Fremon Craig (director/screenplay); Abby...
1435,Distant,Josh Gordon & Will Speck (directors); Spenser ...
1436,Don't Worry Darling,"Olivia Wilde (director); Katie Silberman, Care..."
1437,Puss in Boots: The Last Wish,"Joel Crawford (director); Antonio Banderas, Sa..."


In [4]:
# get genre, original_language and overview
from tmdbv3api import Movie, TMDb
import requests
import json

tmdb = TMDb()
tmdb.api_key = '8cb85690b8c1dfe9317de8b51bafc493'
tmdb_movie = Movie()

def get_respose_json(x):
    result = tmdb_movie.search(x)
    if result == []:
        return {}
    movie_id = result[0].id
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id, tmdb.api_key))
    data_json = response.json()
    return data_json

def get_genre(x):
    data_json = get_respose_json(x)
    genres = []
    if "genres" in data_json:
        genre_str = " "
        for i in range(0,len(data_json['genres'])):
            genres.append(data_json['genres'][i]['name'])
        return genre_str.join(genres)
    else:
        return np.NaN

def get_overview(x):
    data_json = get_respose_json(x)
    return data_json.get('overview')

def get_original_lang(x):
    data_json = get_respose_json(x)
    return data_json.get('original_language')

def get_id(x):
    data_json = get_respose_json(x)
    return data_json.get('id')


main_df['genres'] = main_df['Title'].map(lambda x: get_genre(str(x)))
main_df['overview'] = main_df['Title'].map(lambda x: get_overview(str(x)))
main_df['original_language'] = main_df['Title'].map(lambda x: get_original_lang(str(x)))
main_df['movie_id'] = main_df['Title'].map(lambda x: get_id(x))

In [5]:
temp = main_df.head(500)
temp

Unnamed: 0,Title,Cast and crew,genres,overview,original_language,movie_id
0,Underworld: Blood Wars,Anna Foerster (director); Cory Goodman (screen...,Fantasy Action Thriller Horror,Vampire death dealer Selene fends off brutal a...,en,346672.0
1,Arsenal,Steven C. Miller (director); Jason Mosberg (sc...,Thriller,After the deadbeat brother of a businessman is...,en,388202.0
2,Between Us,Rafael Palacio Illingworth (director/screenpla...,Romance Adventure Drama,"Stranded after a tragic plane crash, two stran...",en,290512.0
3,Monster Trucks,Chris Wedge (director); Derek Connolly (screen...,Action Comedy Science Fiction,Tripp is a high school senior with a knack for...,en,262841.0
4,The Bye Bye Man,Stacy Title (director); Jonathan Penner (scree...,Horror Thriller,When three college students move into an old h...,en,292280.0
...,...,...,...,...,...,...
495,The Possession of Hannah Grace,Diederik van Rooijen (director); Brian Sieve (...,Horror Drama,When a cop who is just out of rehab takes the ...,en,434555.0
496,Anna and the Apocalypse,"John McPhail (director); Alan McDonald, Ryan M...",Horror Comedy Fantasy,A zombie apocalypse threatens the sleepy town ...,en,461928.0
497,Capernaum,Nadine Labaki (director/screenplay); Zain Al R...,Drama,"Zain, a 12-year-old boy scrambling to survive ...",ar,517814.0
498,Mary Queen of Scots,Josie Rourke (director); Beau Willimon (screen...,Drama History,"In 1561, Mary Stuart, widow of the King of Fra...",en,457136.0


In [6]:
len(main_df)

1419

In [7]:
from copy import deepcopy
X = deepcopy(temp['Cast and crew'][0])
print(X)
Y = deepcopy(temp['Cast and crew'][497])
Y

Anna Foerster (director); Cory Goodman (screenplay); Kate Beckinsale, Theo James, Lara Pulver, James Faulkner, Charles Dance


'Nadine Labaki (director/screenplay); Zain Al Rafeea, Yordanos Shiferaw, Boluwatife Treasure Bankole, Kawthar Al Haddad'

In [8]:
from copy import deepcopy

# # rename the cast and crew to only cast.
# # cast contains actors/actress names in the leading roles.

main_df.rename(columns = {'Cast and crew': 'cast'}, inplace = True)

def get_director(x):
    X = deepcopy(x)
    if "(director)" in x:
        return X.split(" (director); ")[0]
    elif "(directors)" in x:
        return X.split(" (directors); ")[0]
    else:
        return X.split(" (director/screenplay); ")[0]

main_df['crew'] = main_df['cast'].apply(lambda x: get_director(x))

def get_cast(x):
    X = deepcopy(x)
    if "(director/screenplay)" in X:
        try:
            return X.split(' (director/screenplay); ')[1].split(', ')[0]
        except:
            return ""
    if "(director)" in X:
        try:
            X = X.split(" (director); ")[1]
        except:
            return ""
    if "(directors)" in X:
        try:
            X = X.split(" (directors); ")[1]
        except:
            return ""
    try:
        return X.split(' (screenplay); ')[1].split(", ")[0]
    except:
        return ""

main_df['cast'] = main_df['cast'].apply(lambda x: get_cast(x))

In [9]:
main_df

Unnamed: 0,Title,cast,genres,overview,original_language,movie_id,crew
0,Underworld: Blood Wars,Kate Beckinsale,Fantasy Action Thriller Horror,Vampire death dealer Selene fends off brutal a...,en,346672.0,Anna Foerster
1,Arsenal,Adrian Grenier,Thriller,After the deadbeat brother of a businessman is...,en,388202.0,Steven C. Miller
2,Between Us,Olivia Thirlby,Romance Adventure Drama,"Stranded after a tragic plane crash, two stran...",en,290512.0,Rafael Palacio Illingworth
3,Monster Trucks,Lucas Till,Action Comedy Science Fiction,Tripp is a high school senior with a knack for...,en,262841.0,Chris Wedge
4,The Bye Bye Man,Douglas Smith,Horror Thriller,When three college students move into an old h...,en,292280.0,Stacy Title
...,...,...,...,...,...,...,...
1434,"Are You There God? It's Me, Margaret",Abby Ryder Fortson,Drama Comedy,When her family moves from the city to the sub...,en,555285.0,Kelly Fremon Craig
1435,Distant,Anthony Ramos,Western,"After destroying a Seminole fort, American sol...",en,61391.0,Josh Gordon & Will Speck
1436,Don't Worry Darling,Florence Pugh,Drama Mystery Thriller,A housewife living in a utopian community in t...,en,619730.0,Olivia Wilde
1437,Puss in Boots: The Last Wish,,Animation Adventure Comedy Family Fantasy Drama,Puss in Boots discovers that his passion for a...,en,315162.0,Joel Crawford


In [10]:
# create the final tag column on which basis we will prepare our model.
main_df['final_tag'] = main_df['overview'] + main_df['original_language'] 

In [11]:
final_df = main_df[['Title', 'cast', 'crew', 'genres', 'final_tag', 'movie_id']]
final_df

Unnamed: 0,Title,cast,crew,genres,final_tag,movie_id
0,Underworld: Blood Wars,Kate Beckinsale,Anna Foerster,Fantasy Action Thriller Horror,Vampire death dealer Selene fends off brutal a...,346672.0
1,Arsenal,Adrian Grenier,Steven C. Miller,Thriller,After the deadbeat brother of a businessman is...,388202.0
2,Between Us,Olivia Thirlby,Rafael Palacio Illingworth,Romance Adventure Drama,"Stranded after a tragic plane crash, two stran...",290512.0
3,Monster Trucks,Lucas Till,Chris Wedge,Action Comedy Science Fiction,Tripp is a high school senior with a knack for...,262841.0
4,The Bye Bye Man,Douglas Smith,Stacy Title,Horror Thriller,When three college students move into an old h...,292280.0
...,...,...,...,...,...,...
1434,"Are You There God? It's Me, Margaret",Abby Ryder Fortson,Kelly Fremon Craig,Drama Comedy,When her family moves from the city to the sub...,555285.0
1435,Distant,Anthony Ramos,Josh Gordon & Will Speck,Western,"After destroying a Seminole fort, American sol...",61391.0
1436,Don't Worry Darling,Florence Pugh,Olivia Wilde,Drama Mystery Thriller,A housewife living in a utopian community in t...,619730.0
1437,Puss in Boots: The Last Wish,,Joel Crawford,Animation Adventure Comedy Family Fantasy Drama,Puss in Boots discovers that his passion for a...,315162.0


In [12]:
final_df.to_csv('Data_2017-2022.csv', index=False)