## IMDB Scrape
- pull film metadata from IMDB
    - MPAA Rating
    - Genres

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import re

### load titles

In [3]:
wiki_titles = pd.read_excel('wiki_film_releases.xlsx')

print(wiki_titles.shape)
wiki_titles.head(3)

(1471, 8)


Unnamed: 0,title,title_id,release_date,distributor,wikipedia_url,budget,opening_locs,franchise
0,Weapons,1471,2025-08-08,WB,https://en.wikipedia.org/wiki/Weapons_(2025_film),,,
1,"Bad Guys 2, The",1470,2025-08-01,UNI,https://en.wikipedia.org/wiki/The_Bad_Guys_2,,,
2,How to Train Your Dragon,1469,2025-06-13,UNI,https://en.wikipedia.org/wiki/How_to_Train_You...,,,


In [4]:
wiki_titles = wiki_titles[['title','distributor','wikipedia_url','title_id',
                           'release_date','franchise']]

wiki_titles['release_date'] = pd.to_datetime(wiki_titles['release_date'])

wiki_titles.rename(columns={'wikipedia_url':'wiki_url'}, inplace=True)

## ignore titles w/o wiki url
wiki_titles.dropna(subset=['wiki_url'], inplace=True)
wiki_titles = wiki_titles[wiki_titles['wiki_url'] != '']

print(wiki_titles.shape)
print(wiki_titles['title'].nunique())
wiki_titles.head()

(1452, 6)
1414


Unnamed: 0,title,distributor,wiki_url,title_id,release_date,franchise
0,Weapons,WB,https://en.wikipedia.org/wiki/Weapons_(2025_film),1471,2025-08-08,
1,"Bad Guys 2, The",UNI,https://en.wikipedia.org/wiki/The_Bad_Guys_2,1470,2025-08-01,
2,How to Train Your Dragon,UNI,https://en.wikipedia.org/wiki/How_to_Train_You...,1469,2025-06-13,
3,Bring Her Back,A24,https://en.wikipedia.org/wiki/Bring_Her_Back,1468,2025-05-30,
4,Lilo & Stitch,DIS,https://en.wikipedia.org/wiki/Lilo_%26_Stitch_...,1467,2025-05-23,


In [78]:
wiki_titles['title'] = wiki_titles['title'].astype(str)

In [375]:
### swap the from end to beginning of text
def clean_title_endswith(title):
    title = str(title).strip()
    if title.endswith(', The'):
        return 'The ' + title[:-5].strip()
    elif title.endswith(', A'):
        return 'A ' + title[:-3].strip()
    else:
        return title

In [377]:
wiki_titles['title'] = wiki_titles['title'].apply(lambda x: clean_title_endswith(x))

In [378]:
wiki_df = wiki_titles.copy()

In [379]:
wiki_df['title'].nunique()

1414

In [380]:
wiki_df[wiki_df['title'].isna()]

Unnamed: 0,title,distributor,wiki_url,title_id,release_date,franchise


In [381]:
### filter to titles released prior to cutoff date
wiki_df = wiki_df[wiki_df['release_date'] > '2015-07-01']

print(wiki_df.shape)
print(wiki_df.title.nunique())

(1394, 6)
1356


### pull film metadata with OMDB api

In [209]:
import requests
import json
from fake_useragent import UserAgent
from PyMovieDb import IMDB


In [206]:
ua = UserAgent()
user_agent = ua.random

m1 = requests.get(f"http://www.omdbapi.com/?apikey={api_key}&t='Jane Austen Wrecked My Life'&type=movie&r=json",
               headers ={'User-Agent':user_agent})

In [396]:
def get_omdb_api_film_meta(df):
    
    ua = UserAgent()
    user_agent = ua.random
    
    film_meta = pd.DataFrame()

    ## get title
    titles_ = df['title'].unique().tolist()
    
    for t in titles_:
        print(t)
        tmp = df[df['title'] == t][['title','title_id','release_date']].drop_duplicates()
        
        year = pd.to_datetime(tmp['release_date']).dt.year.values[0]
#         print(year)
        
        ## remove comma
        t = re.sub("'|,","",t)
    
        tmp_meta = requests.get(f'http://www.omdbapi.com/?apikey={api_key}&t="{t}"&y={year}&type=movie&r=json',
                                headers ={'User-Agent':user_agent})
        
        
        try:
            tmp_meta = pd.DataFrame.from_dict(json.loads(tmp_meta.content))[
                ['Title','Year','Rated','Released','Runtime','Genre','imdbID']].drop_duplicates()
            tmp_meta['title'] = t
            tmp_meta['title_id'] = tmp.iloc[0]['title_id']
            
        except Exception as e:
            print(e)
            tmp_meta = pd.DataFrame()
            
        film_meta = pd.concat([film_meta, tmp_meta])
        
    return film_meta



def get_imdb_api_film_id(df, with_year):
    
    film_meta = pd.DataFrame()
    imdb = IMDB()

    ## get title
    titles_ = df['title'].unique().tolist()
    
    for t in titles_:
        print(t)
        tmp = df[df['title'] == t][['title','title_id','release_date']].drop_duplicates()
        
        ## remove comma
        t = re.sub("'|,","",t)
        
        if with_year:
            year = int(pd.to_datetime(tmp['release_date']).dt.year.values[0])
#         print(year)

            tmp_search = imdb.search(t, year)
        else:
            tmp_search = imdb.search(t)
        
        try:
            tmp_meta = pd.DataFrame(imdb.search_results['results']).drop_duplicates()
            tmp_meta['title'] = t
            tmp_meta['title_id'] = tmp.iloc[0]['title_id']
            
        except Exception as e:
            print(e)
            tmp_meta = pd.DataFrame()
            
        film_meta = pd.concat([film_meta, tmp_meta])
        
    return film_meta


def get_imdb_api_film_meta(df):
    
    film_meta = pd.DataFrame()
    imdb = IMDB()

    ## get imdb_id
    ids_ = df['imdb_id'].unique().tolist()
    
    for id_ in ids_:
        print(id_)
        tmp = df[df['imdb_id'] == id_][['title','imdb_id','title_id']].drop_duplicates()
        tmp_meta = imdb.get_by_id(id_)
        
        try:            
            tmp_meta = pd.DataFrame([json.loads(tmp_meta)])
            tmp_meta['title'] = tmp.iloc[0]['title']
            tmp_meta['title_id'] = tmp.iloc[0]['title_id']
            tmp_meta['imdb_id'] = tmp.iloc[0]['imdb_id']
            
        except Exception as e:
            print(e)
            tmp_meta = pd.DataFrame()
            
        film_meta = pd.concat([film_meta, tmp_meta])
        
    return film_meta

In [385]:
wiki_df.head(5)

Unnamed: 0,title,distributor,wiki_url,title_id,release_date,franchise
0,Weapons,WB,https://en.wikipedia.org/wiki/Weapons_(2025_film),1471,2025-08-08,
1,The Bad Guys 2,UNI,https://en.wikipedia.org/wiki/The_Bad_Guys_2,1470,2025-08-01,
2,How to Train Your Dragon,UNI,https://en.wikipedia.org/wiki/How_to_Train_You...,1469,2025-06-13,
3,Bring Her Back,A24,https://en.wikipedia.org/wiki/Bring_Her_Back,1468,2025-05-30,
4,Lilo & Stitch,DIS,https://en.wikipedia.org/wiki/Lilo_%26_Stitch_...,1467,2025-05-23,


### pull imdb ids

In [706]:
imdb_ids = get_imdb_api_film_id(wiki_df, with_year=False)

Weapons
The Bad Guys 2
How to Train Your Dragon
Bring Her Back
Lilo & Stitch
Final Destination: Bloodlines
Jane Austen Wrecked My Life
Friendship
Shadow Force
The Surfer
Thunderbolts*
On Swift Horses
Until Dawn
The Accountant 2
Sinners
The Legend of Ochi
The Wedding Banquet
Warfare
Drop
The Amateur
A Minecraft Movie
Hell of a Summer
The Penguin Lessons
A Working Man
The Ballad of Wallis Island
The Friend
The Woman in the Yard
Death of a Unicorn
The Alto Knights
Bob Trevino Likes It
Disney's Snow White
Opus
The Actor
Black Bag
Novocaine
On Becoming a Guinea Fowl
Seven Veils
Mickey 17
Riff Raff
Superboys of Malegaon
Last Breath
The Unbreakable Boy
The Monkey
Captain America: Brave New World
Paddington in Peru
Parthenope
Heart Eyes
Love Hurts
Companion
Dog Man
Love Me
Presence
Flight Risk
I'm Still Here
One of Them Days
Wolf Man
Den of Thieves 2: Pantera
Hard Truths
The Last Showgirl
2073
Babygirl
The Fire Inside
Nosferatu
Better Man
A Complete Unknown
The Room Next Door
The Brutalist
Son

Cats
Superintelligence
Star Wars: The Rise Of Skywalker
Bombshell
Black Christmas
Richard Jewell
Jumanji: The Next Level
A Hidden Life
Portrait Of A Lady On Fire
Little Joe
Brahms: The Boy II
In Fabric
Knives Out
Queen & Slim
Frozen 2
Ford v. Ferrari
Charlie's Angels
Waves
Honey Boy
Doctor Sleep
Harriet
Terminator: Dark Fate
Black And Blue
Synonyms
Frankie
The Last Full Measure
Bad Trip
The Aeronauts
Jojo Rabbit
The Lighthouse
Maleficent: Mistress Of Evil
Zombieland 2: Double Tap
Greener Grass
By The Grace Of God (Grace a Dieu)
Jay & Silent Bob Reboot
Mister America
The Sky Is Pink
Gemini Man
The Addams Family
Jexi
Parasite
Matthias & Maxime
Joker
Pain And Glory
The Current War
Wrinkles The Clown
Low Tide
Abominable
The Death of Dick Long
The Day Shall Come
The Hunt
21 Bridges
Judy
The Golden Glove
Out Of Liberty
Villains
Ad Astra
The Zoya Factor
The Wedding Year
Rambo: Last Blood
Downton Abbey
Corporate Animals
Another Day Of Life
Dream Girl
Depraved
Monos
Can You Keep A Secret?
The S

Annabelle: Creation
The Glass Castle
Ingrid Goes West
Good Time
The Trip To Spain
Kidnap
Step
Wind River
Inconvenient Sequel: Truth To Power, An
Menashe
Detroit
Atomic Blonde
Girls Trip
Dunkirk
Wish Upon
War For The Planet Of The Apes
Endless Poetry
Spider-Man: Homecoming
A Ghost Story
Despicable Me 3
The Little Hours
The House
Baby Driver
The Beguiled
The Big Sick
Transformers: The Last Knight
Journey, The (2017)
The Book Of Henry
All Eyez On Me
Cars 3
Rough Night
My Cousin Rachel
Megan Leavey
Beatriz At Dinner
It Comes At Night
3 Idiotas
Wonder Woman
The Exception
Captain Underpants: The First Epic Movie
Band Aid
Dean
Buena Vista Social Club: Adios
Pirates Caribbean:Dead Men Tell No Tales
Baywatch
Diary Of A Wimpy Kid: The Long Haul
Alien: Covenant
Wakefield
Everything, Everything
Snatched
King Arthur: Legend Of The Sword
The Lovers
Guardians Of The Galaxy Vol. 2
Chuck
How To Be A Latin Lover
Born In China
Free Fire
Unforgettable
The Lost City Of Z
Gifted
Smurfs: The Lost Village
Goi

In [707]:
print(imdb_ids.shape)
print(imdb_ids['title_id'].nunique())
imdb_ids.head()

(5283, 6)
1349


Unnamed: 0,id,name,url,poster,title,title_id
0,tt26581740,"Weapons 2025 Toby Huss, Julia Garner",https://www.imdb.com/title/tt26581740/?ref_=fn...,https://m.media-amazon.com/images/M/MV5BNTBhNW...,Weapons,1471
1,tt0497470,"Weapons 2007 Nick Cannon, Paul Dano",https://www.imdb.com/title/tt0497470/?ref_=fn_...,https://m.media-amazon.com/images/M/MV5BMTk1ND...,Weapons,1471
2,tt0069952,"Deadly Weapons 1974 Chesty Morgan, Harry Reems",https://www.imdb.com/title/tt0069952/?ref_=fn_...,https://m.media-amazon.com/images/M/MV5BNjAzOT...,Weapons,1471
3,tt0089983,"Secret Weapons 1985 TV Movie Sally Kellerman, ...",https://www.imdb.com/title/tt0089983/?ref_=fn_...,https://m.media-amazon.com/images/M/MV5BNjMxN2...,Weapons,1471
4,tt0188083,"Weapons of Death 1977 Leonard Mann, Ida Galli",https://www.imdb.com/title/tt0188083/?ref_=fn_...,https://m.media-amazon.com/images/M/MV5BOWUyMG...,Weapons,1471


In [708]:
### keep first row...best match
### anything incorrect will search again
imdb_ids.drop_duplicates('title_id', keep='first', inplace=True)

In [709]:
imdb_ids.rename(columns={'id':'imdb_id'}, inplace=True)

In [710]:
imdb_ids = imdb_ids[['imdb_id','name','title','title_id']]

In [711]:
print(imdb_ids.shape)
print(imdb_ids['title_id'].nunique())
imdb_ids.head()

(1349, 4)
1349


Unnamed: 0,imdb_id,name,title,title_id
0,tt26581740,"Weapons 2025 Toby Huss, Julia Garner",Weapons,1471
0,tt30017619,"The Bad Guys 2 2025 Sam Rockwell, Marc Maron",The Bad Guys 2,1470
0,tt26743210,"How to Train Your Dragon 2025 Mason Thames, Ni...",How to Train Your Dragon,1469
0,tt32246771,"Bring Her Back 2025 Billy Barratt, Sally Hawkins",Bring Her Back,1468
0,tt11655566,"Lilo & Stitch 2025 Billy Magnussen, Hannah Wad...",Lilo & Stitch,1467


In [712]:
imdb_ids.tail(50)

Unnamed: 0,imdb_id,name,title,title_id
0,tt3077214,"Suffragette 2015 Carey Mulligan, Anne-Marie Duff",Suffragette,108
0,tt17720272,"Goosebumps 2023– TV Series Zack Morris, Isa Br...",Goosebumps,107
0,tt3682448,"Bridge of Spies 2015 Tom Hanks, Mark Rylance",Bridge Of Spies,106
0,tt2554274,"Crimson Peak 2015 Mia Wasikowska, Jessica Chas...",Crimson Peak,105
0,tt3170832,"Room 2015 Brie Larson, Jacob Tremblay",Room,104
0,tt2080374,"Steve Jobs 2015 Michael Fassbender, Kate Winslet",Steve Jobs,103
0,tt3332064,"Pan 2015 Levi Miller, Hugh Jackman",Pan,102
0,tt21638826,"Dope Thief 2025– TV Series Brian Tyree Henry, ...",Ladrones,101
0,tt3659388,"The Martian 2015 Matt Damon, Jessica Chastain",The Martian,100
0,tt3397884,"Sicario 2015 Emily Blunt, Josh Brolin",Sicario,99


In [723]:
## which ids are missing....search is poor for short titles

wiki_df[~wiki_df['title_id'].isin(imdb_ids['title_id'])]

Unnamed: 0,title,distributor,wiki_url,title_id,release_date,franchise
339,65,Sony,https://en.wikipedia.org/wiki/65_(film),1132,2023-03-10,0.0
372,EO,Sideshow,https://en.wikipedia.org/wiki/EO_(film),1099,2022-11-18,
562,Blumhouse Fantasy Island Horror,Sony,https://en.wikipedia.org/wiki/Fantasy_Island_(...,909,2020-02-14,
650,And the Birds Rained Down (Il pleuvait des ois...,MK2 Mile End,https://en.wikipedia.org/wiki/And_the_Birds_Ra...,821,2019-09-13,
695,Ma,Universal,https://en.wikipedia.org/wiki/Ma_(2019_film),775,2019-05-31,0.0
696,Ma,Universal,https://en.wikipedia.org/wiki/Ma_(film),775,2019-05-31,0.0
718,Us,Universal,https://en.wikipedia.org/wiki/Us_(2019_film),753,2019-03-22,0.0
880,Princess Mononoke - Studio Ghibli (2018),,https://en.wikipedia.org/wiki/Princess_Mononoke,591,2018-07-22,
1156,Gifted,,https://en.wikipedia.org/wiki/Gifted_(film),315,2017-04-07,0.0


In [736]:
missing_ids = pd.DataFrame([['tt19652910','EO','EO',1099],
                             ['tt12261776','65','65',1132],
                             ['tt0983946','Fantasy Island','Blumhouse Fantasy Island Horror',909],
                             ['tt7958736','Ma','Ma',775],
                             ['tt6857112','Us','Us',753],
                             ['tt4481414','Gifted','Gifted',315]],
                          columns=['imdb_id','name','title','title_id'])
missing_ids

Unnamed: 0,imdb_id,name,title,title_id
0,tt19652910,EO,EO,1099
1,tt12261776,65,65,1132
2,tt0983946,Fantasy Island,Blumhouse Fantasy Island Horror,909
3,tt7958736,Ma,Ma,775
4,tt6857112,Us,Us,753
5,tt4481414,Gifted,Gifted,315


In [738]:
imdb_ids = pd.concat([imdb_ids, missing_ids], ignore_index=True)

### pull imdb meta

In [740]:
imdb_meta = get_imdb_api_film_meta(imdb_ids)

tt26581740
tt30017619
tt26743210
tt32246771
tt11655566
tt9619824
tt31112509
tt30505698
tt11092020
tt27813235
tt20969586
tt15042300
tt30955489
tt7068946
tt31193180
tt8866456
tt32214413
tt31434639
tt32149847
tt0899043
tt3566834
tt21267296
tt26677014
tt9150192
tt27674982
tt18108824
tt31314296
tt28443655
tt21815562
tt28613536
tt6208148
tt29929565
tt14127944
tt30988739
tt29603959
tt32083311
tt26671415
tt12299608
tt27775403
tt28007064
tt14403504
tt5181260
tt27714946
tt14513804
tt5822536
tt23853982
tt32558992
tt30788842
tt26584495
tt10954718
tt21375602
tt28249919
tt10078772
tt14961016
tt32221196
tt4216984
tt8008948
tt11891850
tt31193791
tt22042346
tt30057084
tt6133444
tt5040012
tt14260836
tt11563598
tt29439114
tt8999762
tt18259086
tt13186482
tt28082769
tt8790086
tt14824600
tt13232552
tt23055660
tt15551346
tt3864060
tt12810074
tt27218960
tt19861162
tt13622970
tt24176060
tt32178949
tt9218128
tt1262426
tt14948432
tt2347285
tt28015403
tt27196021
tt21823606
tt18272208
tt27403986
tt20215234
tt16366

tt5437928
tt3829266
tt6998518
tt4537896
tt6040662
tt7040874
tt7339792
tt5814060
tt5702446
tt6850820
tt13444408
tt7948540
tt6859762
tt5208252
tt5709188
tt3127022
tt1308728
tt7668870
tt0070511
tt5779372
tt6443294
tt4244998
tt4560436
tt5607096
tt1442462
tt3104988
tt8590896
tt5690360
tt4779682
tt7349662
tt4080956
tt7365604
tt8564902
tt4575576
tt6663582
tt4073790
tt8116428
tt6933454
tt4912910
tt7424200
tt7242142
tt3766354
tt8264546
tt4761916
tt6911608
tt6510332
tt5220122
tt6288124
tt5758778
tt7014006
tt5688932
tt5095030
tt1850458
tt6133466
tt5052474
tt7334528
tt6452574
tt7664504
tt3892172
tt4881806
tt4602066
tt2854926
tt7431594
tt3606756
tt0116441
tt7690670
tt7784604
tt7681902
tt1494839
tt7158430
tt5834262
tt6499752
tt6212478
tt6306064
tt5842616
tt6495770
tt3778644
tt6857166
tt6053438
tt5463162
tt6915100
tt5691024
tt1667321
tt4682136
tt7098658
tt5619332
tt0903747
tt0093693
tt6580564
tt3203528
tt5610554
tt7689964
tt4154756
tt6108178
tt5670152
tt0859635
tt6791096
tt6980546
tt6217608
tt2231461

In [741]:
### only need 
imdb_meta2 = imdb_meta[['imdb_id','type','name','contentRating','datePublished','genre','title','title_id']]

## merge in release date from wiki to compare
imdb_meta2 = pd.merge(imdb_meta2, wiki_df[['title_id','release_date']].drop_duplicates(), on='title_id')

print(imdb_meta2.shape)
imdb_meta2.head(3)

(1340, 9)


Unnamed: 0,imdb_id,type,name,contentRating,datePublished,genre,title,title_id,release_date
0,tt26581740,Movie,Weapons,R,2025-08-08,"[Horror, Mystery]",Weapons,1471,2025-08-08
1,tt30017619,,,,,,The Bad Guys 2,1470,2025-08-01
2,tt26743210,Movie,How to Train Your Dragon,PG,2025-06-13,"[Action, Adventure, Comedy]",How to Train Your Dragon,1469,2025-06-13


In [747]:
### what's misisng...
wiki_df[~wiki_df['title_id'].isin(imdb_meta2['title_id'])]

Unnamed: 0,title,distributor,wiki_url,title_id,release_date,franchise
445,Dog,United Artists Releasing,https://en.wikipedia.org/wiki/Dog_(2022_film),1026,2022-02-18,0.0
484,Dune,Warner Bros.,https://en.wikipedia.org/wiki/Dune_(2021_film),987,2021-10-22,0.0
551,Escape Room 2,Sony,https://en.wikipedia.org/wiki/Escape_Room:_Tou...,920,2020-08-14,1.0
561,Sonic The Hedgehog,Paramount,https://en.wikipedia.org/wiki/Sonic_the_Hedgeh...,910,2020-02-14,0.0
562,Blumhouse Fantasy Island Horror,Sony,https://en.wikipedia.org/wiki/Fantasy_Island_(...,909,2020-02-14,
650,And the Birds Rained Down (Il pleuvait des ois...,MK2 Mile End,https://en.wikipedia.org/wiki/And_the_Birds_Ra...,821,2019-09-13,
677,The Lion King,Disney,https://en.wikipedia.org/wiki/The_Lion_King_(2...,794,2019-07-19,1.0
691,Late Night,Amazon Studios,https://en.wikipedia.org/wiki/Late_Night_(film),780,2019-06-07,0.0
709,Little,Universal,https://en.wikipedia.org/wiki/Little_(film),762,2019-04-12,0.0
880,Princess Mononoke - Studio Ghibli (2018),,https://en.wikipedia.org/wiki/Princess_Mononoke,591,2018-07-22,


In [752]:
wiki_df['title_id'].nunique()

1357

In [753]:
imdb_meta2[imdb_meta2['title'] == 'EO']

Unnamed: 0,imdb_id,type,name,contentRating,datePublished,genre,title,title_id,release_date
1336,tt19652910,Movie,Eo,Not Rated,2023-01-21,[Drama],EO,1099,2022-11-18


In [754]:
imdb_meta2[imdb_meta2['title'].str.contains('Broly')]

Unnamed: 0,imdb_id,type,name,contentRating,datePublished,genre,title,title_id,release_date
698,tt30277682,Movie,Dragon Ball Z: Broly - The Legendary Super Sai...,,,"[Animation, Short, Action]",Dragon Ball Super: Broly,720,2019-01-16


### find exact matches

In [755]:
### identify which titles are missing...or incorrect using release date
imdb_need_correction = imdb_meta2[(imdb_meta2['datePublished'] != imdb_meta2['release_date']) | 
                                  (imdb_meta2['type'] != 'Movie')]


print(imdb_need_correction.shape)
imdb_need_correction.head()

(492, 9)


Unnamed: 0,imdb_id,type,name,contentRating,datePublished,genre,title,title_id,release_date
1,tt30017619,,,,,,The Bad Guys 2,1470,2025-08-01
6,tt31112509,Movie,Jane Austen a gâché ma vie,R,2025-05-30,[Romance],Jane Austen Wrecked My Life,1465,2025-05-16
15,tt8866456,Movie,The Legend of Ochi,PG,2025-04-25,"[Adventure, Family, Fantasy]",The Legend of Ochi,1455,2025-04-18
24,tt27674982,Movie,The Ballad of Wallis Island,PG-13,2025-04-18,"[Comedy, Drama, Music]",The Ballad of Wallis Island,1446,2025-03-28
25,tt18108824,Movie,The Friend,R,2025-04-04,"[Comedy, Drama]",The Friend,1445,2025-03-28


In [406]:
# Set the option to display all rows
pd.set_option('display.max_rows', None)

# # Optionally, reset the display option to default
# pd.set_option('display.max_rows', 10)

In [758]:
## keep ids for missing 
missing_titles = imdb_need_correction[imdb_need_correction['name'].isna()]['title_id'].tolist()

In [759]:
### review this manually for ok match or not
imdb_need_correction[~imdb_need_correction['title_id'].isin(missing_titles)]

Unnamed: 0,imdb_id,type,name,contentRating,datePublished,genre,title,title_id,release_date
6,tt31112509,Movie,Jane Austen a gâché ma vie,R,2025-05-30,[Romance],Jane Austen Wrecked My Life,1465,2025-05-16
15,tt8866456,Movie,The Legend of Ochi,PG,2025-04-25,"[Adventure, Family, Fantasy]",The Legend of Ochi,1455,2025-04-18
24,tt27674982,Movie,The Ballad of Wallis Island,PG-13,2025-04-18,"[Comedy, Drama, Music]",The Ballad of Wallis Island,1446,2025-03-28
25,tt18108824,Movie,The Friend,R,2025-04-04,"[Comedy, Drama]",The Friend,1445,2025-03-28
29,tt28613536,Movie,Bob Trevino Likes It,PG-13,2025-03-20,"[Comedy, Drama]",Bob Trevino Likes It,1441,2025-03-21
35,tt32083311,Movie,On Becoming a Guinea Fowl,PG-13,2024-12-06,"[Comedy, Drama]",On Becoming a Guinea Fowl,1435,2025-03-07
45,tt23853982,Movie,Parthenope,R,2024-10-24,"[Drama, Fantasy]",Parthenope,1424,2025-02-07
53,tt14961016,Movie,Ainda Estou Aqui,PG-13,2025-02-07,"[Biography, Drama, History]",Im Still Here,1415,2025-01-17
63,tt14260836,Movie,Better Man,R,2025-01-10,"[Biography, Fantasy, Music]",Better Man,1405,2024-12-25
65,tt29439114,Movie,The Room Next Door,PG-13,2024-10-18,[Drama],The Room Next Door,1403,2024-12-20


In [760]:
imdb_meta2[imdb_meta2['title_id'] == 1282]

Unnamed: 0,imdb_id,type,name,contentRating,datePublished,genre,title,title_id,release_date
168,tt26739710,Movie,Backspot,,2024-09-16,"[Drama, Sport]",Backspot,1282,2024-05-31


In [761]:
review_titles = [1279, 1238, 1235, 1162, 1128, 1106, 1088, 1078,
                 1064,1063,1062, 1019, 1018, 1012,
                 998,988,955,953,948,937,923,902,898,884,
                 874,866,856,852,827,810,804,798,
                 785,778,757,736,721,720,704,683, 675,673,
                 662,655,642,637,635,577,561,519,503,
                 445,433,353,331,324,322,320,318,309,308,
                 286,270,246,243,207,166,159,156,141,
                 138,107,101,96,75,65,]

In [410]:
# review_titles = [1470, 1464, 1462, 
#                  1452, 1448, 1445,1440,1432,
#                  1424,1423,1418,1417,1411,1409,1406,1405,
#                  1395, 1356,1351,1350,1341,1336,
#                  1313,1309,1300,1282,1279,1273,
#                  1260,1256,1254,1253,1238,1220,1216,
#                  1206,1198,1188,1177,1168,1156,1152,
#                  1149,1144,1141,1135,1128,1127,
#                  1119,1118,1102,1079,1062,1056,
#                  1033,1030,1027,1019,1017,1014,
#                  1012,1001,988,976,963,960,950,948,947,
#                  944,922,919,917,910,909,905,900,
#                  891,881,879,868,864,856,840,820,
#                  816,810,798,796,794,788,773,772,760,
#                  759,750,748,746,727,725,719,717,715,701,688,
#                  675,654,642,640,635,627,614,611,610,607,588,585,
#                  575,564,551,539,535,531,521,519,518,
#                  464,460,459,436,424,422,403,402,383,379,342,337,
#                  334,330,325,324,310,308,304,292,264,203,197,
#                  194,184,178,177,176,173,168,166,159,152,147,
#                  131,129,127,122,112,101,98,97,91,81,71,69,68, 65,]

In [762]:
### identify ok titles
imdb_ok = imdb_meta2[(imdb_meta2['datePublished'] == imdb_meta2['release_date']) | 
                     ((imdb_meta2['datePublished'] != imdb_meta2['release_date'])) & 
                     (~imdb_meta2['title_id'].isin(review_titles) &
                     (~imdb_meta2['title_id'].isin(missing_titles)))]

print(imdb_ok.shape)
imdb_ok.head()

(1191, 9)


Unnamed: 0,imdb_id,type,name,contentRating,datePublished,genre,title,title_id,release_date
0,tt26581740,Movie,Weapons,R,2025-08-08,"[Horror, Mystery]",Weapons,1471,2025-08-08
2,tt26743210,Movie,How to Train Your Dragon,PG,2025-06-13,"[Action, Adventure, Comedy]",How to Train Your Dragon,1469,2025-06-13
3,tt32246771,Movie,Bring Her Back,R,2025-05-30,"[Horror, Mystery]",Bring Her Back,1468,2025-05-30
4,tt11655566,Movie,Lilo & Stitch,PG,2025-05-23,"[Action, Adventure, Comedy]",Lilo & Stitch,1467,2025-05-23
5,tt9619824,Movie,Final Destination: Bloodlines,R,2025-05-16,[Horror],Final Destination: Bloodlines,1466,2025-05-16


In [763]:
## identify titles that need to be corrected
# imdb_need_correction = imdb_need_correction[~imdb_need_correction['title_id'].isin(
#     imdb_ok['title_id'])]

imdb_need_correction = imdb_need_correction[~imdb_need_correction['title_id'].isin(
    imdb_ok['title_id'])]

In [764]:
imdb_need_correction.shape

(149, 9)

In [766]:
imdb_nc_ids = get_imdb_api_film_id(imdb_need_correction, with_year=False)

The Bad Guys 2
The Unbreakable Boy
Captain America: Brave New World
Wolf Man
Nosferatu
Here
Daddio
Inside Out 2
Solo
Immaculate
Origin
Mean Girls
Napoleon
Priscilla
Barbie
Elemental
The Machine
Emily
Knock at the Cabin
80 for Brady
Missing
The Son
Till
Lyle Lyle Crocodile
Tar
Gods Country
Bodies Bodies Bodies
Vengeance
Jurassic World Dominion
Men
The Worst Person in the World
Scream
The 355
The Kings Man
Cmon Cmon
The French Dispatch
The Card Counter
Mortal Kombat
Voyagers
The Unholy
Tom & Jerry
Freaky
The Nest
The Turning
The Grudge (2020)
Cats
Portrait Of A Lady On Fire
Charlies Angels
Harriet
Frankie
Jay & Silent Bob Reboot
The Addams Family
Downton Abbey
Dream Girl
The Sound Of Silence
Dont Let Go
After The Wedding
Fast & Furious Presents: Hobbs & Shaw
Midsommar
Childs Play
The Fall Of The American Empire
Booksmart
Tolkien
Pet Sematary
Gloria Bell
To Dust
The Invisibles
Glass
Dragon Ball Super: Broly
Stan And Ollie
Vice
Zero
Cold War
They Shall Not Grow Old
Robin Hood
Dr. Seuss The

In [767]:
imdb_nc_ids

Unnamed: 0,id,name,url,poster,title,title_id
0,tt30017619,"The Bad Guys 2 2025 Sam Rockwell, Marc Maron",https://www.imdb.com/title/tt30017619/?ref_=fn...,https://m.media-amazon.com/images/M/MV5BZTAyZD...,The Bad Guys 2,1470
1,tt5247200,Die Hard 2: The Bad Guys - Villains' Profile 1...,https://www.imdb.com/title/tt5247200/?ref_=fn_...,https://m.media-amazon.com/images/M/MV5BNjkzOD...,The Bad Guys 2,1470
2,tt8115900,"The Bad Guys 2022 Sam Rockwell, Marc Maron",https://www.imdb.com/title/tt8115900/?ref_=fn_...,https://m.media-amazon.com/images/M/MV5BODNlYm...,The Bad Guys 2,1470
3,tt8736086,Bring on the Bad Guys: Part Two 2018 S2.E9 TV ...,https://www.imdb.com/title/tt8736086/?ref_=fn_...,https://m.media-amazon.com/images/M/MV5BMjBiMz...,The Bad Guys 2,1470
4,tt1619404,Crooks 2: When the Bad Guys Have the Camera TV...,https://www.imdb.com/title/tt1619404/?ref_=fn_...,https://m.media-amazon.com/images/M/MV5BYzlkNT...,The Bad Guys 2,1470
0,tt5181260,"The Unbreakable Boy 2025 Zachary Levi, Meghann...",https://www.imdb.com/title/tt5181260/?ref_=fn_...,https://m.media-amazon.com/images/M/MV5BYjYzN2...,The Unbreakable Boy,1429
1,tt35838329,The Unbreakable Boy | Movie Review 2025 S7.E17...,https://www.imdb.com/title/tt35838329/?ref_=fn...,https://m.media-amazon.com/images/M/MV5BZDJlNT...,The Unbreakable Boy,1429
2,tt26693752,"Family 2023 TV Series Jang Hyuk, Jang Na-ra",https://www.imdb.com/title/tt26693752/?ref_=fn...,https://m.media-amazon.com/images/M/MV5BM2RiNj...,The Unbreakable Boy,1429
3,tt11957526,"The Unbreakable Bunch 2024 Ray Lloyd, Ernest M...",https://www.imdb.com/title/tt11957526/?ref_=fn...,https://m.media-amazon.com/images/M/MV5BZjAzOG...,The Unbreakable Boy,1429
0,tt14513804,Captain America: Brave New World 2025 Anthony ...,https://www.imdb.com/title/tt14513804/?ref_=fn...,https://m.media-amazon.com/images/M/MV5BNDRjY2...,Captain America: Brave New World,1426


In [768]:
### keep first row...best match
### anything incorrect will be manually corrected
imdb_nc_ids.drop_duplicates('title_id', keep='first', inplace=True)
imdb_nc_ids.rename(columns={'id':'imdb_id'}, inplace=True)

imdb_nc_ids = imdb_nc_ids[['imdb_id','name','title','title_id']]

In [769]:
print(imdb_nc_ids.shape)
imdb_nc_ids.head()

(149, 4)


Unnamed: 0,imdb_id,name,title,title_id
0,tt30017619,"The Bad Guys 2 2025 Sam Rockwell, Marc Maron",The Bad Guys 2,1470
0,tt5181260,"The Unbreakable Boy 2025 Zachary Levi, Meghann...",The Unbreakable Boy,1429
0,tt14513804,Captain America: Brave New World 2025 Anthony ...,Captain America: Brave New World,1426
0,tt4216984,"Wolf Man 2025 Christopher Abbott, Julia Garner",Wolf Man,1413
0,tt5040012,"Nosferatu 2024 Lily-Rose Depp, Nicholas Hoult",Nosferatu,1406


In [770]:
## manual corrections for imdb id...
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1341, 'imdb_id'] = 'tt0327785'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1313, 'imdb_id'] = 'tt28490634'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1279, 'imdb_id'] = 'tt27715185'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1238, 'imdb_id'] = 'tt13321244'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1235, 'imdb_id'] = 'tt11762114'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1162, 'imdb_id'] = 'tt11040844'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1128, 'imdb_id'] = 'tt12374656'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1106, 'imdb_id'] = 'tt14458442'


imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1088, 'imdb_id'] = 'tt4960748'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1078, 'imdb_id'] = 'tt11818818'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1064, 'imdb_id'] = 'tt8110652'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1062, 'imdb_id'] = 'tt11976532'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1044, 'imdb_id'] = 'tt13841850'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1019, 'imdb_id'] = 'tt10370710'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1018, 'imdb_id'] = 'tt11245972'

imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 1012, 'imdb_id'] = 'tt6856242'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 998, 'imdb_id'] = 'tt10986222'

imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 955, 'imdb_id'] = 'tt0293429'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 953, 'imdb_id'] = 'tt9419056'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 948, 'imdb_id'] = 'tt1361336'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 937, 'imdb_id'] = 'tt10919380'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 923, 'imdb_id'] = 'tt8338762'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 910, 'imdb_id'] = 'tt3794354'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 902, 'imdb_id'] = 'tt7510346'

imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 888, 'imdb_id'] = 'tt10481868'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 884, 'imdb_id'] = 'tt8613070'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 879, 'imdb_id'] = 'tt8722346'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 874, 'imdb_id'] = 'tt5033998'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 866, 'imdb_id'] = 'tt8019694'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 856, 'imdb_id'] = 'tt6521876'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 852, 'imdb_id'] = 'tt1620981'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 840, 'imdb_id'] = 'tt8244784'

imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 830, 'imdb_id'] = 'tt6398184'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 827, 'imdb_id'] = 'tt9248940'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 810, 'imdb_id'] = 'tt6803212'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 804, 'imdb_id'] = 'tt7985692'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 798, 'imdb_id'] = 'tt6806448'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 794, 'imdb_id'] = 'tt6105098'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 778, 'imdb_id'] = 'tt7231342'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 762, 'imdb_id'] = 'tt8085790'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 757, 'imdb_id'] = 'tt0837563'


imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 725, 'imdb_id'] = 'tt6476140'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 721, 'imdb_id'] = 'tt6823368'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 720, 'imdb_id'] = 'tt7961060'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 708, 'imdb_id'] = 'tt6266538'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 704, 'imdb_id'] = 'tt6527426'


imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 683, 'imdb_id'] = 'tt4532826'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 675, 'imdb_id'] = 'tt2709692'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 673, 'imdb_id'] = 'tt5177088'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 655, 'imdb_id'] = 'tt1502407'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 642, 'imdb_id'] = 'tt2837574'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 637, 'imdb_id'] = 'tt7745068'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 635, 'imdb_id'] = 'tt8632862'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 621, 'imdb_id'] = 'tt6017942'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 613, 'imdb_id'] = 'tt5093026'


imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 607, 'imdb_id'] = 'tt3750872'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 577, 'imdb_id'] = 'tt5740866'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 565, 'imdb_id'] = 'tt1801552'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 561, 'imdb_id'] = 'tt5164214'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 543, 'imdb_id'] = 'tt7137846'


imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 519, 'imdb_id'] = 'tt5836316'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 518, 'imdb_id'] = 'tt5457078'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 503, 'imdb_id'] = 'tt7363076'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 484, 'imdb_id'] = 'tt6456222'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 445, 'imdb_id'] = 'tt3312830'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 433, 'imdb_id'] = 'tt4587656'

imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 413, 'imdb_id'] = 'tt7207238'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 368, 'imdb_id'] = 'tt5758404'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 353, 'imdb_id'] = 'tt4481514'

imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 348, 'imdb_id'] = 'tt6213758'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 331, 'imdb_id'] = 'tt1469304'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 324, 'imdb_id'] = 'tt1321869'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 322, 'imdb_id'] = 'tt1610525'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 320, 'imdb_id'] = 'tt4629266'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 318, 'imdb_id'] = 'tt3462710'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 309, 'imdb_id'] = 'tt5442430'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 308, 'imdb_id'] = 'tt1781058'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 286, 'imdb_id'] = 'tt0498381'


imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 279, 'imdb_id'] = 'tt1619029'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 270, 'imdb_id'] = 'tt3976144'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 246, 'imdb_id'] = 'tt1540011'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 243, 'imdb_id'] = 'tt4731008'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 207, 'imdb_id'] = 'tt1489889'

imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 113, 'imdb_id'] = 'tt2381111'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 107, 'imdb_id'] = 'tt1051904'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 96, 'imdb_id'] = 'tt3488710'


imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 166, 'imdb_id'] = 'tt3704050'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 159, 'imdb_id'] = 'tt4263482'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 138, 'imdb_id'] = 'tt2058673'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 101, 'imdb_id'] = 'tt4296800'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 75, 'imdb_id'] = 'tt1502712'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 74, 'imdb_id'] = 'tt2402101'


imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 69, 'imdb_id'] = 'tt2764784'
imdb_nc_ids.loc[imdb_nc_ids['title_id'] == 65, 'imdb_id'] = 'tt3399024'



In [776]:
imdb_nc_meta = get_imdb_api_film_meta(imdb_nc_ids)

tt30017619
tt5181260
tt14513804
tt4216984
tt5040012
tt18272208
tt7461686
tt22022452
tt27715185
tt23137390
tt13321244
tt11762114
tt13287846
tt22041854
tt1517268
tt15789038
tt11040844
tt12374656
tt15679400
tt18079362
tt10855768
tt14458442
tt4960748
tt14668630
tt14444726
tt11818818
tt8110652
tt11976532
tt8041270
tt13841850
tt10370710
tt11245972
tt8356942
tt6856242
tt10986222
tt8847712
tt11196036
tt0293429
tt9664108
tt9419056
tt1361336
tt10919380
tt8338762
tt7510346
tt16425368
tt5697572
tt8613070
tt5033998
tt4648786
tt8019694
tt6521876
tt1620981
tt6398184
tt9248940
tt9352926
tt6803212
tt7985692
tt6806448
tt8772262
tt5203760
tt7231342
tt1489887
tt3361792
tt0837563
tt6902696
tt7117594
tt14872806
tt6823368
tt7961060
tt3385524
tt6266538
tt6527426
tt6543652
tt7905466
tt4532826
tt2709692
tt5177088
tt7074886
tt14753120
tt1502407
tt5613484
tt4595882
tt2837574
tt7745068
tt8632862
tt6998518
tt6017942
tt6859762
tt5093026
tt3750872
tt5740866
tt5052474
tt5164214
tt7137846
tt5836316
tt5457078
tt1677720


In [777]:
### only need 
imdb_nc_meta2 = imdb_nc_meta[['imdb_id','name','contentRating','datePublished','genre','title','title_id']]

## merge in release date from wiki to compare
imdb_nc_meta2 = pd.merge(imdb_nc_meta2, wiki_df[['title_id','release_date']].drop_duplicates(), on='title_id')

print(imdb_nc_meta2.shape)
imdb_nc_meta2.head(3)

(149, 8)


Unnamed: 0,imdb_id,name,contentRating,datePublished,genre,title,title_id,release_date
0,tt30017619,,,,,The Bad Guys 2,1470,2025-08-01
1,tt5181260,The Unbreakable Boy,PG,2025-02-21,[Drama],The Unbreakable Boy,1429,2025-02-21
2,tt14513804,Captain America: Brave New World,PG-13,2025-02-14,"[Action, Adventure, Sci-Fi]",Captain America: Brave New World,1426,2025-02-14


In [778]:
## check
imdb_nc_meta2

Unnamed: 0,imdb_id,name,contentRating,datePublished,genre,title,title_id,release_date
0,tt30017619,,,,,The Bad Guys 2,1470,2025-08-01
1,tt5181260,The Unbreakable Boy,PG,2025-02-21,[Drama],The Unbreakable Boy,1429,2025-02-21
2,tt14513804,Captain America: Brave New World,PG-13,2025-02-14,"[Action, Adventure, Sci-Fi]",Captain America: Brave New World,1426,2025-02-14
3,tt4216984,Wolf Man,R,2025-01-17,[Horror],Wolf Man,1413,2025-01-17
4,tt5040012,Nosferatu,R,2024-12-25,"[Fantasy, Horror, Mystery]",Nosferatu,1406,2024-12-25
5,tt18272208,Here,PG-13,2024-11-01,[Drama],Here,1377,2024-11-01
6,tt7461686,Daddio,R,2024-06-27,[Drama],Daddio,1300,2024-06-28
7,tt22022452,,,,,Inside Out 2,1288,2024-06-14
8,tt27715185,Solo,,,"[Drama, Romance]",Solo,1279,2024-05-24
9,tt23137390,Immaculate,R,2024-03-22,"[Horror, Thriller]",Immaculate,1254,2024-03-22


In [779]:
imdb_nc_meta2[imdb_nc_meta2['title_id'] == 1406]

Unnamed: 0,imdb_id,name,contentRating,datePublished,genre,title,title_id,release_date
4,tt5040012,Nosferatu,R,2024-12-25,"[Fantasy, Horror, Mystery]",Nosferatu,1406,2024-12-25


#### manually add in missing titles

In [797]:
## no meta found in api...
imdb_missing = imdb_nc_meta2[imdb_nc_meta2['genre'].isna()]

## titles from wiki df not present
wiki_missing = wiki_df[~(wiki_df['title_id'].isin(imdb_nc_meta2['title_id'])) & 
                       ~(wiki_df['title_id'].isin(imdb_ok['title_id']))][['title','title_id','release_date']]

missing_titles2 = pd.concat([imdb_missing, wiki_missing], ignore_index=True)
missing_titles2

Unnamed: 0,imdb_id,name,contentRating,datePublished,genre,title,title_id,release_date
0,tt30017619,,,,,The Bad Guys 2,1470,2025-08-01
1,tt22022452,,,,,Inside Out 2,1288,2024-06-14
2,tt18079362,,,,,80 for Brady,1125,2023-02-03
3,tt9664108,,,,,Voyagers,954,2021-04-09
4,tt1361336,,,,,Tom & Jerry,948,2021-02-26
5,tt6266538,,,,,Vice,708,2018-12-25
6,tt2837574,,,,,The Old Man & The Gun,642,2018-09-28
7,tt8632862,,,,,Fahrenheit 11/9,635,2018-09-21
8,tt6998518,,,,,Mandy,629,2018-09-14
9,tt5740866,,,,,Whitney,577,2018-07-06


In [799]:
### add imdb ids...
missing_titles2.loc[missing_titles2['title_id'] == 1026, 'imdb_id'] = 'tt11252248'
missing_titles2.loc[missing_titles2['title_id'] == 987, 'imdb_id'] = 'tt1160419'
missing_titles2.loc[missing_titles2['title_id'] == 920, 'imdb_id'] = 'tt9844522'
missing_titles2.loc[missing_titles2['title_id'] == 910, 'imdb_id'] = 'tt3794354'
missing_titles2.loc[missing_titles2['title_id'] == 909, 'imdb_id'] = 'tt0983946'
missing_titles2.loc[missing_titles2['title_id'] == 821, 'imdb_id'] = 'tt10441486'
missing_titles2.loc[missing_titles2['title_id'] == 794, 'imdb_id'] = 'tt6105098'
missing_titles2.loc[missing_titles2['title_id'] == 780, 'imdb_id'] = 'tt6107548'
missing_titles2.loc[missing_titles2['title_id'] == 762, 'imdb_id'] = 'tt8085790'
missing_titles2.loc[missing_titles2['title_id'] == 327, 'imdb_id'] = 'tt5001718'
missing_titles2.loc[missing_titles2['title_id'] == 315, 'imdb_id'] = 'tt4481414'
missing_titles2.loc[missing_titles2['title_id'] == 277, 'imdb_id'] = 'tt3521164'


missing_titles2.loc[missing_titles2['title_id'] == 237, 'imdb_id'] = 'tt4230700'
missing_titles2.loc[missing_titles2['title_id'] == 187, 'imdb_id'] = 'tt2093991'
missing_titles2.loc[missing_titles2['title_id'] == 155, 'imdb_id'] = 'tt1431045'
missing_titles2.loc[missing_titles2['title_id'] == 120, 'imdb_id'] = 'tt3707106'


In [803]:
missing_titles2.dropna(subset=['imdb_id'], inplace=True)
missing_titles2.head()

Unnamed: 0,imdb_id,name,contentRating,datePublished,genre,title,title_id,release_date
0,tt30017619,,,,,The Bad Guys 2,1470,2025-08-01
1,tt22022452,,,,,Inside Out 2,1288,2024-06-14
2,tt18079362,,,,,80 for Brady,1125,2023-02-03
3,tt9664108,,,,,Voyagers,954,2021-04-09
4,tt1361336,,,,,Tom & Jerry,948,2021-02-26


In [804]:
missing_still = get_imdb_api_film_meta(missing_titles2)

tt30017619
tt22022452
tt18079362
tt9664108
tt1361336
tt6266538
tt2837574
tt8632862
tt6998518
tt5740866
tt5052474
tt3371366
tt1781058
tt1051904
tt11252248
tt1160419
tt9844522
tt3794354
tt0983946
tt10441486
tt6105098
tt6107548
tt8085790
tt5001718
tt4481414
tt3521164
tt4230700
tt2093991
tt1431045
tt3707106


In [1053]:
### only need 
missing_still = missing_still[['imdb_id','name','contentRating','datePublished','genre','title','title_id']]

## merge in release date from wiki to compare
missing_still = pd.merge(missing_still, wiki_df[['title_id','release_date']].drop_duplicates(), on='title_id')
missing_still.dropna(inplace=True)

missing_still = pd.concat([missing_still, imdb_nc_meta2[(imdb_nc_meta2['genre'].isna())]], ignore_index=True)
missing_still.drop_duplicates(['title_id'], keep='first', inplace=True)

missing_still = pd.concat([missing_still, wiki_missing], ignore_index=True)
missing_still.drop_duplicates(['title_id'], keep='first', inplace=True)

print(missing_still.shape)
missing_still.head(3)

(31, 8)


Unnamed: 0,imdb_id,name,contentRating,datePublished,genre,title,title_id,release_date
0,tt22022452,Inside Out 2,PG,2024-06-14,"[Animation, Adventure, Comedy]",Inside Out 2,1288,2024-06-14
1,tt18079362,80 for Brady,PG-13,2023-02-03,"[Comedy, Drama, Sport]",80 for Brady,1125,2023-02-03
2,tt9664108,Voyagers,PG-13,2021-04-09,"[Adventure, Mystery, Sci-Fi]",Voyagers,954,2021-04-09


In [1057]:
## ignore remaining missing....
missing_still.dropna(subset=['name'], inplace=True)

### combine ok first run and corrected re-run

In [1159]:
final_meta = pd.concat([imdb_ok, imdb_nc_meta2, missing_still], ignore_index=True)

final_meta.drop(['name','datePublished'], axis=1, inplace=True)
final_meta.drop_duplicates(['title','title_id'], keep='last', inplace=True)

final_meta.set_index(['title','title_id'], inplace=True)
final_meta.reset_index(inplace=True)

final_meta.columns = [c.lower() for c in final_meta.columns]

print(final_meta.shape)
final_meta.head(3)

(1355, 7)


Unnamed: 0,title,title_id,imdb_id,type,contentrating,genre,release_date
0,Weapons,1471,tt26581740,Movie,R,"[Horror, Mystery]",2025-08-08
1,How to Train Your Dragon,1469,tt26743210,Movie,PG,"[Action, Adventure, Comedy]",2025-06-13
2,Bring Her Back,1468,tt32246771,Movie,R,"[Horror, Mystery]",2025-05-30


In [1160]:
final_meta['contentrating'].value_counts()

R            657
PG-13        418
PG           174
Not Rated     34
Unrated       13
TV-MA          5
G              4
TV-PG          3
TV-14          2
Name: contentrating, dtype: int64

In [1161]:
final_meta[final_meta['contentrating'] == 'TV-PG']

Unnamed: 0,title,title_id,imdb_id,type,contentrating,genre,release_date
267,The Flash,1166,tt3107288,TVSeries,TV-PG,"[Action, Adventure, Drama]",2023-06-16
925,Leap!,379,tt0096684,TVSeries,TV-PG,"[Action, Adventure, Drama]",2017-08-25
1297,Chips,307,tt0075488,,TV-PG,"[Crime, Drama, Mystery]",2017-03-24


In [1162]:
final_meta.loc[final_meta['title_id'] == 307, ['contentrating','imdb_id']] = ['R','tt0493405']

In [1163]:
final_meta.at[final_meta[final_meta['title_id'] == 307].index[0], 'genre'] = ['Action','Comedy','Crime']

In [1164]:
final_meta.loc[final_meta['title_id'] == 379, ['contentrating','imdb_id']] = ['PG','tt2261287']

In [1165]:
final_meta.at[final_meta[final_meta['title_id'] == 379].index[0], 'genre'] = ['Animation','Adventure','Family']

In [1166]:
final_meta.loc[final_meta['title_id'] == 1166, ['contentrating','imdb_id']] = ['PG-13','tt0439572']

In [1167]:
final_meta.at[final_meta[final_meta['title_id'] == 1166].index[0], 'genre'] = ['Action','Adventure','Fantasy']

In [1168]:
final_meta[final_meta['contentrating'] == 'TV-14']

Unnamed: 0,title,title_id,imdb_id,type,contentrating,genre,release_date
548,Another Day Of Life,828,tt0672303,TVEpisode,TV-14,[Drama],2019-09-13
1089,Criminal,181,tt0452046,TVSeries,TV-14,"[Crime, Drama, Mystery]",2016-04-15


In [1169]:
final_meta.loc[final_meta['title_id'] == 181, ['contentrating','imdb_id']] = ['R','tt3014866']

In [1170]:
final_meta.at[final_meta[final_meta['title_id'] == 181].index[0], 'genre'] = ['Action','Sci-Fi','Thriller']

In [1171]:
final_meta.loc[final_meta['title_id'] == 828, ['contentrating','imdb_id']] = ['Unrated','tt2967856']

In [1172]:
final_meta.at[final_meta[final_meta['title_id'] == 828].index[0], 'genre'] = ['Animation','Biography']

In [1173]:
final_meta['contentrating'].value_counts()

R            659
PG-13        419
PG           175
Not Rated     34
Unrated       14
TV-MA          5
G              4
Name: contentrating, dtype: int64

In [1174]:
## check if any missing from wiki...
wiki_df[~wiki_df['title_id'].isin(final_meta.title_id)]

Unnamed: 0,title,distributor,wiki_url,title_id,release_date,franchise
650,And the Birds Rained Down (Il pleuvait des ois...,MK2 Mile End,https://en.wikipedia.org/wiki/And_the_Birds_Ra...,821,2019-09-13,
880,Princess Mononoke - Studio Ghibli (2018),,https://en.wikipedia.org/wiki/Princess_Mononoke,591,2018-07-22,


### convert genres to columns

In [1175]:

## overwrite genre
final_meta.at[final_meta.loc[final_meta.title_id == 785].index[0], 'genre'] = ['Horror','Sci-Fi','Thriller']
final_meta.at[final_meta.loc[final_meta.title_id == 69].index[0], 'genre']=  ['Drama','History']
final_meta.at[final_meta.loc[final_meta.title_id == 501].index[0], 'genre'] = ['Drama','History','Action']
final_meta.at[final_meta.loc[final_meta.title_id == 174].index[0], 'genre'] = ['Action','Adventure','Fantasy']
final_meta.at[final_meta.loc[final_meta.title_id == 859].index[0], 'genre'] = ['Action','Comedy','Horror']
final_meta.at[final_meta.loc[final_meta.title_id == 642].index[0], 'genre'] = ['Biography','Crime','Drama']

In [1176]:
import ast

final_meta['genre_list'] = final_meta['genre'].dropna().apply(lambda x: ast.literal_eval(str(x)))
final_meta['genre_count'] = final_meta['genre_list'].dropna().apply(lambda x: len(x))

In [1177]:
set(final_meta['genre_list'].dropna().sum())

{'Action',
 'Adventure',
 'Animation',
 'Aninmation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western'}

In [1228]:
len(['Action',
 'Adventure',
 'Animation',
 'Aninmation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western'])

23

In [1229]:
final_meta['contentrating'].unique()

array(['R', 'PG', 'PG-13', None, 'Not Rated', 'TV-MA', 'Unrated', 'G',
       nan], dtype=object)

In [1178]:
print(final_meta.shape)
print(final_meta.title.nunique())
final_meta.head()

(1355, 9)
1354


Unnamed: 0,title,title_id,imdb_id,type,contentrating,genre,release_date,genre_list,genre_count
0,Weapons,1471,tt26581740,Movie,R,"[Horror, Mystery]",2025-08-08,"[Horror, Mystery]",2.0
1,How to Train Your Dragon,1469,tt26743210,Movie,PG,"[Action, Adventure, Comedy]",2025-06-13,"[Action, Adventure, Comedy]",3.0
2,Bring Her Back,1468,tt32246771,Movie,R,"[Horror, Mystery]",2025-05-30,"[Horror, Mystery]",2.0
3,Lilo & Stitch,1467,tt11655566,Movie,PG,"[Action, Adventure, Comedy]",2025-05-23,"[Action, Adventure, Comedy]",3.0
4,Final Destination: Bloodlines,1466,tt9619824,Movie,R,[Horror],2025-05-16,[Horror],1.0


In [1215]:
genres = final_meta.groupby(['title','title_id'])[['genre_list']].sum().reset_index()

In [1216]:
genre_dummies = pd.get_dummies(genres['genre_list'].explode(), dtype=int).groupby(level=0).max()
genre_dummies = genres[['title','title_id']].join(genre_dummies)


## remove empty genre col
genre_dummies.drop([0], axis=1, inplace=True)

print(genre_dummies.shape)
genre_dummies.head(3)

(1355, 25)


Unnamed: 0,title,title_id,Action,Adventure,Animation,Aninmation,Biography,Comedy,Crime,Documentary,...,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western
0,10 Cloverfield Lane,169,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,102 Not Out,541,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12 Strong,468,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1217]:

## join sci-fi fantasy,
genre_dummies.loc[(genre_dummies[['Sci-Fi', 'Fantasy']].sum(axis=1) >= 1), 'SciFi_Fantasy'] = 1
genre_dummies['SciFi_Fantasy'].fillna(0, inplace=True)

## join action and adventure
genre_dummies.loc[(genre_dummies[['Action', 'Adventure']].sum(axis=1) >= 1), 'Action_Adventure'] = 1
genre_dummies['Action_Adventure'].fillna(0, inplace=True)

## join mystery/crime
genre_dummies.loc[(genre_dummies[['Mystery', 'Crime']].sum(axis=1) >= 1), 'Mystery_Crime'] = 1
genre_dummies['Mystery_Crime'].fillna(0, inplace=True)

# genre_dummies.loc[(genre_dummies[['Music', 'Musical']].sum(axis=1) >= 1), 'Music'] = 1

genre_dummies.loc[(genre_dummies[['Biography', 'History']].sum(axis=1) >= 1), 'Bio_History'] = 1
genre_dummies['Bio_History'].fillna(0, inplace=True)

## join horror/thriller
genre_dummies.loc[(genre_dummies[['Horror', 'Thriller']].sum(axis=1) >= 1), 'Horror_Thriller'] = 1
genre_dummies['Horror_Thriller'].fillna(0, inplace=True)


In [1218]:
## ignore these
genre_dummies = genre_dummies[~(genre_dummies[['Documentary','Talk-Show']].sum(axis=1) >= 1)]

In [1219]:
genre_dummies.columns

Index(['title', 'title_id', 'Action', 'Adventure', 'Animation', 'Aninmation',
       'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family',
       'Fantasy', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western',
       'SciFi_Fantasy', 'Action_Adventure', 'Mystery_Crime', 'Bio_History',
       'Horror_Thriller'],
      dtype='object')

In [1220]:
## keep only these genres
genre_keeps = ['Action_Adventure','Animation', 'Comedy', 'SciFi_Fantasy',
               'Drama', 'Family', 'Mystery_Crime','Bio_History',
               'Horror_Thriller', 'Romance']

In [1221]:
genre_dummies = genre_dummies[genre_keeps]

In [1222]:
genres = genres.join(genre_dummies, how='inner')
genres.fillna(0, inplace=True)
genres.drop(['genre_list'], axis=1, inplace=True)
genres.head(2)

Unnamed: 0,title,title_id,Action_Adventure,Animation,Comedy,SciFi_Fantasy,Drama,Family,Mystery_Crime,Bio_History,Horror_Thriller,Romance
0,10 Cloverfield Lane,169,0.0,0,0,1.0,1,0,0.0,0.0,1.0,0
1,102 Not Out,541,0.0,0,1,0.0,1,0,0.0,0.0,0.0,0


In [1223]:
genres.describe()

Unnamed: 0,title_id,Action_Adventure,Animation,Comedy,SciFi_Fantasy,Drama,Family,Mystery_Crime,Bio_History,Horror_Thriller,Romance
count,1321.0,1321.0,1321.0,1321.0,1321.0,1321.0,1321.0,1321.0,1321.0,1321.0,1321.0
mean,760.87131,0.361847,0.068887,0.336109,0.143073,0.569266,0.038607,0.248297,0.146858,0.259652,0.116578
std,406.878774,0.480717,0.253358,0.472556,0.35028,0.495367,0.19273,0.432188,0.354099,0.43861,0.321039
min,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,762.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,1115.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,1471.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [1224]:
final_meta2 = pd.merge(final_meta, genres)

In [1225]:
final_meta2.drop(['genre','genre_list','genre_count','type'], axis=1, inplace=True)

final_meta2.rename(columns={'contentrating':'content_rating'}, inplace=True)

In [1226]:
print(final_meta2.shape)
final_meta2.head()

(1321, 15)


Unnamed: 0,title,title_id,imdb_id,content_rating,release_date,Action_Adventure,Animation,Comedy,SciFi_Fantasy,Drama,Family,Mystery_Crime,Bio_History,Horror_Thriller,Romance
0,Weapons,1471,tt26581740,R,2025-08-08,0.0,0,0,0.0,0,0,1.0,0.0,1.0,0
1,How to Train Your Dragon,1469,tt26743210,PG,2025-06-13,1.0,0,1,0.0,0,0,0.0,0.0,0.0,0
2,Bring Her Back,1468,tt32246771,R,2025-05-30,0.0,0,0,0.0,0,0,1.0,0.0,1.0,0
3,Lilo & Stitch,1467,tt11655566,PG,2025-05-23,1.0,0,1,0.0,0,0,0.0,0.0,0.0,0
4,Final Destination: Bloodlines,1466,tt9619824,R,2025-05-16,0.0,0,0,0.0,0,0,0.0,0.0,1.0,0


In [1227]:
final_meta2.to_csv('imdb_film_metadata_2025-05-10.csv', index=False)