In [1]:
import requests
import pandas as pd

In [2]:
# Your TMDB API Key
API_KEY = ""

In [3]:
# TMDB URLs
GENRE_LIST_URL = "https://api.themoviedb.org/3/genre/movie/list?api_key={}&language=en-US".format(API_KEY)
DISCOVER_URL = "https://api.themoviedb.org/3/discover/movie?api_key={}".format(API_KEY)

In [None]:
# # check available genres

# params = {
#     'api_key': API_KEY,
#     'language': 'en-US'
# }

# response = requests.get(GENRE_LIST_URL, params=params).json()
# all_genres = {genre['id']: genre['name'] for genre in response['genres']}

# print(all_genres)

In [4]:
# Fetch genre IDs for your desired genres (you can modify this list as needed)
desired_genres = ["Action", "Comedy", "Drama", "Horror", "Thriller", "Adventure", "Romance", "Science Fiction"]
genre_response = requests.get(GENRE_LIST_URL).json()
genre_ids = [genre['id'] for genre in genre_response['genres'] if genre['name'] in desired_genres]



In [5]:
import time

# Construct a genre mapping from ID to name
genre_map = {genre['id']: genre['name'] for genre in genre_response['genres'] if genre['name'] in desired_genres}

# Fetch movies for each genre
movies = []
seen_movie_ids = set()  # Maintain a set of seen movie IDs to avoid duplicates

for genre_id in genre_ids:
    for page in range(1, 300):  # We take 49 pages for each genre
        payload = {
            'with_genres': genre_id,
            'language': 'en-US',
            'sort_by': 'popularity.desc',
            'page': page
        }
        response = requests.get(DISCOVER_URL, params=payload).json()
        for movie in response['results']:
            # If the movie ID is already seen, skip it
            if movie['id'] in seen_movie_ids:
                continue
            
            # Otherwise, add its ID to the set and append its details to the movies list
            seen_movie_ids.add(movie['id'])
            
            movie_data = (
                movie['title'],
                movie['overview'],  # Extracting the synopsis/overview
                [genre_map[genre] for genre in movie['genre_ids'] if genre in genre_ids]
            )
            movies.append(movie_data)
        
        # Introduce a delay of 2 seconds between consecutive requests
        #time.sleep(2)



In [6]:
print(len(movies))

30505


In [7]:
# # Print the results
# for movie in movies:
#     print(movie[0], "-", movie[1])

In [45]:
movies_df = pd.DataFrame(movies, columns = ['title', 'synopsis','genre'])

In [46]:
movies_df = movies_df[movies_df.synopsis!='']

In [47]:
movies_df.head()

Unnamed: 0,title,synopsis,genre
0,Expend4bles,Armed with every weapon they can get their han...,"[Action, Adventure, Thriller]"
1,Mission: Impossible - Dead Reckoning Part One,Ethan Hunt and his IMF team embark on their mo...,"[Action, Thriller]"
2,The Equalizer 3,Robert McCall finds himself at home in Souther...,"[Action, Thriller]"
3,Mortal Kombat Legends: Cage Match,"In 1980s Hollywood, action star Johnny Cage is...",[Action]
4,Desperation Road,"After 11 years in a Mississippi state prison, ...","[Action, Drama, Thriller]"


In [48]:
movies_df.shape

(30329, 3)

In [49]:
movies_df.genre.apply(len)

0        3
1        2
2        2
3        1
4        3
        ..
30500    2
30501    2
30502    2
30503    1
30504    2
Name: genre, Length: 30329, dtype: int64

In [50]:
# check count of movies with single genre assigned
movies_df[movies_df.genre.apply(len)==1].genre.value_counts()

genre
[Drama]              1749
[Comedy]             1594
[Horror]             1571
[Adventure]          1016
[Thriller]            724
[Science Fiction]     615
[Action]              595
[Romance]             468
Name: count, dtype: int64

In [51]:
my_movies = movies_df.copy()

In [52]:
#movies_df.to_csv('movies_df.csv',index=False)

# convert in a format suitable for multilabel classification

In [53]:
my_movies.sample(5)

Unnamed: 0,title,synopsis,genre
5376,Goodnight for Justice: The Measure of a Man,The story of John Goodnight continues. Travel...,[Action]
13516,Latter Days,"Christian, a hunky, 20-something, West Hollywo...","[Drama, Comedy, Romance]"
1126,Hackers,"Along with his new friends, a teenager who was...","[Action, Thriller, Drama]"
8537,The Castle of Fu Manchu,The evil mastermind Fu Manchu plots his latest...,[Adventure]
9350,Samurai Wolf,This is the story of a vagrant samurai – the s...,"[Action, Adventure, Drama]"


In [54]:
my_movies.genre.value_counts()[:20]

genre
[Drama]                      1749
[Drama, Romance]             1626
[Comedy]                     1594
[Horror]                     1571
[Horror, Thriller]           1087
[Adventure]                  1016
[Comedy, Romance]            1007
[Drama, Thriller]             866
[Thriller]                    724
[Action, Thriller]            656
[Comedy, Drama]               629
[Science Fiction]             615
[Action]                      595
[Comedy, Drama, Romance]      534
[Action, Adventure]           526
[Adventure, Comedy]           482
[Horror, Science Fiction]     482
[Romance, Drama]              482
[Romance]                     468
[Action, Science Fiction]     415
Name: count, dtype: int64

In [55]:
all_genres = set(x for sublist in my_movies['genre'].tolist() for x in sublist)

In [56]:
all_genres

{'Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Horror',
 'Romance',
 'Science Fiction',
 'Thriller'}

In [57]:
# One-hot encoding
for genre in all_genres:
    my_movies[genre] = my_movies['genre'].apply(lambda x: 1 if genre in x else 0)

In [58]:
# Drop the Movie_genre column if needed
my_movies = my_movies.drop('genre', axis=1)

In [59]:
my_movies.iloc[:,2:].sum()

Adventure           6085
Action              7838
Romance             6286
Thriller            7557
Drama              11413
Horror              6605
Science Fiction     5937
Comedy              9025
dtype: int64

In [60]:
my_movies.head()

Unnamed: 0,title,synopsis,Adventure,Action,Romance,Thriller,Drama,Horror,Science Fiction,Comedy
0,Expend4bles,Armed with every weapon they can get their han...,1,1,0,1,0,0,0,0
1,Mission: Impossible - Dead Reckoning Part One,Ethan Hunt and his IMF team embark on their mo...,0,1,0,1,0,0,0,0
2,The Equalizer 3,Robert McCall finds himself at home in Souther...,0,1,0,1,0,0,0,0
3,Mortal Kombat Legends: Cage Match,"In 1980s Hollywood, action star Johnny Cage is...",0,1,0,0,0,0,0,0
4,Desperation Road,"After 11 years in a Mississippi state prison, ...",0,1,0,1,1,0,0,0


In [61]:
my_movies.reset_index(drop=True,inplace=True)

In [62]:
my_movies.to_csv('movies_df.csv',index=False)

In [63]:
#movies_df = movies_df.drop('title',axis=1)

In [64]:
new_df = pd.DataFrame()
new_df['title'] = movies_df['title']
new_df['synopsis'] = movies_df['synopsis']
new_df['labels'] = movies_df.iloc[:, 2:].values.tolist()

In [65]:
new_df.head()

Unnamed: 0,title,synopsis,labels
0,Expend4bles,Armed with every weapon they can get their han...,"[[Action, Adventure, Thriller]]"
1,Mission: Impossible - Dead Reckoning Part One,Ethan Hunt and his IMF team embark on their mo...,"[[Action, Thriller]]"
2,The Equalizer 3,Robert McCall finds himself at home in Souther...,"[[Action, Thriller]]"
3,Mortal Kombat Legends: Cage Match,"In 1980s Hollywood, action star Johnny Cage is...",[[Action]]
4,Desperation Road,"After 11 years in a Mississippi state prison, ...","[[Action, Drama, Thriller]]"


In [66]:
new_df.to_csv('movies.csv',index=False)