In [25]:
import ast

import pandas as pd
import requests

# Download the most popular movies from TMDB's discover endpoint, one page at
# a time, and dump the raw results to movie_data.csv.
url = "https://api.themoviedb.org/3/discover/movie"

# TMDB documents 500 as the highest valid `page` for this endpoint; the
# previous range(1, 500) stopped one page short of that.
MAX_PAGES = 500
# Seconds to wait for each response; without a timeout a stalled connection
# would block the notebook indefinitely.
REQUEST_TIMEOUT = 10

# The API key is read from a local text file so it stays out of the notebook.
with open("tmdbapikey.txt", "r", encoding="utf-8") as file:
    api_key = file.read().strip()

# Collects the "results" entries of every successfully fetched page.
# Deliberately not named `all`, which would shadow the builtin all().
all_movies = []

# A single session reuses the HTTPS connection across all page requests.
with requests.Session() as session:
    for page in range(1, MAX_PAGES + 1):
        params = {
            'api_key': api_key,
            'include_adult': 'true',
            'include_video': 'false',
            'language': 'en-US',
            'page': page,
            'sort_by': "popularity.desc",
        }
        try:
            r = session.get(url, params=params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Stop paging but keep what was already collected, mirroring the
            # handling of a non-200 response below.
            print(f"Failed to fetch data for page {page}: {exc}")
            break

        if r.status_code != 200:
            print(f"Failed to fetch data for page {page}: {r.status_code}")
            break

        data = r.json()
        # A page without a "results" key contributes nothing.
        all_movies.extend(data.get("results", []))

df = pd.DataFrame(all_movies)

df.to_csv("movie_data.csv", index=False)

# Columns carried forward into the cleaned dataset
keep = [
    'title',
    'adult',
    'genre_ids',
    'id',
    'original_language',
    'popularity',
    'release_date',
    'vote_count',
    'vote_average',
]

# Reload the raw dump from disk and narrow it to those columns in one step
filtered = pd.read_csv("movie_data.csv")[keep]

# Normalize a genre_ids cell: after the CSV round trip the lists come back
# as their string representation and must be parsed into real lists again.
def parse_genre_ids(x):
    """Return the genre IDs held in *x* as a list.

    Args:
        x: A cell of the ``genre_ids`` column. Either a list already, or a
            string holding a list literal such as ``"[878, 28, 12]"``.

    Returns:
        The list itself when *x* is a list, the parsed list when *x* is a
        string containing a list literal, and ``[]`` for anything else
        (NaN, ``None``, malformed text, or a literal that is not a list).
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        return []
    try:
        # literal_eval only accepts Python literals, so text coming from the
        # CSV can never execute code the way eval() would.
        parsed = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []
    # Callers iterate over the result, so reject non-list literals like "5".
    return parsed if isinstance(parsed, list) else []

# Turn every genre_ids cell (a string after the CSV reload) back into a list
raw_genre_cells = filtered['genre_ids']
filtered['genre_ids'] = raw_genre_cells.map(parse_genre_ids)

# Fetch TMDB's genre list and translate each movie's genre IDs into names.
base_url_genres = "https://api.themoviedb.org/3/genre/movie/list"

params_genres = {
    "api_key": api_key,
    "language": "en-US"
}
# The timeout keeps a stalled connection from hanging the notebook forever.
genre_response = requests.get(base_url_genres, params=params_genres, timeout=10)

# Without the genre list the ID -> name lookup below cannot work. Stop here
# with a clear error instead of printing a message and then failing later
# with an unrelated NameError on the undefined genre_mapping.
if genre_response.status_code != 200:
    raise RuntimeError(f"Failed to fetch genre data: {genre_response.status_code}")

genres_data = genre_response.json()
# Lookup table: numeric genre ID -> genre name
genre_mapping = {genre['id']: genre['name'] for genre in genres_data['genres']}

# Convert genre IDs to genre names; an ID missing from the mapping yields None
filtered['genre_names'] = filtered['genre_ids'].apply(
    lambda ids: [genre_mapping.get(genre_id) for genre_id in ids]
)

# Debug: show the first rows so the ID -> name translation can be checked by eye
preview_columns = ['title', 'genre_ids', 'genre_names']
print(filtered[preview_columns].head())

# The primary genre is the first entry of genre_names, or None for an empty list
filtered['primary_genre'] = filtered['genre_names'].map(
    lambda names: names[0] if names else None
)

# The list-valued helper columns are dropped before writing the cleaned file
filtered = filtered.drop(columns=['genre_ids', 'genre_names'])
filtered.to_csv("cleaned_movies.csv", index=False)

# Report every distinct primary genre, ignoring movies that have none
primary_genres = filtered['primary_genre'].dropna()
unique_genres = set(primary_genres.unique())
print("Unique Genres:", unique_genres)

                                    title         genre_ids  \
0                   Venom: The Last Dance     [878, 28, 12]   
1                             Terrifier 3    [27, 53, 9648]   
2  Apocalypse Z: The Beginning of the End      [18, 28, 27]   
3                          The Wild Robot  [16, 878, 10751]   
4                           The Substance     [18, 27, 878]   

                            genre_names  
0  [Science Fiction, Action, Adventure]  
1           [Horror, Thriller, Mystery]  
2               [Drama, Action, Horror]  
3  [Animation, Science Fiction, Family]  
4      [Drama, Horror, Science Fiction]  
Unique Genres: {'Horror', 'Thriller', 'Animation', 'TV Movie', 'War', 'Mystery', 'Science Fiction', 'Crime', 'Action', 'Family', 'Adventure', 'Western', 'History', 'Fantasy', 'Comedy', 'Music', 'Documentary', 'Romance', 'Drama'}
