In [2]:
# import necessary libraries
import ast

import pandas as pd
import requests

# select base url with endpoint discover/movie
url = "https://api.themoviedb.org/3/discover/movie"

# read the API key from the selected untracked file
with open("tmdbapikey.txt", "r") as file:
    api_key = file.read().strip()

# create an empty list for all the movies
# (named all_movies so the builtin all() is not shadowed for the rest of the session)
all_movies = []

# iterate through multiple pages to gather data
# NOTE(review): range(1, 500) requests pages 1..499 — confirm whether page 500
# was meant to be included.
for page in range(1, 500):
    params = {
        'api_key': api_key,
        'include_adult': 'true',
        'include_video': 'false',
        'language': 'en-US',
        'page': page,
        'sort_by': "popularity.desc"
    }
    try:
        # a timeout keeps one stalled connection from hanging the whole run
        r = requests.get(url, params=params, timeout=30)
    except requests.RequestException as exc:
        # treat network errors like HTTP errors: report, stop paging,
        # and still save whatever was collected so far
        print(f"Failed to fetch data for page {page}: {exc}")
        break
    if r.status_code != 200:
        print(f"Failed to fetch data for page {page}: {r.status_code}")
        break
    # a page without a "results" key contributes nothing
    all_movies.extend(r.json().get("results", []))

# make a pandas DataFrame from the list of movies created above
df = pd.DataFrame(all_movies)

# save as a csv
df.to_csv("movie_data.csv", index=False)

# data cleaning

# columns that survive the cleaning step, in the order they should appear
keep = [
    'title',
    'adult',
    'genre_ids',
    'id',
    'original_language',
    'popularity',
    'release_date',
    'vote_count',
    'vote_average',
]

# reload the saved movie data from disk and narrow it to those columns
filtered = pd.read_csv("movie_data.csv")[keep]

# check if genre_ids is already a list and fix if necessary
def parse_genre_ids(x):
    """Return the genre ids held in *x* as a list.

    Args:
        x: A cell value from the ``genre_ids`` column. Either a list already,
            or the string form of a list (e.g. ``"[878, 28, 12]"``) as
            produced by a CSV round trip.

    Returns:
        The parsed list of ids, or an empty list when *x* is neither a list
        nor a string holding a list/tuple literal (including NaN/None and
        malformed text).
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        return []
    try:
        # literal_eval only accepts Python literals, so text read from the
        # CSV can never execute code the way eval() would.
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    # Callers iterate over the result, so anything that is not a sequence
    # literal (e.g. a bare number) is treated as "no genres".
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

# convert genre_ids from a string to list of integers
filtered['genre_ids'] = filtered['genre_ids'].apply(parse_genre_ids)

# select url for fetching genre names
base_url_genres = "https://api.themoviedb.org/3/genre/movie/list"

# fetch the genre data
params_genres = {
    "api_key": api_key,
    "language": "en-US"
}
# a timeout keeps a stalled connection from hanging the run indefinitely
genre_response = requests.get(base_url_genres, params=params_genres, timeout=30)

# check if the genre request was successful
# Every step below needs the id -> name mapping, so fail here with a clear
# message instead of hitting a NameError on genre_mapping further down.
if genre_response.status_code != 200:
    raise RuntimeError(f"Failed to fetch genre data: {genre_response.status_code}")

genres_data = genre_response.json()
genre_mapping = {genre['id']: genre['name'] for genre in genres_data['genres']}

# convert genre_ids to genre_names (ids missing from the mapping become None)
filtered['genre_names'] = filtered['genre_ids'].apply(lambda ids: [genre_mapping.get(genre_id) for genre_id in ids])

# print the first few genre mappings to verify
print(filtered[['title', 'genre_ids', 'genre_names']].head())

# extract the primary genre (first genre in the list); None when the list is empty
filtered['primary_genre'] = filtered['genre_names'].apply(lambda x: x[0] if x else None)

# remove genre_ids and genre_names columns before saving
filtered = filtered.drop(['genre_ids', 'genre_names'], axis=1)

# remove rows with any missing values (this also drops rows whose primary_genre is None)
filtered_cleaned = filtered.dropna()

# Save the cleaned DataFrame to a CSV file
filtered_cleaned.to_csv("cleaned_movies.csv", index=False)

                                    title         genre_ids  \
0                   Venom: The Last Dance     [878, 28, 12]   
1                             Terrifier 3    [27, 53, 9648]   
2  Apocalypse Z: The Beginning of the End      [18, 28, 27]   
3                          The Wild Robot  [16, 878, 10751]   
4                           The Substance     [18, 27, 878]   

                            genre_names  
0  [Science Fiction, Action, Adventure]  
1           [Horror, Thriller, Mystery]  
2               [Drama, Action, Horror]  
3  [Animation, Science Fiction, Family]  
4      [Drama, Horror, Science Fiction]  
