# Data Preparation and Cleaning

### Essential Libraries

In [15]:
# Basic Libraries
import numpy as np
import pandas as pd

# For API usage
import requests as r

# For progress bar
from tqdm import tqdm

### API Key

In [16]:
API_KEY = '7884759e98354ed855458b12c5c2bcdc'

### Helper Functions

In [17]:
def get_id():
    """return a dataframe with movie ids"""

    # Get total pages of Discover section
    resp = r.get(f"https://api.themoviedb.org/3/discover/movie?api_key={API_KEY}&language=en-US&sort_by=popularity.desc&include_adult=True&include_video=false&page=1")
    total_pages = resp.json()['total_pages']

    # Iterating through DB pages and appending id to movie DF
    id = []

    for i in tqdm(range(1, total_pages + 1)):  # need change 2 to total pages + 1
        resp = r.get(f"https://api.themoviedb.org/3/discover/movie?api_key={API_KEY}&language=en-US&sort_by=revenue.desc&include_adult=True&include_video=false&page={i}")
        
        # Select the relevant columns
        try:
            results = resp.json()['results']
            for item in results:
                id.append(item['id'])
        except:
            continue
    
    movie = pd.DataFrame(id, columns=['id'])
    
    # movie = movie.reset_index(drop=True) # idk what this do
    return movie

def get_basic_info(movie_id):
    """
    return of list of following movie info:
    1. title
    2. adult
    3. popularity
    4. budget
    5. revenue
    6. vote_count
    7. vote_average
    8. release_date
    9. release_year
    10. original language
    11. genres
    """

    resp = r.get(f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={API_KEY}&language=en-US")

    basic_info = []
    
    movie = resp.json()
    
    basic_info.append(movie['title'])
    basic_info.append(movie['adult'])
    basic_info.append(movie['popularity'])
    basic_info.append(movie['budget'])
    basic_info.append(movie['revenue'])
    basic_info.append(movie['vote_count'])
    basic_info.append(movie['vote_average'])
    basic_info.append(movie['release_date'])
    basic_info.append(movie['release_date'][0:4])
    basic_info.append(movie['original_language'])
        
        # movie genres
    genres = []
    for genre in movie['genres']:
        genres.append(genre['name'])
    basic_info.append(genres)

    # missing values
    for i in range(len(basic_info)):
        if basic_info[i] == '':
            basic_info[i] = np.nan
    return basic_info

def get_casts(movie_id):
    """returns list of casts from movie id"""
    resp = r.get(f"https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={API_KEY}&language=en-US")
    casts_dict = resp.json()['cast']
    
    casts = []
    
    for item in casts_dict:
        casts.append(item['name'])
    
    if casts:
        return casts
    else:
        return np.nan

def get_keywords(movie_id):
    """returns list of keywords from movie id"""
    resp = r.get(f"https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={API_KEY}")
    keywords_dict = resp.json()['keywords']

    keywords = []

    for item in keywords_dict:
        keywords.append(item['name'])
    
    if keywords:
        return keywords
    else:
        return np.nan

def get_recommendations(movie_id):
    """returns list of recommended movies from movie id"""
    resp = r.get(f"https://api.themoviedb.org/3/movie/{movie_id}/recommendations?api_key={API_KEY}&language=en-US&page=1")
    
    recommendations = []
    
    recommendations_dict = resp.json()['results']

    for item in recommendations_dict:
        recommendations.append(item['title'])

    if recommendations:
        return recommendations
    else:
        return np.nan

def get_similar_movies(movie_id):
    """returns list of similar movies from movie id"""
    resp = r.get(f"https://api.themoviedb.org/3/movie/{movie_id}/similar?api_key={API_KEY}&language=en-US&page=1")
    
    similar_movies = []
    
    similar_movies_dict = resp.json()['results']

    for item in similar_movies_dict:
        similar_movies.append(item['title'])

    if similar_movies:
        return similar_movies
    else:
        return np.nan
    

### Retrieve movie data from TMDB

In [18]:
# movie ids
def get_movie_data():
    movie_df = get_id()

    cols = ['title', 'adult', 'popularity', 'budget', 'revenue', 'vote_count', 'vote_average','release_date', 'release_year', 'original_language', 'genre', 'casts', 'keywords', 'recommendations', 'similar_movies']

    rows = []

    # add relevant information to df for each movie
    for i in tqdm(range(len(movie_df))):  # change to length of movie_df
        movie_id = movie_df['id'][i]
        
        row = []

        # title, adult, popularity, vote_count, vote_avg, release_date, release_yaer, orig_lang, genre
        basic_info = get_basic_info(movie_id)
        row += basic_info

        # movie cast members
        casts = get_casts(movie_id)
        row.append(casts)
        
        # movie keywords
        keywords = get_keywords(movie_id)
        row.append(keywords)

        # movie recommendations
        recommendations = get_recommendations(movie_id)
        row.append(recommendations)

        # similar movies
        similar_movies = get_similar_movies(movie_id)
        row.append(similar_movies)

        rows.append(row)


    stats_df = pd.DataFrame(rows, columns=cols)
    df = pd.concat([movie_df, stats_df], axis = 1)

    return df

### Data cleaning

In [19]:
raw_movie_df = get_movie_data()
cleaned_df = raw_movie_df.copy()

cleaned_df = cleaned_df.dropna()
cleaned_df['release_date'] = pd.to_datetime(cleaned_df['release_date'])
cleaned_df['release_year'] = cleaned_df['release_year'].astype('int64')

cleaned_df.head()

100%|██████████| 41091/41091 [3:16:47<00:00,  3.48it/s]  
100%|██████████| 10000/10000 [4:06:32<00:00,  1.48s/it] 


Unnamed: 0,id,title,adult,popularity,budget,revenue,vote_count,vote_average,release_date,release_year,original_language,genre,casts,keywords,recommendations,similar_movies
1,19995,Avatar,False,432.199,237000000,2920357254,28759,7.569,2009-12-15,2009,en,"[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldaña, Sigourney Weave...","[culture clash, future, space war, space colon...","[Capturing Avatar, Avatar: Creating the World ...","[The Reckless Hour, MicroPlanet 3D, If I Were ..."
2,299534,Avengers: Endgame,False,152.323,356000000,2799439100,22908,8.267,2019-04-24,2019,en,"[Adventure, Science Fiction, Action]","[Robert Downey Jr., Chris Evans, Mark Ruffalo,...","[space travel, time travel, time machine, sequ...","[Avengers: Infinity War, Captain Marvel, Spide...","[Blankman, Santos, The Masters of Time, Scott ..."
4,76600,Avatar: The Way of Water,False,10255.685,460000000,2309660236,6285,7.74,2022-12-14,2022,en,"[Science Fiction, Adventure, Action]","[Sam Worthington, Zoe Saldaña, Sigourney Weave...","[loss of loved one, dying and death, alien lif...","[Capturing Avatar, Avatar: Creating the World ...","[Cyber Ninja, Spenser: A Savage Place, Broken ..."
5,597,Titanic,False,137.265,200000000,2187463944,22650,7.892,1997-11-18,1997,en,"[Drama, Romance]","[Leonardo DiCaprio, Kate Winslet, Billy Zane, ...","[drowning, evacuation, shipwreck, iceberg, for...","[The Lion King, Pirates of the Caribbean: The ...","[The Reckless Hour, The Naked Flame, Bell Witc..."
6,140607,Star Wars: The Force Awakens,False,62.371,245000000,2068223624,17860,7.302,2015-12-15,2015,en,"[Adventure, Action, Science Fiction, Fantasy]","[Harrison Ford, Mark Hamill, Carrie Fisher, Ad...","[android, spacecraft, space opera]","[Star Wars: The Last Jedi, Star Wars: Episode ...","[MicroPlanet 3D, Geography of the Universe, Su..."


In [20]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7486 entries, 1 to 9997
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 7486 non-null   int64         
 1   title              7486 non-null   object        
 2   adult              7486 non-null   bool          
 3   popularity         7486 non-null   float64       
 4   budget             7486 non-null   int64         
 5   revenue            7486 non-null   int64         
 6   vote_count         7486 non-null   int64         
 7   vote_average       7486 non-null   float64       
 8   release_date       7486 non-null   datetime64[ns]
 9   release_year       7486 non-null   int64         
 10  original_language  7486 non-null   object        
 11  genre              7486 non-null   object        
 12  casts              7486 non-null   object        
 13  keywords           7486 non-null   object        
 14  recommen

### Extracting clean dataset to CSV file

In [21]:
cleaned_df.to_csv('cleaned-movie-dataset.csv')