# Library Requirements

In [1]:
import json
import os

import numpy as np
import pandas as pd
import requests
from tmdbv3api import TMDb, Movie

# Data Load

## Scrape movie data from Wikipedia

In [2]:
year_list = [2018, 2019, 2020, 2021]

# Index of the first table to load from each year's page (manual check done).
# Years not listed here start one table later.
FIRST_TABLE_INDEX = {2018: 2, 2019: 2, 2020: 2}
DEFAULT_FIRST_TABLE_INDEX = 3
TABLES_PER_YEAR = 4

# Tables are collected in a list and combined with pd.concat because
# DataFrame.append was removed in pandas 2.0.
frames = []
movie_df = pd.DataFrame()

# Loop through years and fetch data tables
for year in year_list:
    print(f"File : Loading {year}")
    website_link = f"https://en.wikipedia.org/wiki/List_of_American_films_of_{year}"

    # Download and parse the page once per year rather than once per table
    tables = pd.read_html(website_link, header=0)

    first = FIRST_TABLE_INDEX.get(year, DEFAULT_FIRST_TABLE_INDEX)
    for i in range(first, first + TABLES_PER_YEAR):
        frames.append(tables[i])
        # Rebuilt after every table so the progress line can report the
        # running shape; the number of tables is small, so this is cheap
        movie_df = pd.concat(frames, ignore_index=True)
        print(
            f"File : Loading {year} -> Adding Tables : Shape -> {movie_df.shape} !!!")
    print(f"File : {year} Successfully added to MASTER DATAFRAME")
    print("------------------------------------------------------------")

# Status Update
print("Master Dataframe -> 'Movie_df' is ready for further processing !!!")

File : Loading 2018
File : Loading 2018 -> Adding Tables : Shape -> (74, 6) !!!
File : Loading 2018 -> Adding Tables : Shape -> (137, 7) !!!
File : Loading 2018 -> Adding Tables : Shape -> (206, 7) !!!
File : Loading 2018 -> Adding Tables : Shape -> (272, 7) !!!
File : 2018 Successfully added to MASTER DATAFRAME
------------------------------------------------------------
File : Loading 2019
File : Loading 2019 -> Adding Tables : Shape -> (321, 7) !!!
File : Loading 2019 -> Adding Tables : Shape -> (388, 7) !!!
File : Loading 2019 -> Adding Tables : Shape -> (444, 7) !!!
File : Loading 2019 -> Adding Tables : Shape -> (514, 7) !!!
File : 2019 Successfully added to MASTER DATAFRAME
------------------------------------------------------------
File : Loading 2020
File : Loading 2020 -> Adding Tables : Shape -> (571, 7) !!!
File : Loading 2020 -> Adding Tables : Shape -> (616, 7) !!!
File : Loading 2020 -> Adding Tables : Shape -> (688, 7) !!!
File : Loading 2020 -> Adding Tables : Shape -

# Data Processing

In [6]:
def get_directors(x):
    """Extract the director name(s) from a "Cast and crew" string.

    The text before the first recognised director marker is returned. The
    markers are tried in order: " (director)", " (directors)",
    " (director/screenplay)" and " (directors/screenplay)".

    Args:
        x (str): Cast-and-crew text, e.g.
            "Jane Doe (director); John Roe (screenplay); Actor A, Actor B".

    Returns:
        str: Everything before the first marker found, or ``x`` unchanged
        when it contains none of the markers.
    """
    # " (director)" stays first so inputs that already worked split at the
    # same position as before; the plural markers used to fall through to a
    # split on " (director)" that never matched and returned the full string.
    markers = (
        " (director)",
        " (directors)",
        " (director/screenplay)",
        " (directors/screenplay)",
    )
    for marker in markers:
        if marker in x:
            return x.split(marker)[0]
    return x


def get_actors(x):
    """Split the trailing part of a "Cast and crew" string into names.

    Args:
        x (str): Cast-and-crew text in which ";" separates the sections.

    Returns:
        list[str]: The text after the last ";" (or all of ``x`` when there is
        no ";"), split on ",". Surrounding whitespace is left on each entry.
    """
    # rpartition yields ("", "", x) when there is no ";", so the last item is
    # always the segment to split
    _, _, trailing_section = x.rpartition(";")
    return trailing_section.split(",")


def get_genres(x):
    """Look up a title on TMDB and return the genres of the first search hit.

    Args:
        x (str): Movie title passed to the TMDB search.

    Returns:
        str | float: The genre names joined by single spaces, or ``np.nan``
        when the search has no results or the movie details list no genres.
    """
    # Prefer a key supplied through the TMDB_API_KEY environment variable so
    # it does not have to be written into the notebook; the placeholder is
    # kept as the fallback.
    api_key = os.environ.get('TMDB_API_KEY', '---YOUR API KEY---')

    tmdb = TMDb()
    movie_TMDB = Movie()
    tmdb.api_key = api_key

    result = movie_TMDB.search(x)
    if not result:
        return np.nan

    # Only the first search hit is used
    movie_id = result[0].id
    response = requests.get(
        f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}")
    json_data = response.json()

    # A payload without a "genres" entry is treated like an empty genre list
    genres = [genre['name'] for genre in json_data.get('genres') or []]
    if not genres:
        # This branch previously evaluated np.NaN without returning it, so
        # the function silently returned None
        return np.nan
    return " ".join(genres)


def clean_data(df):
    """Build the cleaned movie table from the scraped tables.

    Steps: drop scraping-artefact columns, derive ``director_name`` and
    ``actor_1_name``..``actor_3_name`` from "Cast and crew", look up
    ``genres`` for every "Title" via ``get_genres``, rename "Title" to
    ``movie_title`` (lower-cased), drop rows with any missing value, strip
    whitespace from every column and add the combined text column ``comb``.

    Args:
        df (pandas.DataFrame): Scraped data with at least the columns
            "Title", "Cast and crew" and "Production company".

    Returns:
        pandas.DataFrame: A new frame; the frame passed in is not modified.
    """
    # Drop unnecessary columns. Which of these artefact columns exist depends
    # on the scraped pages, so the ones that are absent are skipped.
    df = df.drop(columns=['Opening', 'Opening.1',
                 '.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.', 'Ref.'],
                 errors='ignore')

    # Cast to str once so both helpers also cope with missing (NaN) cells
    cast_and_crew = df['Cast and crew'].map(str)

    # get directors name
    df['director_name'] = cast_and_crew.map(get_directors)
    print("Director column status : done !!!")

    # get actor names: keep the first three entries; reindex guarantees the
    # three columns exist even when every row lists fewer names
    actor_df = pd.DataFrame(
        cast_and_crew.map(get_actors).tolist(), index=df.index)
    actor_df = actor_df.reindex(columns=range(3)).rename(
        columns={0: 'actor_1_name', 1: 'actor_2_name', 2: 'actor_3_name'})
    df = df.join(actor_df)
    print("Actor columns status : done !!!")

    # get genres (one TMDB lookup per title, which is the slow part)
    df['genres'] = df['Title'].map(get_genres)
    print("Genres column status : done !!!")

    # drop unnecessary columns
    df = df.drop(columns=['Production company', 'Cast and crew'])

    # change columns names according to master table
    df = df.rename(columns={'Title': 'movie_title'})
    df['movie_title'] = df['movie_title'].str.lower()

    # drop any null values; copy so the assignments below write to an
    # independent frame instead of a possible view
    df = df.dropna(how='any').copy()

    # stripping all the values to maintain consistency
    for col in df.columns:
        if not df[col].isnull().all():
            df[col] = df[col].str.strip()

    # Combination of columns
    df['comb'] = df['actor_1_name'] + ' ' + df['actor_2_name'] + ' ' + \
        df['actor_3_name'] + ' ' + df['director_name'] + ' ' + df['genres']

    print("!!!-----Successfully cleaned dataframe------!!!")

    return df

In [7]:
# Build the cleaned dataframe from the scraped tables.
# Slow: the cleaning step performs a TMDB lookup for every title.
cleaned_df = movie_df.pipe(clean_data)

Director column status : done !!!
Actor columns status : done !!!
Genres column status : done !!!
!!!-----Successfully cleaned dataframe------!!!


# Data Export

In [110]:
# to_csv does not create missing folders, so make sure the target exists
os.makedirs("MyData", exist_ok=True)
cleaned_df.to_csv("MyData/movie_data_18192021.csv", index=False)