In [1]:
import ast
import pandas as pd

In [2]:
# Folder holding the raw CSV inputs. File names are appended with "+" in the
# cells below, so the trailing slash is required.
DATA_DIRECTORY = "../data/raw/"
# Folder the processed Parquet outputs are written to (same trailing-slash
# convention as DATA_DIRECTORY).
TO_DIRECTORY = "../data/processed/"

In [3]:
def show(data):
    """Print the shape of ``data`` and render its first three rows.

    ``data`` must expose ``.shape`` and ``.head()``. The preview is rendered
    with ``display``, which the notebook session provides.
    """
    dimensions = data.shape
    print(dimensions)
    preview = data.head(3)
    display(preview)

In [4]:
# Load the TMDB genres CSV, preview it, then store it unchanged as Parquet.
genres_csv_path = DATA_DIRECTORY + "tmdb/genres.csv"
genres = pd.read_csv(genres_csv_path)
show(genres)

genres_parquet_path = TO_DIRECTORY + "genres.parquet"
genres.to_parquet(genres_parquet_path)

(19, 2)


Unnamed: 0,id,name
0,28,Action
1,12,Adventure
2,16,Animation


In [5]:
# Load the TMDB providers CSV, preview it, then store it unchanged as Parquet.
providers_csv_path = DATA_DIRECTORY + "tmdb/providers.csv"
providers = pd.read_csv(providers_csv_path)
show(providers)

providers_parquet_path = TO_DIRECTORY + "providers.parquet"
providers.to_parquet(providers_parquet_path)

(62, 3)


Unnamed: 0,logo_path,provider_name,provider_id
0,/7rwgEs15tFwyR9NPQ5vpzxTj19Q.jpg,Disney Plus,337
1,/emthp39XA2YScoYL1p0sdbAH2WA.jpg,Amazon Prime Video,119
2,/t2yyOv40HZeVlLjYsCsPHnWLk4W.jpg,Netflix,8


In [6]:
# Raw TMDB top-rated movies table; previewed here and transformed in later cells.
top_rated_csv_path = DATA_DIRECTORY + "tmdb/top_rated_movies.csv"
top_rated_movies = pd.read_csv(top_rated_csv_path)
show(top_rated_movies)

(9025, 14)


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/rSPw7tgCH9c6NqICZef4kZjFOQ5.jpg,"[18, 80]",238,en,The Godfather,"Em 1945, Don Corleone é o chefe de uma mafiosa...",130.08,/oJagOzBu9Rdd9BrciseCm3U3MCU.jpg,1972-07-07,O Poderoso Chefão,False,8.708,19178
1,False,/kXfqcdQKsToO0OUXHcrrNCHDBzO.jpg,"[18, 80]",278,en,The Shawshank Redemption,"Em 1946, Andy Dufresne, um banqueiro jovem e b...",117.083,/umX3lBhHoTV7Lsci140Yr8VpXyN.jpg,1995-03-17,Um Sonho de Liberdade,False,8.707,25162
2,False,/kGzFbGhp99zva6oZODW5atUtnqi.jpg,"[18, 80]",240,en,The Godfather Part II,"Após a máfia matar sua família, o jovem Vito f...",74.196,/7g6wvsWHxBQujUcSXvZLhdFpDUy.jpg,1975-02-14,O Poderoso Chefão: Parte II,False,8.589,11562


In [7]:
# Extra per-movie TMDB fields; the final merge takes imdb_id and runtime from here.
additional_info_csv_path = DATA_DIRECTORY + "tmdb/additional_info.csv"
additional_info = pd.read_csv(additional_info_csv_path)
show(additional_info)

(9025, 7)


Unnamed: 0,id,budget,revenue,imdb_id,runtime,tagline,production_countries
0,238,6000000,245066411,tt0068646,175,An offer you can't refuse.,US
1,278,25000000,28341469,tt0111161,142,Fear can hold you prisoner. Hope can set you f...,US
2,240,13000000,102600000,tt0071562,202,,US


In [8]:
# OMDb records; the final merge matches them on imdbID to pick up the Actors column.
omdb_csv_path = DATA_DIRECTORY + "omdb/data.csv"
omdb_data = pd.read_csv(omdb_csv_path)
show(omdb_data)

(7815, 26)


Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,Time
0,The Immigrant,2013,R,23 May 2014,120 min,"Drama, Romance",James Gray,"James Gray, Ric Menello","Marion Cotillard, Joaquin Phoenix, Jeremy Renner",1921. In search of a new start and the America...,...,6.6,33296,tt1951181,movie,20 Feb 2015,"$2,025,328",,,True,2023-12-25 10:00:28.706861
1,Happy Death Day,2017,PG-13,13 Oct 2017,96 min,"Comedy, Horror, Mystery",Christopher Landon,Scott Lobdell,"Jessica Rothe, Israel Broussard, Ruby Modine","A teenage girl, trying to enjoy her birthday, ...",...,6.6,158561,tt5308322,movie,02 Jan 2018,"$55,683,845",,,True,2023-12-25 10:00:29.119556
2,Persona,1966,Not Rated,16 Mar 1967,83 min,"Drama, Thriller",Ingmar Bergman,Ingmar Bergman,"Bibi Andersson, Liv Ullmann, Margaretha Krook","A young nurse, Alma, is put in charge of Elisa...",...,8.1,128047,tt0060827,movie,11 Mar 2017,,,,True,2023-12-25 10:00:29.519696


In [9]:
# One row per (movie id, transaction type, provider id), plus the movie's TMDB link.
movie_providers_csv_path = DATA_DIRECTORY + "tmdb/movie_providers.csv"
movies_providers = pd.read_csv(movie_providers_csv_path)
show(movies_providers)

(34403, 4)


Unnamed: 0,id,link,transaction_type,provider_id
0,238,https://www.themoviedb.org/movie/238-the-godfa...,rent,167
1,238,https://www.themoviedb.org/movie/238-the-godfa...,rent,2
2,238,https://www.themoviedb.org/movie/238-the-godfa...,rent,10


In [10]:
def _genre_ids_to_text(raw):
    """Turn a list literal such as ``"[18, 80]"`` into the text ``"18, 80"``.

    Anything that is not a string starting with ``[`` is returned untouched.
    That covers missing values and text this cell has already converted, so
    re-running the cell does not raise from ``ast.literal_eval``.
    """
    if not isinstance(raw, str) or not raw.lstrip().startswith("["):
        return raw
    return ", ".join(map(str, ast.literal_eval(raw)))


# The release year is the first four characters of the release_date string.
top_rated_movies["year"] = top_rated_movies["release_date"].str[:4]
# genre_ids is overwritten in place, hence the re-run guard in the helper.
top_rated_movies["genre_ids"] = top_rated_movies["genre_ids"].apply(_genre_ids_to_text)

In [11]:
# Build one row per movie id with one column per transaction type. Each cell
# is a comma-separated string of the provider ids for that transaction type.
# The three columns are cast to str first so the ids can be joined as text.
movies_providers_pivoted = pd.pivot_table(
    movies_providers[["id", "transaction_type", "provider_id"]].astype(str),
    values="provider_id",
    index="id",
    columns="transaction_type",
    aggfunc=lambda provider_ids: ", ".join(provider_ids),
).reset_index()
# Remove the "transaction_type" label the pivot leaves on the columns axis.
movies_providers_pivoted.columns.name = None

# reindex (instead of plain column selection) keeps the output schema stable:
# a transaction type absent from the raw data becomes an all-NaN column rather
# than raising KeyError. When all four types exist the result is unchanged.
movies_providers_processed = movies_providers_pivoted.reindex(
    columns=["id", "buy", "flatrate", "free", "rent"]
).copy()
# "id" was stringified for the pivot; restore the integer key for the later merge on "id".
movies_providers_processed["id"] = movies_providers_processed["id"].astype(int)

In [12]:
# One TMDB link per movie id (duplicates across transaction rows are removed).
links_per_movie = movies_providers[["id", "link"]].drop_duplicates()

# Left joins keep every top-rated movie and add, in order: imdb_id/runtime,
# the OMDb actors (matched on the IMDb id), the TMDB link, and the pivoted
# provider columns. The two IMDb key columns are only needed for the join.
# NOTE(review): the row count stays at one per movie only while the
# right-hand keys are unique — confirm for new data.
data = (
    top_rated_movies[["id", "genre_ids", "title", "overview", "poster_path", "year"]]
    .merge(additional_info[["id", "imdb_id", "runtime"]], on="id", how="left")
    .merge(omdb_data[["imdbID", "Actors"]], left_on="imdb_id", right_on="imdbID", how="left")
    .merge(links_per_movie, on="id", how="left")
    .merge(movies_providers_processed, on="id", how="left")
    .drop(columns=["imdb_id", "imdbID"])
    .rename(columns={"Actors": "actors"})
)

show(data)

app_data_path = TO_DIRECTORY + "app_data.parquet"
data.to_parquet(app_data_path)

(9025, 13)


Unnamed: 0,id,genre_ids,title,overview,poster_path,year,runtime,actors,link,buy,flatrate,free,rent
0,238,"18, 80",O Poderoso Chefão,"Em 1945, Don Corleone é o chefe de uma mafiosa...",/oJagOzBu9Rdd9BrciseCm3U3MCU.jpg,1972,175,"Marlon Brando, Al Pacino, James Caan",https://www.themoviedb.org/movie/238-the-godfa...,"2, 10, 3, 68","619, 531, 499, 1853",,"167, 2, 10, 3, 68"
1,278,"18, 80",Um Sonho de Liberdade,"Em 1946, Andy Dufresne, um banqueiro jovem e b...",/umX3lBhHoTV7Lsci140Yr8VpXyN.jpg,1995,142,"Tim Robbins, Morgan Freeman, Bob Gunton",https://www.themoviedb.org/movie/278-the-shaws...,"2, 10, 3","384, 484",,"2, 10, 3"
2,240,"18, 80",O Poderoso Chefão: Parte II,"Após a máfia matar sua família, o jovem Vito f...",/7g6wvsWHxBQujUcSXvZLhdFpDUy.jpg,1975,202,"Al Pacino, Robert De Niro, Robert Duvall",https://www.themoviedb.org/movie/240-the-godfa...,"2, 10, 3, 68","619, 307",,"167, 2, 10, 3, 68"
