In [1]:
import os
import configparser
import numpy as np
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import udf, col, monotonically_increasing_id
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, dayofweek

In [2]:
config = configparser.ConfigParser()
config.read('aws.cfg')

os.environ['AWS_ACCESS_KEY_ID'] = config.get('KEYS', 'AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = config.get('KEYS', 'AWS_SECRET_ACCESS_KEY')
input_dir = config.get('FILES', 'INPUT_DIR')
output_dir = config.get('FILES', 'OUTPUT_DIR')

imdb_input = input_dir + "/imdb"
tmdb_input = input_dir + "/tmdb"

tmdb_input

's3a://udacity-dend-imdb-project/input_data/tmdb'

In [3]:
spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

In [4]:
#Basic title

imdb_titles_df = spark.read.option("header", "true").csv(imdb_input + '/title.basics.tsv.gz', sep='\t')
imdb_titles_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [10]:
imdb_titles_df.select('titleType').distinct().collect()

[Row(titleType='tvSeries'),
 Row(titleType='tvMiniSeries'),
 Row(titleType='tvMovie'),
 Row(titleType='tvPilot'),
 Row(titleType='tvEpisode'),
 Row(titleType='movie'),
 Row(titleType='tvSpecial'),
 Row(titleType='video'),
 Row(titleType='videoGame'),
 Row(titleType='tvShort'),
 Row(titleType='short')]

In [5]:
imdb_movie_titles_df = imdb_titles_df.select(
    col("tconst").alias("title_id"), 
    col("titleType").alias("title_type"),
    col("primaryTitle").alias("primary_title"),
    col("originalTitle").alias("original_title"),
    col("isAdult").alias("is_adult"),
    col("startYear").alias("start_year"),
    col("endYear").alias("end_year")
) \
.filter((imdb_titles_df.startYear == '2022') & (imdb_titles_df.titleType == 'movie'))

imdb_movie_titles_df.printSchema()

root
 |-- title_id: string (nullable = true)
 |-- title_type: string (nullable = true)
 |-- primary_title: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- is_adult: string (nullable = true)
 |-- start_year: string (nullable = true)
 |-- end_year: string (nullable = true)



In [7]:
imdb_titles_df.write.partitionBy("startYear").format("parquet").save(output_dir + '/imdb_titles', mode="overwrite")

In [6]:
#Title akas
imdb_title_akas_df = spark.read.option("header", "true").csv(imdb_input + '/title.akas.tsv.gz', sep='\t')
imdb_title_akas_df.printSchema()

root
 |-- titleId: string (nullable = true)
 |-- ordering: string (nullable = true)
 |-- title: string (nullable = true)
 |-- region: string (nullable = true)
 |-- language: string (nullable = true)
 |-- types: string (nullable = true)
 |-- attributes: string (nullable = true)
 |-- isOriginalTitle: string (nullable = true)



In [7]:
tmdb_movies_df = spark.read.json(tmdb_input+'/movies/movie_100.json', multiLine=True)
tmdb_movies_df.printSchema()

root
 |-- adult: boolean (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- logo_path: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- origin_country: string (nullable = true)
 |-- production_countries

In [8]:
#IMDB movies_details query
movies_details = imdb_titles_df.join(imdb_title_akas_df, imdb_titles_df.tconst == imdb_title_akas_df.titleId) \
.join(tmdb_movies_df, imdb_titles_df.tconst == tmdb_movies_df.imdb_id, "left") \
.select(
    col("titleId").alias("imdb_title_id"), 
    imdb_title_akas_df.title,
    col("originalTitle").alias("original_title"),
    imdb_titles_df.genres,
    "language",
    "region",
    tmdb_movies_df.overview,
    col("startYear").alias("start_year"), 
    col("runtimeMinutes").alias("runtime"), 
    tmdb_movies_df.release_date
) \
.filter((imdb_titles_df.titleType == 'movie') & (imdb_titles_df.startYear == '2022'))

movies_details.printSchema()

root
 |-- imdb_title_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- language: string (nullable = true)
 |-- region: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- start_year: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- release_date: string (nullable = true)



In [11]:
movies_details.count()

25170

In [9]:
# movies_details table parquet
movies_details.write.partitionBy("language", "start_year").format("parquet").save(output_dir + '/movies_details', mode="overwrite")

In [6]:
#IMDB ratings DF
imdb_ratings_df = spark.read.option("header", "true").csv(imdb_input + '/title.ratings.tsv.gz', sep='\t')
imdb_ratings_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)



In [10]:
movies_details = spark.read.parquet(output_dir + '/movies_details')

In [None]:
#IMDB Ratings query
movies_ratings = imdb_ratings_df.join(movies_details, imdb_ratings_df.tconst == movies_details.m) \
.join(tmdb_movies_df,imdb_ratings_df.tconst == tmdb_movies_df.imdb_id, "left") \
.select("imdb_title_id", "title", "language", "start_year",
    col("numVotes").alias("imdb_total_votes"), 
    col("averageRating").alias("imdb_avg_rating")),
    col("vote_count").alias("tmdb_total_votes"),
    col("vote_average").alias("tmdb_avg_rating"), 
.filter(movies_details.startYear == '2022')

movies_ratings.printSchema()

In [21]:
#IMDB Ratings table write parquet
movies_ratings.write.partitionBy("language", "startYear").format("parquet").save(imdb_output + '/movies_ratings', mode="overwrite")

In [7]:
imdb_names_df = spark.read.option("header", "true").csv(imdb_input + '/name.basics.tsv.gz', sep='\t')
imdb_names_df.printSchema()

root
 |-- nconst: string (nullable = true)
 |-- primaryName: string (nullable = true)
 |-- birthYear: string (nullable = true)
 |-- deathYear: string (nullable = true)
 |-- primaryProfession: string (nullable = true)
 |-- knownForTitles: string (nullable = true)



In [15]:
moviesTitleIds = movies_details.select("imdb_title_id").rdd.map(lambda row: row(0)).collect()

In [25]:
sampleIds=moviesTitleIds[0:2]

In [7]:
imdb_names_df.filter('birthYear < 1922').count()

97834

In [17]:
imdb_principals_df = spark.read.option("header", "true").csv(imdb_input + '/title.principals.tsv.gz', sep='\t')
imdb_principals_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- ordering: string (nullable = true)
 |-- nconst: string (nullable = true)
 |-- category: string (nullable = true)
 |-- job: string (nullable = true)
 |-- characters: string (nullable = true)



In [16]:
imdb_principals_df.limit(10).collect()

[Row(tconst='tt0000001', ordering='1', nconst='nm1588970', category='self', job='\\N', characters='["Self"]'),
 Row(tconst='tt0000001', ordering='2', nconst='nm0005690', category='director', job='\\N', characters='\\N'),
 Row(tconst='tt0000001', ordering='3', nconst='nm0374658', category='cinematographer', job='director of photography', characters='\\N'),
 Row(tconst='tt0000002', ordering='1', nconst='nm0721526', category='director', job='\\N', characters='\\N'),
 Row(tconst='tt0000002', ordering='2', nconst='nm1335271', category='composer', job='\\N', characters='\\N'),
 Row(tconst='tt0000003', ordering='1', nconst='nm0721526', category='director', job='\\N', characters='\\N'),
 Row(tconst='tt0000003', ordering='2', nconst='nm1770680', category='producer', job='producer', characters='\\N'),
 Row(tconst='tt0000003', ordering='3', nconst='nm1335271', category='composer', job='\\N', characters='\\N'),
 Row(tconst='tt0000003', ordering='4', nconst='nm5442200', category='editor', job='\\N'

In [21]:
sampleIds = ['tt0000001', 'tt0000002', 'tt0000003']

In [26]:
imdb_principal_crew = imdb_principals_df.select(
    col("tconst").alias("title_id"),
    "ordering",
    col("nconst").alias("name_id"),
    "category",
    "job",
    "characters"
).filter(imdb_principals_df['tconst'].isin(sampleIds))

Py4JJavaError: An error occurred while calling z:org.apache.spark.sql.functions.lit.
: java.lang.RuntimeException: Unsupported literal type class java.util.ArrayList [0]
	at org.apache.spark.sql.catalyst.expressions.Literal$.apply(literals.scala:78)
	at org.apache.spark.sql.catalyst.expressions.Literal$$anonfun$create$2.apply(literals.scala:164)
	at org.apache.spark.sql.catalyst.expressions.Literal$$anonfun$create$2.apply(literals.scala:164)
	at scala.util.Try.getOrElse(Try.scala:79)
	at org.apache.spark.sql.catalyst.expressions.Literal$.create(literals.scala:163)
	at org.apache.spark.sql.functions$.typedLit(functions.scala:127)
	at org.apache.spark.sql.functions$.lit(functions.scala:110)
	at org.apache.spark.sql.functions.lit(functions.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [23]:
imdb_principal_crew.show()

+---------+--------+---------+---------------+--------------------+----------+
| title_id|ordering|  name_id|       category|                 job|characters|
+---------+--------+---------+---------------+--------------------+----------+
|tt0000001|       1|nm1588970|           self|                  \N|  ["Self"]|
|tt0000001|       2|nm0005690|       director|                  \N|        \N|
|tt0000001|       3|nm0374658|cinematographer|director of photo...|        \N|
|tt0000002|       1|nm0721526|       director|                  \N|        \N|
|tt0000002|       2|nm1335271|       composer|                  \N|        \N|
|tt0000003|       1|nm0721526|       director|                  \N|        \N|
|tt0000003|       2|nm1770680|       producer|            producer|        \N|
|tt0000003|       3|nm1335271|       composer|                  \N|        \N|
|tt0000003|       4|nm5442200|         editor|                  \N|        \N|
+---------+--------+---------+---------------+------

In [None]:
imdb_principal_crew.write.partitionBy("title_id").format("parquet").save(output_dir + '/movies_principal_crew', mode="overwrite")

In [7]:
#Movie finance
movies_finances = tmdb_movies_df.join(movies_details, tmdb_movies_df.imdb_id == movies_details.titleId) \
.select(col("imdb_id"), movies_details.title, movies_details.language, movies_details.startYear, col("revenue"), col("budget")) \
.filter(movies_details.startYear == '2022')

movies_finances.printSchema()
movies_finances.write.partitionBy("language", "startYear").format("parquet").save(output_dir + '/movies_finances', mode="overwrite")

root
 |-- imdb_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language: string (nullable = true)
 |-- startYear: integer (nullable = true)
 |-- revenue: long (nullable = true)
 |-- budget: long (nullable = true)

