In [2]:
import pyspark as ps    # for the pyspark suite


In [3]:
spark = (ps.sql.SparkSession
         .builder
         .master('local[4]')
         .appName('lecture')
         .getOrCreate()
        )
sc = spark.sparkContext

In [4]:
spark

In [5]:
df_movie_raw = spark.read.csv('data/title.basics.tsv',
                         header=True,       # use headers or not
                         quote='"',         # char for quotes
                         sep="\t",           # char for separation
                         inferSchema=True)  # do we infer schema or not ?

In [6]:
df_movie_raw.show(5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [7]:
df_movie_raw.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: integer (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [8]:
df_movie_raw.createOrReplaceTempView('Movies')


In [10]:
spark.sql("Desc Movies").show()

+--------------+---------+-------+
|      col_name|data_type|comment|
+--------------+---------+-------+
|        tconst|   string|   null|
|     titleType|   string|   null|
|  primaryTitle|   string|   null|
| originalTitle|   string|   null|
|       isAdult|      int|   null|
|     startYear|   string|   null|
|       endYear|   string|   null|
|runtimeMinutes|   string|   null|
|        genres|   string|   null|
+--------------+---------+-------+



In [12]:
spark.sql("Select distinct(m.titleType) from Movies m").show()

+------------+
|   titleType|
+------------+
|    tvSeries|
|tvMiniSeries|
|     tvMovie|
|   tvEpisode|
|       movie|
|   tvSpecial|
|       video|
|   videoGame|
|     tvShort|
|       short|
+------------+



In [13]:
df_ratings_raw = spark.read.csv('data/title.ratings.tsv',
                         header=True,       # use headers or not
                         quote='"',         # char for quotes
                         sep="\t",           # char for separation
                         inferSchema=True)  # do we infer schema or not ?

In [15]:
df_ratings_raw.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: double (nullable = true)
 |-- numVotes: integer (nullable = true)



In [16]:
df_ratings_raw.createOrReplaceTempView('Ratings')

In [17]:
spark.sql("Desc Ratings").show()

+-------------+---------+-------+
|     col_name|data_type|comment|
+-------------+---------+-------+
|       tconst|   string|   null|
|averageRating|   double|   null|
|     numVotes|      int|   null|
+-------------+---------+-------+



In [18]:
spark.sql("Select * from Ratings").show(10)

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.6|    1593|
|tt0000002|          6.0|     195|
|tt0000003|          6.5|    1266|
|tt0000004|          6.1|     121|
|tt0000005|          6.1|    2029|
|tt0000006|          5.1|     111|
|tt0000007|          5.4|     632|
|tt0000008|          5.4|    1741|
|tt0000009|          5.3|      89|
|tt0000010|          6.9|    5746|
+---------+-------------+--------+
only showing top 10 rows



In [21]:
df_principal_crew_raw = spark.read.csv('data/title.principals.tsv',
                         header=True,       # use headers or not
                         quote='"',         # char for quotes
                         sep="\t",           # char for separation
                         inferSchema=True)  # do we infer schema or not ?

In [22]:
df_principal_crew_raw.show(5)

+---------+--------+---------+---------------+--------------------+----------+
|   tconst|ordering|   nconst|       category|                 job|characters|
+---------+--------+---------+---------------+--------------------+----------+
|tt0000001|       1|nm1588970|           self|                  \N|  ["Self"]|
|tt0000001|       2|nm0005690|       director|                  \N|        \N|
|tt0000001|       3|nm0374658|cinematographer|director of photo...|        \N|
|tt0000002|       1|nm0721526|       director|                  \N|        \N|
|tt0000002|       2|nm1335271|       composer|                  \N|        \N|
+---------+--------+---------+---------------+--------------------+----------+
only showing top 5 rows



In [23]:
df_principal_crew_raw.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- ordering: integer (nullable = true)
 |-- nconst: string (nullable = true)
 |-- category: string (nullable = true)
 |-- job: string (nullable = true)
 |-- characters: string (nullable = true)



In [24]:
df_principal_crew_raw.createOrReplaceTempView('Principal_Crew')

In [25]:
spark.sql("Desc Principal_Crew").show()

+----------+---------+-------+
|  col_name|data_type|comment|
+----------+---------+-------+
|    tconst|   string|   null|
|  ordering|      int|   null|
|    nconst|   string|   null|
|  category|   string|   null|
|       job|   string|   null|
|characters|   string|   null|
+----------+---------+-------+



In [29]:
spark.sql("Select distinct(category) from Principal_Crew").show()

+-------------------+
|           category|
+-------------------+
|            actress|
|           producer|
|             writer|
|           composer|
|           director|
|               self|
|              actor|
|             editor|
|    cinematographer|
|      archive_sound|
|production_designer|
|    archive_footage|
+-------------------+



In [30]:
spark.sql("select * from Principal_Crew where tconst = 'tt9916880'").show()

+---------+--------+----------+--------+------------------+--------------------+
|   tconst|ordering|    nconst|category|               job|          characters|
+---------+--------+----------+--------+------------------+--------------------+
|tt9916880|      10| nm2676923| actress|                \N|["Sour Susan","Go...|
|tt9916880|       1| nm1483166|   actor|                \N|["Rude Ralph","Mi...|
|tt9916880|       2| nm0254176| actress|                \N|  ["Moody Margaret"]|
|tt9916880|       3| nm0286175|   actor|                \N|["Dad","Aerobic A...|
|tt9916880|       4|nm10535738| actress|                \N|    ["Horrid Henry"]|
|tt9916880|       5| nm0996406|director|principal director|                  \N|
|tt9916880|       6| nm1482639|  writer|                \N|                  \N|
|tt9916880|       7| nm2586970|  writer|             books|                  \N|
|tt9916880|       8| nm1594058|producer|          producer|                  \N|
|tt9916880|       9| nm10525

In [35]:
spark.sql("Select count(distinct(job)) from Principal_Crew where category = 'writer'").show(50, False)

+-------------------+
|count(DISTINCT job)|
+-------------------+
|30171              |
+-------------------+



In [36]:
spark.sql("Select distinct(job) from Principal_Crew where category = 'writer'").show(200, False)

+-------------------------------------------------------------------------------+
|job                                                                            |
+-------------------------------------------------------------------------------+
|story "Raggedy Ann"                                                            |
|play "Three Bears"                                                             |
|story "Blossom"                                                                |
|story "Danger"                                                                 |
|play "Beverly's Balance"                                                       |
|story "The Wilderness Trail"                                                   |
|play "Guilty"                                                                  |
|story "The Worst Woman in Hollywood"                                           |
|gags                                                                           |
|play "The Criti

In [37]:
df_crew_names = spark.read.csv('data/name.basics.tsv',
                         header=True,       # use headers or not
                         quote='"',         # char for quotes
                         sep="\t",           # char for separation
                         inferSchema=True)  # do we infer schema or not ?

In [39]:
df_crew_names.printSchema()

root
 |-- nconst: string (nullable = true)
 |-- primaryName: string (nullable = true)
 |-- birthYear: string (nullable = true)
 |-- deathYear: string (nullable = true)
 |-- primaryProfession: string (nullable = true)
 |-- knownForTitles: string (nullable = true)



In [40]:
df_crew_names.createOrReplaceTempView('Crew_names')


In [42]:
spark.sql("Desc Crew_names").show()

+-----------------+---------+-------+
|         col_name|data_type|comment|
+-----------------+---------+-------+
|           nconst|   string|   null|
|      primaryName|   string|   null|
|        birthYear|   string|   null|
|        deathYear|   string|   null|
|primaryProfession|   string|   null|
|   knownForTitles|   string|   null|
+-----------------+---------+-------+

