In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local modules are picked up
# without restarting the kernel.
# Docs: https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [None]:
import findspark

findspark.init()

import pyspark # only run after findspark.init()
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import * 
from sqlalchemy import create_engine
from sqlalchemy import text

import myutils

In [19]:
def mysql_engine(url='mysql+pymysql://imdb:imdb@localhost:3306/imdb'):
    """Create a SQLAlchemy engine for the IMDB MySQL database.

    Args:
        url: SQLAlchemy database URL. The default is the local ``imdb``
            database URL that used to be hard-coded in the body, so calling
            ``mysql_engine()`` with no arguments behaves as before. Pass a
            different URL to target another host or to avoid embedding
            credentials in the notebook.

    Returns:
        The engine object returned by ``create_engine(url)``.
    """
    return create_engine(url)

def get_spark():
    """Return a SparkSession configured for this notebook.

    The configuration names the application ``jupyter``, points at the
    master ``spark://0.0.0.0:7077``, requests the MySQL connector package and
    sets the executor memory to ``4g``. ``getOrCreate()`` is used, so an
    already-running session is reused instead of a new one being built.
    """
    spark_conf = SparkConf()
    spark_conf.setAppName('jupyter')
    spark_conf.setMaster('spark://0.0.0.0:7077')
    # Package coordinates for the JDBC driver used to read the MySQL tables.
    spark_conf.set('spark.jars.packages', 'mysql:mysql-connector-java:8.0.15')
    spark_conf.set('spark.executor.memory', '4g')

    return SparkSession.builder.config(conf=spark_conf).getOrCreate()

# Session shared by the Spark cells below.
spark = get_spark()

# JDBC connection settings for the local MySQL `imdb` database.
url = "jdbc:mysql://localhost:3306/imdb"
properties = {
    # Connector/J 8.x driver class. The legacy name 'com.mysql.jdbc.Driver'
    # is deprecated in the 8.0.15 connector that get_spark() requests.
    'driver': 'com.mysql.cj.jdbc.Driver',
    'user': "imdb",
    'password': "imdb",
}

# The three IMDB tables used by the comparison, read through JDBC.
name_basics = spark.read.jdbc(url=url, table="name_basics", properties=properties)
title_basics = spark.read.jdbc(url=url, table="title_basics", properties=properties)
title_principals = spark.read.jdbc(url=url, table="title_principals", properties=properties)

In [7]:
def avg_age_direct(engine):
    """Compute the average actor/actress age per year directly in MySQL.

    Joins ``title_principals``, ``title_basics`` and ``name_basics``, keeps
    rows whose category is ``actor`` or ``actress`` and whose birth year and
    start year are both non-null, and averages ``startYear - birthYear`` per
    ``(startYear, category)``.

    Args:
        engine: Object whose ``connect()`` returns a connection usable as a
            context manager (as a SQLAlchemy engine's does).

    Returns:
        The list of rows produced by ``fetchall()``, ordered by start year
        and category. Earlier versions discarded the rows and returned
        ``None``; callers that ignore the return value are unaffected.
    """
    query = """
        SELECT 
            title.startYear,
            principal.category,
            AVG(title.startYear - person.birthYear) AS avg_age
        FROM
            imdb.title_principals principal
                JOIN
            imdb.title_basics title ON title.tconst = principal.tconst
                JOIN
            imdb.name_basics person ON principal.nconst = person.nconst
        WHERE
            person.birthYear IS NOT NULL
            AND title.startYear IS NOT NULL
            AND principal.category in ('actor', 'actress')
        GROUP BY title.startYear, principal.category
        ORDER BY title.startYear, principal.category ;
    """
    stmt = text(query)

    # The context manager closes the connection even if the query raises;
    # previously an exception skipped close() and leaked the connection.
    with engine.connect() as conn:
        # Fetch while the connection is still open.
        return conn.execute(stmt).fetchall()
    

In [8]:
# Engine passed to avg_age_direct() in the timing cell below.
engine = mysql_engine()

In [9]:
%%timeit -n 3 -r 2

# Time the pure-MySQL aggregation: 2 runs of 3 loops each.
avg_age_direct(engine)

1min 15s ± 4.62 s per loop (mean ± std. dev. of 2 runs, 3 loops each)


In [20]:
def spark_mysql_avg_age():
    """Build the Spark DataFrame of average actor/actress age per year.

    Operates on the notebook-level ``name_basics``, ``title_basics`` and
    ``title_principals`` DataFrames. Nothing is collected here; the caller
    decides when to trigger execution (e.g. with ``.collect()``).

    Returns:
        A DataFrame grouped by ``startYear`` and ``category`` holding the
        average of ``startYear - birthYear``, ordered by those two columns.
    """
    # People with a known birth year.
    people = name_basics.select('nconst', 'birthYear')
    people = people.where(people['birthYear'].isNotNull())

    # Titles with startYear cast to an integer. The null filter is written
    # against the column of the pre-cast frame, exactly as before.
    raw_titles = title_basics.select('tconst', 'startYear')
    year_as_int = raw_titles['startYear'].cast(IntegerType())
    dated_titles = (raw_titles.withColumn('startYear', year_as_int)
                              .where(raw_titles['startYear'].isNotNull()))

    # Only acting roles.
    roles = title_principals.select('tconst', 'category', 'nconst')
    acting_roles = roles.where(roles['category'].isin(['actor', 'actress']))

    joined = acting_roles.join(dated_titles, on=['tconst']).join(people, on=['nconst'])
    with_age = joined.withColumn('age', joined['startYear'] - joined['birthYear'])

    averaged = with_age.groupBy(['startYear', 'category']).avg('age')
    return averaged.orderBy(['startYear', 'category'])

In [21]:
%%timeit -n 3 -r 2

# Time the Spark pipeline, including collecting the result: 2 runs of 3 loops each.
spark_mysql_avg_age().collect()

44.2 s ± 3.2 s per loop (mean ± std. dev. of 2 runs, 3 loops each)


In [15]:
# Stop the Spark session created by get_spark() once the comparison is done.
spark.stop()