In [1]:
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html?highlight=autoreload
%load_ext autoreload
%autoreload 2

In [2]:
import findspark

findspark.init()

In [3]:
import pyspark # only run after findspark.init()

from sqlalchemy import text

import myutils

from pyspark import SparkConf
from pyspark.sql import SparkSession
from sqlalchemy import create_engine

def mysql_engine(url='mysql+pymysql://imdb:imdb@localhost:3306/imdb'):
    """Create a SQLAlchemy engine for the IMDb MySQL database.

    Args:
        url: SQLAlchemy database URL handed to ``create_engine``. The default
            points at the local development instance; pass another URL
            (ideally assembled from environment variables or a secrets
            store) to avoid relying on the credentials embedded in the
            default.

    Returns:
        Whatever ``create_engine(url)`` returns.
    """
    return create_engine(url)

def get_spark():
    """Return a SparkSession for the standalone cluster used by this notebook.

    The configuration sets the application name to 'jupyter', the master to
    spark://0.0.0.0:7077, and 'spark.jars.packages' to the MySQL connector
    coordinate. The session is obtained through ``getOrCreate``.
    """
    settings = SparkConf()
    settings.setAppName('jupyter')
    settings.setMaster('spark://0.0.0.0:7077')
    settings.set('spark.jars.packages', 'mysql:mysql-connector-java:8.0.15')

    return SparkSession.builder.config(conf=settings).getOrCreate()

spark = get_spark()

In [6]:
def avg_age_direct(engine):
    """Compute the average age of actors and actresses per year inside MySQL.

    Runs a single aggregate query that joins title_principals, title_basics
    and name_basics, keeps only the 'actor' and 'actress' categories with a
    known birth year and start year, and averages
    ``startYear - birthYear`` per (startYear, category).

    Args:
        engine: object whose ``connect()`` returns a connection that can be
            used as a context manager and exposes ``execute()`` (a
            SQLAlchemy engine in this notebook).

    Returns:
        The list returned by ``fetchall()``: one row per
        (startYear, category) with the columns startYear, category and
        avg_age, ordered by startYear then category.
    """
    query = """
        SELECT
            title.startYear,
            principal.category,
            AVG(title.startYear - person.birthYear) AS avg_age
        FROM
            imdb.title_principals principal
                JOIN
            imdb.title_basics title ON title.tconst = principal.tconst
                JOIN
            imdb.name_basics person ON principal.nconst = person.nconst
        WHERE
            person.birthYear IS NOT NULL
            AND title.startYear IS NOT NULL
            AND principal.category in ('actor', 'actress')
        GROUP BY title.startYear, principal.category
        ORDER BY title.startYear, principal.category ;
    """

    # The context manager releases the connection even when execute() or
    # fetchall() raises; a bare close() at the end would be skipped on error.
    with engine.connect() as conn:
        return conn.execute(text(query)).fetchall()
    

In [7]:
%%timeit -n 1 -r 1

engine = mysql_engine()

avg_age_direct(engine)

1min 18s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [10]:
# JDBC connection settings shared by the three table reads below.
url = "jdbc:mysql://localhost:3306/imdb"
properties = {
    # Connector/J 8.x (get_spark requests mysql-connector-java 8.0.15) names
    # its driver class com.mysql.cj.jdbc.Driver; the former
    # com.mysql.jdbc.Driver name is deprecated in that release line.
    'driver': 'com.mysql.cj.jdbc.Driver',
    'user': "imdb",
    'password': "imdb",
}

# One DataFrame per MySQL table used by the average-age computation.
name_basics = spark.read.jdbc(url=url, table="name_basics", properties=properties)
title_basics = spark.read.jdbc(url=url, table="title_basics", properties=properties)
title_principals = spark.read.jdbc(url=url, table="title_principals", properties=properties)

In [14]:
title_basics.dtypes

[('tconst', 'int'),
 ('titleType', 'string'),
 ('primaryTitle', 'string'),
 ('originalTitle', 'string'),
 ('isAdult', 'string'),
 ('startYear', 'string'),
 ('endYear', 'string'),
 ('runtimeMinutes', 'string'),
 ('genres', 'string')]

In [15]:
from pyspark.sql.functions import *
from pyspark.sql.types import * 

def spark_mysql_avg_age(name_basics, title_basics, title_principals):
    """Compute the average actor/actress age per year with Spark DataFrames.

    Mirrors the aggregation that avg_age_direct runs in MySQL: join
    principals to titles on tconst and to people on nconst, keep the
    'actor' and 'actress' categories with a known birth year and start
    year, then average ``startYear - birthYear`` per (startYear, category).

    Args:
        name_basics: DataFrame with at least the columns nconst, birthYear.
        title_basics: DataFrame with at least the columns tconst, startYear.
        title_principals: DataFrame with at least the columns tconst,
            nconst, category.

    Returns:
        A DataFrame with the columns startYear, category and avg_age,
        ordered by startYear then category.
    """
    # startYear is reported as a string by title_basics.dtypes, so cast it
    # before doing arithmetic.
    title = (
        title_basics
        .select('tconst', col('startYear').cast(IntegerType()).alias('startYear'))
        .where(col('startYear').isNotNull())
    )

    # NOTE(review): assumes birthYear is stored as text like startYear; the
    # cast is harmless if it is already an integer column. TODO confirm.
    name = (
        name_basics
        .select('nconst', col('birthYear').cast(IntegerType()).alias('birthYear'))
        .where(col('birthYear').isNotNull())
    )

    principal = (
        title_principals
        .select('tconst', 'nconst', 'category')
        .where(col('category').isin('actor', 'actress'))
    )

    return (
        principal
        .join(title, on='tconst')
        .join(name, on='nconst')
        .groupBy('startYear', 'category')
        .agg(avg(col('startYear') - col('birthYear')).alias('avg_age'))
        .orderBy('startYear', 'category')
    )

In [9]:
# NOTE(review): no notebook-level `title` is defined in the visible cells (the
# only `title` is a local inside spark_mysql_avg_age) -- confirm this cell
# still runs after Restart Kernel -> Run All.
title.show()

<SparkContext master=spark://0.0.0.0:7077 appName=hello>


<SparkContext master=spark://0.0.0.0:7077 appName=hello>
