In [6]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (for example the local `avg_age` helper) are re-imported before each cell
# executes and edits on disk are picked up without restarting the kernel.
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html?highlight=autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import findspark

findspark.init()

import pyspark # only run after findspark.init()
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import * 
from sqlalchemy import create_engine
from sqlalchemy import text

import avg_age

In [11]:
def mysql_engine():
    """Build the SQLAlchemy engine used for the pure-MySQL benchmarks.

    The engine targets the `imdb` database on localhost:3306 through the
    PyMySQL driver, authenticating as user `imdb`.

    Returns:
        The object returned by `create_engine` for that URL.
    """
    return create_engine('mysql+pymysql://imdb:imdb@localhost:3306/imdb')

def get_spark():
    """Return the SparkSession used by this notebook.

    The session is configured with the application name 'jupyter', the
    master at spark://0.0.0.0:7077, 4g of executor memory, and the MySQL
    Connector/J 8.0.15 package so that JDBC reads against MySQL work.

    Returns:
        The session produced by `SparkSession.builder ... getOrCreate()`.
    """
    settings = SparkConf()
    settings = settings.setAppName('jupyter')
    settings = settings.setMaster('spark://0.0.0.0:7077')
    # Ships the MySQL JDBC driver to the cluster.
    settings = settings.set('spark.jars.packages', 'mysql:mysql-connector-java:8.0.15')
    settings = settings.set('spark.executor.memory', '4g')

    return SparkSession.builder.config(conf=settings).getOrCreate()

In [20]:
# Init MySQL
engine = mysql_engine()

# Init Spark
spark = get_spark()

# JDBC connection settings shared by every table load below.
url = "jdbc:mysql://localhost:3306/imdb"
properties = {
    # Connector/J 8.x (the 8.0.15 package requested in get_spark) names its
    # driver class com.mysql.cj.jdbc.Driver; the old com.mysql.jdbc.Driver
    # name is only a deprecated alias that logs a warning when loaded.
    'driver': 'com.mysql.cj.jdbc.Driver',
    'user': "imdb",
    'password': "imdb",
}

# One Spark DataFrame per IMDb table, each read over JDBC from MySQL.
name_basics = spark.read.jdbc(url=url, table="name_basics", properties=properties)
title_basics = spark.read.jdbc(url=url, table="title_basics", properties=properties)
title_principals = spark.read.jdbc(url=url, table="title_principals", properties=properties)
title_ratings = spark.read.jdbc(url=url, table="title_ratings", properties=properties)

# Avg age

In [8]:
print("python mysql")
%timeit -n 1 -r 1 avg_age.py_mysql(engine)

print("spark mysql")
%timeit -n 1 -r 1 avg_age.py.spark_mysql(name_basics, title_basics, title_principals)

# Top 10

In [13]:
def top_10_py_mysql(engine):
    """Run the "top 10 rated titles per type and year" query in MySQL.

    The query numbers the rows of each (titleType, startYear) group by
    descending averageRating and keeps the first ten of every group. All
    rows are fetched so that a timing of this function includes
    transferring the result set, but they are then discarded.

    Args:
        engine: Object whose `connect()` returns a connection that supports
            the context-manager protocol and has an `execute(stmt)` method
            (for example a SQLAlchemy engine).

    Returns:
        None.

    Raises:
        Whatever `connect`, `execute` or `fetchall` raise; the connection
        is closed before the exception propagates.
    """

    query = """
       SELECT * FROM (
            SELECT
                ROW_NUMBER() OVER (PARTITION BY titles.titleType, titles.startYear ORDER BY averageRating DESC) AS row_num,
                originalTitle,
                titleType,
                startYear, 
                averageRating
            FROM
                title_basics titles
                    JOIN
                title_ratings ratings ON titles.tconst = ratings.tconst
            WHERE startYear IS NOT NULL
        ) AS ranking
        WHERE ranking.row_num <= 10
        ORDER BY titleType, startYear, averageRating DESC;
    """
    stmt = text(query)

    # The context manager closes the connection even if the query or the
    # fetch raises, so a failed run does not leak a connection.
    with engine.connect() as conn:
        rs = conn.execute(stmt)
        # Materialise every row; the result itself is intentionally unused.
        rs.fetchall()
    

In [17]:
%%timeit -n 1 -r 1
# One loop, one repeat: measure a single end-to-end run of the MySQL query.

top_10_py_mysql(engine)

3.26 s ± 14.1 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


In [25]:
def spark_mysql(title_basics, title_ratings):
    """Join ratings to titles that have a start year, using Spark.

    Args:
        title_basics: DataFrame providing `tconst`, `startYear` and
            `originalTitle` columns.
        title_ratings: DataFrame providing `tconst` and `averageRating`
            columns.

    Returns:
        The join of the two projections on `tconst`, with `startYear` cast
        to an integer and rows without a `startYear` removed.
    """
    title_cols = title_basics.select('tconst', 'startYear', 'originalTitle')

    # Both the cast and the null filter refer to the column of the frame
    # as it is before the cast replaces it.
    raw_year = title_cols['startYear']
    dated_titles = title_cols.withColumn('startYear', raw_year.cast(IntegerType()))
    dated_titles = dated_titles.where(raw_year.isNotNull())

    rating_cols = title_ratings.select('tconst', 'averageRating')

    return rating_cols.join(dated_titles, on=['tconst'])

In [26]:
%%timeit -n 1 -r 1
# One loop, one repeat: time building the join and printing its leading rows.

spark_mysql(title_basics, title_ratings).show()

+------+-------------+---------+--------------------+
|tconst|averageRating|startYear|       originalTitle|
+------+-------------+---------+--------------------+
|   471|          5.2|     1903|   Uncle Tom's Cabin|
|   496|          6.5|     1904|    Railroad Smashup|
|   833|          6.5|     1909|  The Country Doctor|
|  1088|          5.2|     1909|      The Way of Man|
|  2122|          6.2|     1912|The Cry of the Ch...|
|  2659|          5.4|     1913|    Bangville Police|
|  4101|          6.0|     1914|  His New Profession|
|  6357|          6.3|     1916|American Aristocracy|
|  6397|          5.7|     1916|   Balettprimadonnan|
|  7240|          6.7|     1917|Homunculus, 4. Te...|
|  7340|          6.6|     1916|               Shoes|
|  7880|          7.6|     1917|         Easy Street|
|  8086|          5.8|     1917|  Her Torpedoed Love|
|  8389|          5.8|     1917| One Touch of Nature|
|  8638|          3.3|     1917|   The Submarine Eye|
|  9376|          7.3|     1

In [15]:
# Stop the Spark session obtained from get_spark() once the benchmarks are done.
spark.stop()