# Check Spark UI http://localhost:9090/

# Preprocessing

In [7]:
! ls -al

total 2860
drwxr-xr-x  5 root root    4096 Mar 19 12:42  .
drwxr-xr-x 14 root root    4096 Mar 19 11:40  ..
drwxr-xr-x  2 root root    4096 Mar 19 12:26  .ipynb_checkpoints
-rw-r--r--  1 root root  222666 Mar 17 13:32 'Pyspark Data Pipeline.ipynb'
drwxr-xr-x  3 root root    4096 Mar 15 08:23  PysparkQuakes
-rw-r--r--  1 root root    4005 Mar 15 08:23 'Start Services in Cluster.md'
drwxr-xr-x  2 root root    4096 Mar 15 08:23  codes
-rw-r--r--  1 root root     530 Mar 15 08:23  commands.txt
-rw-r--r--  1 root root   62680 Mar 17 13:30  dashboard.html
-rw-r--r--  1 root root 2397103 Mar 15 08:23  database.csv
-rw-r--r--  1 root root    1975 Mar 19 12:36  extracting.py
-rw-r--r--  1 root root     616 Mar 19 12:09  predict.py
-rw-r--r--  1 root root    1605 Mar 19 12:08  preprocessing-4-ml.py
-rw-r--r--  1 root root    2407 Mar 19 12:19  preprocessing.py
-rw-r--r--  1 root root   72760 Mar 15 08:23  query.csv
-rw-r--r--  1 root root   32991 Mar 19 12:14 'step1_DAG_Pyspark D

In [5]:
%%writefile ../dags/elt_spark.py

"""Airflow DAG that runs the earthquake Spark ETL every day at 01:00."""

import airflow
from datetime import datetime, timedelta
from airflow.operators.python_operator import PythonOperator
from pyspark.sql import SparkSession, functions
# `from ./preprocessing import *` is not valid Python, and the callable that
# preprocessing.py defines is `process_etl_spark`, so import it by name.
# NOTE(review): preprocessing.py must be importable by the Airflow
# scheduler/workers (e.g. copied next to this file or put on PYTHONPATH) --
# confirm the deployment layout.
from preprocessing import process_etl_spark

# Defaults applied to every task of the DAG.
default_args = {
    'owner': 'jih',
    'start_date': datetime(2020, 11, 18),
    'retries': 10,
    'retry_delay': timedelta(hours=1)
}
with airflow.DAG('dag_etl_spark',
                  default_args=default_args,
                  schedule_interval='0 1 * * *') as dag:
    # Single task: run the whole ETL inside the Airflow worker process.
    task_elt_documento_pagar = PythonOperator(
        task_id='elt_spark',
        python_callable=process_etl_spark
    )

Overwriting ../dags/elt_spark.py


In [11]:
%%writefile ../dags/spark-mongo.py

from airflow import DAG
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator
from airflow.operators.python_operator import PythonOperator
from datetime import datetime
from pyspark.sql import SparkSession, functions
from pyspark.sql import SparkSession  
from pyspark.sql.types import *
from pyspark.sql.functions import *

import pyspark
    
default_args = {
    'start_date': datetime(2022, 1, 1)
}


def etl_spark():
    """Run the earthquake ETL: HDFS CSV -> cleaned DataFrames -> MongoDB.

    Reads ``database.csv`` from HDFS, drops the unused columns, derives a
    ``Year`` column, casts the numeric columns to double and builds a per-year
    summary (``Counts``, ``Avg_Magnitude``, ``Max_Magnitude``).  Both frames
    are written with mode ``overwrite`` to the ``Quake.quakes`` and
    ``Quake.quake_freq`` MongoDB collections.

    The Spark session is stopped even when one of the steps raises.
    """
    ### Configure spark session
    spark = (
        SparkSession.builder
        .master('spark://master:7077')
        .appName('quake_etl')
        .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1')
        .getOrCreate()
    )

    try:
        # Load the dataset (no schema inference: every column is a string)
        df_load = spark.read.csv('hdfs://master:9000/database.csv', header=True)

        # Drop fields we don't need from df_load
        lst_dropped_columns = ['Depth Error', 'Time', 'Depth Seismic Stations','Magnitude Error','Magnitude Seismic Stations','Azimuthal Gap', 'Horizontal Distance','Horizontal Error',
            'Root Mean Square','Source','Location Source','Magnitude Source','Status']
        df_load = df_load.drop(*lst_dropped_columns)

        # Create a year field and add it to the dataframe
        # NOTE(review): assumes 'Date' is formatted dd/MM/yyyy -- confirm against database.csv
        df_load = df_load.withColumn('Year', year(to_timestamp('Date', 'dd/MM/yyyy')))

        # Build the quakes frequency dataframe using the year field and counts for each year
        df_quake_freq = df_load.groupBy('Year').count().withColumnRenamed('count', 'Counts')

        # Cast some fields from string into numeric types
        for numeric_column in ('Latitude', 'Longitude', 'Depth', 'Magnitude'):
            df_load = df_load.withColumn(numeric_column, df_load[numeric_column].cast(DoubleType()))

        # Create avg magnitude and max magnitude fields and add to df_quake_freq
        df_max = df_load.groupBy('Year').max('Magnitude').withColumnRenamed('max(Magnitude)', 'Max_Magnitude')
        df_avg = df_load.groupBy('Year').avg('Magnitude').withColumnRenamed('avg(Magnitude)', 'Avg_Magnitude')

        # Join df_max, and df_avg to df_quake_freq
        df_quake_freq = df_quake_freq.join(df_avg, ['Year']).join(df_max, ['Year'])

        # Remove nulls. dropna() returns a new DataFrame, so the result has to
        # be assigned back or the call has no effect.
        df_load = df_load.dropna()
        df_quake_freq = df_quake_freq.dropna()

        # Build the tables/collections in mongodb
        # NOTE(review): the MongoDB credentials are embedded in the URIs below;
        # consider moving them to an Airflow connection or environment variable.
        # Write df_load to mongodb
        df_load.write.format('mongo')\
            .mode('overwrite')\
            .option('spark.mongodb.output.uri', 'mongodb://root:go2team@mongo:27017/Quake.quakes?authSource=admin').save()

        # Write df_quake_freq to mongodb
        df_quake_freq.write.format('mongo')\
            .mode('overwrite')\
            .option('spark.mongodb.output.uri', 'mongodb://root:go2team@mongo:27017/Quake.quake_freq?authSource=admin').save()
    finally:
        spark.stop()


# The DAG is declared after etl_spark so that the name exists when the
# PythonOperator is constructed at DAG-parse time.
with DAG(dag_id='pre_processing',
         schedule_interval="@daily",
         default_args=default_args,
         tags=['spark'],
         catchup=False) as dag:
    # preprocess
    preprocess = PythonOperator(
        task_id="spark_etl",
        python_callable=etl_spark
    )


Overwriting ../dags/spark-mongo.py


In [3]:
%%writefile ../dags/spark-mongo.py

from airflow import DAG
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator

from datetime import datetime

# Arguments handed to the DAG as its task defaults.
default_args = dict(start_date=datetime(2022, 1, 1))

# Daily DAG tagged 'spark'; catchup is switched off.
with DAG(
    dag_id='proprocessing',
    schedule_interval="@daily",
    default_args=default_args,
    tags=['spark'],
    catchup=False,
) as dag:
    # Preprocessing step: submit preprocessing.py through the
    # 'spark_cluster' connection.
    preprocess = SparkSubmitOperator(
        task_id="preprocess",
        conn_id="spark_cluster",
        application="../preprocessing.py",
    )
    
#     # extracting
#     extract = SparkSubmitOperator(
#         application="../extracting.py",
#         task_id="extract",
#         conn_id="spark_cluster"
#     )    
#     # training
#     train = SparkSubmitOperator(
#         application="../training.py",
#         task_id="train",
#         conn_id="spark_cluster"
#     )
    
#     preprocess >> extract >> train


def process_etl_spark():
    """Run the earthquake ETL: HDFS CSV -> cleaned DataFrames -> MongoDB.

    Reads ``database.csv`` from HDFS, drops the unused columns, derives a
    ``Year`` column, casts the numeric columns to double and builds a per-year
    summary (``Counts``, ``Avg_Magnitude``, ``Max_Magnitude``).  Both frames
    are written with mode ``overwrite`` to the ``Quake.quakes`` and
    ``Quake.quake_freq`` MongoDB collections.

    The Spark session is stopped even when one of the steps raises.
    """
    # Explicit imports: `from module import *` is only allowed at module
    # level, and a hyphenated function name is not a valid identifier.
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import to_timestamp, year
    from pyspark.sql.types import DoubleType

    ### Configure spark session
    spark = (
        SparkSession.builder
        .master('spark://master:7077')
        .appName('quake_etl')
        .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1')
        .getOrCreate()
    )

    try:
        # Load the dataset (no schema inference: every column is a string)
        df_load = spark.read.csv('hdfs://master:9000/database.csv', header=True)

        # Drop fields we don't need from df_load
        lst_dropped_columns = ['Depth Error', 'Time', 'Depth Seismic Stations','Magnitude Error','Magnitude Seismic Stations','Azimuthal Gap', 'Horizontal Distance','Horizontal Error',
            'Root Mean Square','Source','Location Source','Magnitude Source','Status']
        df_load = df_load.drop(*lst_dropped_columns)

        # Create a year field and add it to the dataframe
        # NOTE(review): assumes 'Date' is formatted dd/MM/yyyy -- confirm against database.csv
        df_load = df_load.withColumn('Year', year(to_timestamp('Date', 'dd/MM/yyyy')))

        # Build the quakes frequency dataframe using the year field and counts for each year
        df_quake_freq = df_load.groupBy('Year').count().withColumnRenamed('count', 'Counts')

        # Cast some fields from string into numeric types
        for numeric_column in ('Latitude', 'Longitude', 'Depth', 'Magnitude'):
            df_load = df_load.withColumn(numeric_column, df_load[numeric_column].cast(DoubleType()))

        # Create avg magnitude and max magnitude fields and add to df_quake_freq
        df_max = df_load.groupBy('Year').max('Magnitude').withColumnRenamed('max(Magnitude)', 'Max_Magnitude')
        df_avg = df_load.groupBy('Year').avg('Magnitude').withColumnRenamed('avg(Magnitude)', 'Avg_Magnitude')

        # Join df_max, and df_avg to df_quake_freq
        df_quake_freq = df_quake_freq.join(df_avg, ['Year']).join(df_max, ['Year'])

        # Remove nulls. dropna() returns a new DataFrame, so the result has to
        # be assigned back or the call has no effect.
        df_load = df_load.dropna()
        df_quake_freq = df_quake_freq.dropna()

        # Build the tables/collections in mongodb
        # NOTE(review): the MongoDB credentials are embedded in the URIs below;
        # consider moving them to configuration.
        # Write df_load to mongodb
        df_load.write.format('mongo')\
            .mode('overwrite')\
            .option('spark.mongodb.output.uri', 'mongodb://root:go2team@mongo:27017/Quake.quakes?authSource=admin').save()

        # Write df_quake_freq to mongodb
        df_quake_freq.write.format('mongo')\
            .mode('overwrite')\
            .option('spark.mongodb.output.uri', 'mongodb://root:go2team@mongo:27017/Quake.quake_freq?authSource=admin').save()
    finally:
        spark.stop()


Overwriting ../dags/spark-mongo.py


In [8]:
! python preprocessing.py

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/spark-3.0.3-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-02a6fcc2-6744-4cb2-ae2b-f245e28018cb;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
:: resolution report :: resolve 237ms :: artifacts dl 10ms
	:: modules in use:
	org.mongodb#bson;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-core;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-sync;4.0.5 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 from central in [default]
	----------

In [8]:
%%writefile preprocessing.py
## Load Packages

def process_etl_spark():
    """Run the earthquake ETL: HDFS CSV -> cleaned DataFrames -> MongoDB.

    Reads ``database.csv`` from HDFS, drops the unused columns, derives a
    ``Year`` column, casts the numeric columns to double and builds a per-year
    summary (``Counts``, ``Avg_Magnitude``, ``Max_Magnitude``).  Both frames
    are written with mode ``overwrite`` to the ``Quake.quakes`` and
    ``Quake.quake_freq`` MongoDB collections.

    The Spark session is stopped even when one of the steps raises.
    """
    # Explicit imports: `from module import *` is only allowed at module
    # level and raises SyntaxError inside a function.
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import to_timestamp, year
    from pyspark.sql.types import DoubleType

    ### Configure spark session
    spark = (
        SparkSession.builder
        .master('spark://master:7077')
        .appName('quake_etl')
        .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1')
        .getOrCreate()
    )

    try:
        # Load the dataset (no schema inference: every column is a string)
        df_load = spark.read.csv('hdfs://master:9000/database.csv', header=True)

        # Drop fields we don't need from df_load
        lst_dropped_columns = ['Depth Error', 'Time', 'Depth Seismic Stations','Magnitude Error','Magnitude Seismic Stations','Azimuthal Gap', 'Horizontal Distance','Horizontal Error',
            'Root Mean Square','Source','Location Source','Magnitude Source','Status']
        df_load = df_load.drop(*lst_dropped_columns)

        # Create a year field and add it to the dataframe
        # NOTE(review): assumes 'Date' is formatted dd/MM/yyyy -- confirm against database.csv
        df_load = df_load.withColumn('Year', year(to_timestamp('Date', 'dd/MM/yyyy')))

        # Build the quakes frequency dataframe using the year field and counts for each year
        df_quake_freq = df_load.groupBy('Year').count().withColumnRenamed('count', 'Counts')

        # Cast some fields from string into numeric types
        for numeric_column in ('Latitude', 'Longitude', 'Depth', 'Magnitude'):
            df_load = df_load.withColumn(numeric_column, df_load[numeric_column].cast(DoubleType()))

        # Create avg magnitude and max magnitude fields and add to df_quake_freq
        df_max = df_load.groupBy('Year').max('Magnitude').withColumnRenamed('max(Magnitude)', 'Max_Magnitude')
        df_avg = df_load.groupBy('Year').avg('Magnitude').withColumnRenamed('avg(Magnitude)', 'Avg_Magnitude')

        # Join df_max, and df_avg to df_quake_freq
        df_quake_freq = df_quake_freq.join(df_avg, ['Year']).join(df_max, ['Year'])

        # Remove nulls. dropna() returns a new DataFrame, so the result has to
        # be assigned back or the call has no effect.
        df_load = df_load.dropna()
        df_quake_freq = df_quake_freq.dropna()

        # Build the tables/collections in mongodb
        # NOTE(review): the MongoDB credentials are embedded in the URIs below;
        # consider moving them to configuration.
        # Write df_load to mongodb
        df_load.write.format('mongo')\
            .mode('overwrite')\
            .option('spark.mongodb.output.uri', 'mongodb://root:go2team@mongo:27017/Quake.quakes?authSource=admin').save()

        # Write df_quake_freq to mongodb
        df_quake_freq.write.format('mongo')\
            .mode('overwrite')\
            .option('spark.mongodb.output.uri', 'mongodb://root:go2team@mongo:27017/Quake.quake_freq?authSource=admin').save()
    finally:
        spark.stop()


# Run the ETL when the file is executed directly (`python preprocessing.py`
# or as a spark-submit application); importing the module stays side-effect free.
if __name__ == '__main__':
    process_etl_spark()


Overwriting preprocessing.py


# Machine Learning with Spark

In [11]:
! python extracting.py

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/spark-3.0.3-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-78e041f5-fd80-48ba-8e43-4f83367a9e8f;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
:: resolution report :: resolve 254ms :: artifacts dl 10ms
	:: modules in use:
	org.mongodb#bson;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-core;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-sync;4.0.5 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 from central in [default]
	----------

In [10]:
%%writefile extracting.py

import pyspark
from pyspark.sql import SparkSession  
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Spark session on the standalone master, with the MongoDB connector package.
spark = (
    SparkSession.builder
    .master('spark://master:7077')
    .appName('quake_etl')
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1')
    .getOrCreate()
)

# Test data comes from the CSV on HDFS.
df_test = spark.read.csv('hdfs://master:9000/query.csv', header=True)

# Training data comes from the Quake.quakes collection in MongoDB.
df_train = (
    spark.read.format('mongo')
    .option('spark.mongodb.input.uri', 'mongodb://root:go2team@mongo:27017/Quake.quakes?authSource=admin')
    .load()
)

# Keep five source columns, rename them to the training names and cast the
# four numeric ones from string to double, all in a single projection.
df_test_clean = df_test.select(
    col('time').alias('Date'),
    col('latitude').cast(DoubleType()).alias('Latitude'),
    col('longitude').cast(DoubleType()).alias('Longitude'),
    col('mag').cast(DoubleType()).alias('Magnitude'),
    col('depth').cast(DoubleType()).alias('Depth'),
)

# Both modelling frames share the same four columns; rows containing a null
# in any of them are discarded.
model_columns = ['Latitude', 'Longitude', 'Magnitude', 'Depth']
df_testing = df_test_clean.select(*model_columns).dropna()
df_training = df_train.select(*model_columns).dropna()

spark.stop()

Overwriting extracting.py


# Training

In [18]:
%%writefile train.py
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

# NOTE(review): df_training and df_testing are not defined in this file; they
# are presumably the frames built in extracting.py -- confirm how they are
# brought into scope before running this script on its own.

# Stage 1: pack the three predictor columns into one 'features' vector column.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['Latitude', 'Longitude', 'Depth'],
)

# Stage 2: random-forest regressor that predicts 'Magnitude' from 'features'.
model_reg = RandomForestRegressor(
    labelCol='Magnitude',
    featuresCol='features',
)

# Assemble both stages, fit on the training frame, then score the test frame.
pipeline = Pipeline(stages=[assembler, model_reg])
model = pipeline.fit(df_training)
pred_results = model.transform(df_testing)

# Score the predictions with RMSE.
# rmse should be less than 0.5 for the model to be useful
evaluator = RegressionEvaluator(
    metricName='rmse',
    predictionCol='prediction',
    labelCol='Magnitude',
)
rmse = evaluator.evaluate(pred_results)
print('Root Mean Squared Error (RMSE) on test data = %g' % rmse)

Writing train.py


# Create the prediction dataset

In [19]:
%%writefile predict.py

# `lit` builds the constant columns below and has to be imported explicitly.
from pyspark.sql.functions import lit

# NOTE(review): pred_results and rmse are not defined in this file; they are
# presumably the values produced by train.py -- confirm how they are brought
# into scope before running this script on its own.

# Create the prediction dataset
df_pred_results = pred_results['Latitude', 'Longitude', 'prediction']

# Rename the prediction field
df_pred_results = df_pred_results.withColumnRenamed('prediction', 'Pred_Magnitude')

# Add more columns to our prediction dataset: a constant year and the model's
# RMSE repeated on every row.
df_pred_results = df_pred_results.withColumn('Year', lit(2017))\
    .withColumn('RMSE', lit(rmse))

# Load the prediction dataset into mongodb
# Write df_pred_results (replaces the Quake.pred_results collection)
df_pred_results.write.format('mongo')\
    .mode('overwrite')\
    .option('spark.mongodb.output.uri', 'mongodb://root:go2team@mongo:27017/Quake.pred_results?authSource=admin').save()

Writing predict.py


# Data visualization

In [47]:
%pip install pandas pymongo bokeh==2.3.3

Collecting bokeh
  Downloading bokeh-2.3.3.tar.gz (10.7 MB)
     |################################| 10.7 MB 7.8 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: bokeh
  Building wheel for bokeh (setup.py) ... [?25ldone
[?25h  Created wheel for bokeh: filename=bokeh-2.3.3-py3-none-any.whl size=11342785 sha256=e8a9257d8bb2c3c397683fed1dffe6e356d22a20253a9cf249c59fb87dfd893d
  Stored in directory: /root/.cache/pip/wheels/8b/59/97/257265b741bab184e0cc8f5676309cb1fe6fbda22011bbb3ff
Successfully built bokeh
Installing collected packages: bokeh
Successfully installed bokeh-2.3.3


In [20]:
%%writefile visualize.py

import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models.tools import HoverTool
import math
from math import pi
from bokeh.palettes import Category20c
from bokeh.transform import cumsum
from bokeh.tile_providers import CARTODBPOSITRON
from bokeh.tile_providers import get_provider, Vendors
from bokeh.themes import built_in_themes
from bokeh.io import curdoc
from pymongo import MongoClient

# Create a custom read function to read data from mongodb into a dataframe
#def read_mongo(host='127.0.0.1', port=27017, username=None, password=None, db='Quake', collection='pred_results'):
def read_mongo(host='mongo', port=27017, username='root', password='go2team', db='Quake', collection='pred_results'):
    """Read every document of a MongoDB collection into a pandas DataFrame.

    Args:
        host: MongoDB host name.
        port: MongoDB port.
        username: User name; pass ``None`` (together with ``password``) to
            connect without credentials.
        password: Password for ``username``.
        db: Database name.
        collection: Collection name.

    Returns:
        A DataFrame with one row per document and the ``_id`` field removed.
        An empty collection yields an empty DataFrame.
    """
    from urllib.parse import quote_plus

    # Reserved characters in credentials (':', '/', '@', ...) must be
    # percent-escaped inside a MongoDB URI.
    if username is not None and password is not None:
        credentials = '{}:{}@'.format(quote_plus(str(username)), quote_plus(str(password)))
    else:
        credentials = ''

    mongo_uri = 'mongodb://{}{}:{}/{}.{}?authSource=admin'.format(credentials, host, port, db, collection)

    # Connect to mongodb; the context manager closes the client once the
    # records have been materialised.
    with MongoClient(mongo_uri) as conn:
        # Select all records from the collection
        records = list(conn[db][collection].find())

    # Create the dataframe
    df = pd.DataFrame(records)

    # Delete the _id field (absent when the collection is empty)
    if '_id' in df.columns:
        del df['_id']

    return df


# Load the datasets from mongodb
df_quakes = read_mongo(collection='quakes')
df_quake_freq = read_mongo(collection='quake_freq')
df_quake_pred = read_mongo(collection='pred_results')
# Take an explicit copy: the 2016 subset gets new columns assigned later, and
# assigning to a filtered view of df_quakes triggers SettingWithCopyWarning.
df_quakes_2016 = df_quakes[df_quakes['Year'] == 2016].copy()


# Create custom style function to style our plots
def style(p):
    """Apply the shared look (title, axes, legend) to a plot and return it.

    The plot is modified in place; the same object is returned so the call
    can be used in an assignment.
    """
    # Title: centred, large serif text
    title = p.title
    title.align = 'center'
    title.text_font_size = '20pt'
    title.text_font = 'serif'

    # Axis titles: bold 14pt on both axes
    p.xaxis.axis_label_text_font_size = '14pt'
    p.xaxis.axis_label_text_font_style = 'bold'
    p.yaxis.axis_label_text_font_size = '14pt'
    p.yaxis.axis_label_text_font_style = 'bold'

    # Tick labels: 12pt on both axes
    p.xaxis.major_label_text_font_size = '12pt'
    p.yaxis.major_label_text_font_size = '12pt'

    # Legend sits in the top left corner
    p.legend.location = 'top_left'

    return p
    

# Create the Geo Map plot
def _to_web_mercator(lons, lats):
    """Project longitudes/latitudes in degrees to Web Mercator x/y in metres."""
    r_major = 6378137.000
    xs = [r_major * math.radians(lon) for lon in lons]
    # y is computed directly from the latitude. Deriving the scale as x / lon
    # would divide by zero for a point on the prime meridian.
    ys = [r_major * math.log(math.tan(math.pi / 4.0 + math.radians(lat) / 2.0))
          for lat in lats]
    return xs, ys


def plotMap():
    """Build the map of 2016 earthquakes and predicted 2017 earthquakes.

    Reads the module-level ``df_quakes_2016`` and ``df_quake_pred`` frames
    (they are only read, never modified), projects their coordinates to Web
    Mercator and draws one circle per event, sized by four times the
    (predicted) magnitude, on top of the CARTODBPOSITRON tiles.

    Returns:
        The Bokeh figure; it is not shown here so that the caller can place
        it in a layout.
    """
    # Convert Lat and Long values into merc_projection format
    quake_x, quake_y = _to_web_mercator(
        df_quakes_2016['Longitude'].values.tolist(),
        df_quakes_2016['Latitude'].values.tolist(),
    )
    pred_x, pred_y = _to_web_mercator(
        df_quake_pred['Longitude'].values.tolist(),
        df_quake_pred['Latitude'].values.tolist(),
    )

    # Create column datasources ('lat' holds projected y, 'lon' projected x;
    # the *_s columns are the circle sizes, i.e. magnitude * 4)
    cds = ColumnDataSource(
        data=dict(
            lat=quake_y,
            lon=quake_x,
            mag=df_quakes_2016['Magnitude'].tolist(),
            year=df_quakes_2016['Year'].tolist(),
            mag_s=(df_quakes_2016['Magnitude'] * 4).tolist()
        )
    )

    pred_cds = ColumnDataSource(
        data=dict(
            pred_lat=pred_y,
            pred_long=pred_x,
            pred_mag=df_quake_pred['Pred_Magnitude'].tolist(),
            year=df_quake_pred['Year'].tolist(),
            pred_mag_s=(df_quake_pred['Pred_Magnitude'] * 4).tolist()
        )
    )

    # Tooltips
    TOOLTIPS = [
        ("Year", " @year"),
        ("Magnitude", " @mag"),
        ("Predicted Magnitude", " @pred_mag")
    ]

    # Create figure
    p = figure(title='Earthquake Map',
              plot_width=2300, plot_height=450,
              x_range=(-2000000, 6000000),
              y_range=(-1000000, 7000000),
              tooltips=TOOLTIPS)

    # `legend_label` replaces the deprecated `legend` keyword.
    p.circle(x='lon', y='lat', size='mag_s', fill_color='#cc0000', fill_alpha=0.7,
            source=cds, legend_label='Quakes 2016')

    # Add circles for our predicted earthquakes (alpha must lie in [0, 1])
    p.circle(x='pred_long', y='pred_lat', size='pred_mag_s', fill_color='#ccff33', fill_alpha=0.7,
            source=pred_cds, legend_label='Predicted Quakes 2017')

    #p.add_tile(CARTODBPOSITRON)
    p.add_tile(get_provider('CARTODBPOSITRON'))

    # Style the map plot
    # Title
    p.title.align='center'
    p.title.text_font_size='20pt'
    p.title.text_font='serif'

    # Legend
    p.legend.location='bottom_right'
    p.legend.background_fill_color='black'
    p.legend.background_fill_alpha=0.8
    p.legend.click_policy='hide'
    p.legend.label_text_color='white'
    p.xaxis.visible=False
    p.yaxis.visible=False
    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color=None

    return p
    
#plotMap()  

# Create the Bar Chart
def plotBar():
    """Build the bar chart of the number of earthquakes per year.

    Reads the module-level ``df_quake_freq`` frame (columns ``Year`` and
    ``Counts``).

    Returns:
        The styled Bokeh figure; it is not shown here so that the caller can
        place it in a layout.
    """
    # Load the datasource
    cds = ColumnDataSource(data=dict(
        yrs = df_quake_freq['Year'].values.tolist(),
        numQuakes = df_quake_freq['Counts'].values.tolist()
    ))

    # Tooltip
    TOOLTIPS = [
        ('Year', ' @yrs'),
        ('Number of earthquakes', ' @numQuakes')
    ]

    # Create a figure; the y range leaves 100 units of headroom above the
    # tallest bar.
    barChart = figure(title='Frequency of Earthquakes by Year',
                     plot_height=400,
                     plot_width=1150,
                     x_axis_label='Years',
                     y_axis_label='Number of Occurances',
                     x_minor_ticks=2,
                     y_range=(0, df_quake_freq['Counts'].max() + 100),
                     toolbar_location=None,
                     tooltips=TOOLTIPS)

    # Create a vertical bar; `legend_label` replaces the deprecated `legend`
    # keyword.
    barChart.vbar(x='yrs', bottom=0, top='numQuakes',
                 color='#cc0000', width=0.75,
                 legend_label='Year', source=cds)

    # Style the bar chart
    barChart = style(barChart)

    return barChart
    
    
# plotBar()
 


# Order the yearly aggregates chronologically and renumber the rows.
# reset_index() returns a new frame, so the result is assigned back; the
# previous bare call discarded it.
df_quake_freq = df_quake_freq.sort_values(by=['Year'], axis=0, ascending=True).reset_index(drop=True)

# Create a magnitude plot
def plotMagnitude():
    """Build the line chart of maximum and average magnitude per year.

    Reads the module-level ``df_quake_freq`` frame (columns ``Year``,
    ``Avg_Magnitude`` and ``Max_Magnitude``); the average is rounded to one
    decimal.

    Returns:
        The styled Bokeh figure.  Like ``plotMap`` and ``plotBar`` it is
        returned without being shown, so building the dashboard grid does not
        open an extra page for this plot.
    """
    # Load the datasource
    cds = ColumnDataSource(data=dict(
        yrs = df_quake_freq['Year'].values.tolist(),
        avg_mag = df_quake_freq['Avg_Magnitude'].round(1).values.tolist(),
        max_mag = df_quake_freq['Max_Magnitude'].values.tolist()
    ))

    # Tooltip
    TOOLTIPS = [
        ('Year', ' @yrs'),
        ('Average Magnitude', ' @avg_mag'),
        ('Maximum Magnitude', ' @max_mag')
    ]

    # Create the figure; the y axis starts at 5 and ends one unit above the
    # largest recorded magnitude.
    mp = figure(title='Maximum and Average Magnitude by Year',
               plot_width=1150, plot_height=400,
               x_axis_label='Years',
               y_axis_label='Magnitude',
               x_minor_ticks=2,
               y_range=(5, df_quake_freq['Max_Magnitude'].max() + 1),
               toolbar_location=None,
               tooltips=TOOLTIPS)

    # Max Magnitude (`legend_label` replaces the deprecated `legend` keyword)
    mp.line(x='yrs', y='max_mag', color='#cc0000', line_width=2, legend_label='Max Magnitude', source=cds)
    mp.circle(x='yrs', y='max_mag', color='#cc0000', size=8, fill_color='#cc0000', source=cds)

    # Average Magnitude
    mp.line(x='yrs', y='avg_mag', color='yellow', line_width=2, legend_label='Avg Magnitude', source=cds)
    mp.circle(x='yrs', y='avg_mag', color='yellow', size=8, fill_color='yellow', source=cds)

    mp = style(mp)

    return mp

plotMagnitude()    

# Display the visuals directly in the browser
output_file('dashboard.html')
# Change to a dark theme
curdoc().theme = 'dark_minimal'

import bokeh
print(bokeh.__version__)

# Build the grid plot
from bokeh.layouts import gridplot, grid

# Make the grid: the map on the first row, the two charts side by side on the
# second. The result gets its own name so the imported `grid` function is not
# overwritten and this statement can be run again.
dashboard = grid([plotMap(), [plotBar(), plotMagnitude()]])

# Show the grid
show(dashboard)

Writing visualize.py
