In [6]:
# Check whether /database.csv is present in HDFS (run inside the `master` container).
! docker exec master /opt/hadoop/bin/hdfs dfs -ls /database.csv

ls: `/database.csv': No such file or directory


In [8]:
# Print the PATH as seen *inside* the airflow-webserver container.
# `echo $PATH` is expanded by the calling shell before `docker exec` starts,
# so it reports the caller's PATH; `printenv` is evaluated in the container.
! docker exec airflow-webserver printenv PATH

/opt/conda/envs/pipeline/bin:/opt/conda/condabin:/usr/lib/jvm/java-8-openjdk-amd64/bin:/opt/spark/bin:/opt/spark/sbin:/root/.vscode-server/bin/c722ca6c7eed3d7987c0d5c3df5c45f6b15e77d1/bin/remote-cli:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin


In [2]:
# List /database.csv in HDFS to confirm the dataset file exists and see its size.
! docker exec master /opt/hadoop/bin/hdfs dfs -ls /database.csv

-rw-r--r--   3 root supergroup    2397103 2022-03-20 14:36 /database.csv


In [8]:
# Stage the CSV in the master container, publish it at the HDFS root, then list
# the root to confirm. `-put -f` overwrites an existing /database.csv so this
# cell can be re-run without a "File exists" failure.
! docker cp ../../4_MongoDB/database.csv master:/root/
! docker exec master /opt/hadoop/bin/hdfs dfs -put -f /root/database.csv /
! docker exec master /opt/hadoop/bin/hdfs dfs -ls /

Found 6 items
drwxr-xr-x   - root supergroup          0 2022-03-26 04:44 /SparkSubmitOperator
drwxr-xr-x   - root supergroup          0 2022-03-26 04:42 /apps
-rw-r--r--   3 root supergroup    2397103 2022-03-26 06:20 /database.csv
drwxr-xr-x   - root supergroup          0 2022-03-26 04:42 /spark-jars
drwxr-xr-x   - root supergroup          0 2022-03-26 06:12 /spark-logs
-rw-r--r--   3 root supergroup       2445 2022-03-26 06:19 /step1_preprocess.py


In [17]:
%%writefile step1_preprocess.py
# %load 4_MongoDB/step1_preprocess.py
## Load Packages

import pyspark
from pyspark.sql import SparkSession  
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.conf import SparkConf
#from pyspark import SparkConf, SparkContext 
            
### Configure spark session
spark = SparkSession\
   .builder\
   .master('spark://master:7077')\
   .appName('quake_etl')\
   .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1')\
   .config('spark.cores.max', '3')\
   .config('spark.executor.memory', '2g')\
   .config('spark.driver.host', 'airflow-webserver')\
   .getOrCreate()

#spark.sparkContext._conf.getAll()

# Load the dataset 
df_load = spark.read.csv('hdfs://master:9000/database.csv', header=True)

# Drop fields we don't need from df_load
lst_dropped_columns = ['Depth Error', 'Time', 'Depth Seismic Stations','Magnitude Error','Magnitude Seismic Stations','Azimuthal Gap', 'Horizontal Distance','Horizontal Error',
    'Root Mean Square','Source','Location Source','Magnitude Source','Status']
df_load = df_load.drop(*lst_dropped_columns)

# Create a year field and add it to the dataframe
df_load = df_load.withColumn('Year', year(to_timestamp('Date', 'dd/MM/yyyy')))

# Build the quakes frequency dataframe using the year field and counts for each year
df_quake_freq = df_load.groupBy('Year').count().withColumnRenamed('count', 'Counts')

# Cast some fields from string into numeric types
df_load = df_load.withColumn('Latitude', df_load['Latitude'].cast(DoubleType()))\
    .withColumn('Longitude', df_load['Longitude'].cast(DoubleType()))\
    .withColumn('Depth', df_load['Depth'].cast(DoubleType()))\
    .withColumn('Magnitude', df_load['Magnitude'].cast(DoubleType()))

# Create avg magnitude and max magnitude fields and add to df_quake_freq
df_max = df_load.groupBy('Year').max('Magnitude').withColumnRenamed('max(Magnitude)', 'Max_Magnitude')
df_avg = df_load.groupBy('Year').avg('Magnitude').withColumnRenamed('avg(Magnitude)', 'Avg_Magnitude')

# Join df_max, and df_avg to df_quake_freq
df_quake_freq = df_quake_freq.join(df_avg, ['Year']).join(df_max, ['Year'])

# Remove nulls
df_load.dropna()
df_quake_freq.dropna()

# Build the tables/collections in mongodb
# Write df_load to mongodb
df_load.write.format('mongo')\
    .mode('overwrite')\
    .option('spark.mongodb.output.uri', 'mongodb://root:go2team@mongo:27017/Quake.quakes?authSource=admin').save()

# Write df_quake_freq to mongodb
df_quake_freq.write.format('mongo')\
    .mode('overwrite')\
    .option('spark.mongodb.output.uri', 'mongodb://root:go2team@mongo:27017/Quake.quake_freq?authSource=admin').save()

spark.stop()


Overwriting step1_preprocess.py


In [26]:
# Remove any stale copy of the script from the master container.
# `-f` keeps this cell from failing when the file is not there yet (e.g. first run).
! docker exec master rm -f /root/step1_preprocess.py

In [27]:
# Copy the locally written step1_preprocess.py into /root/ of the `master` container.
! docker cp step1_preprocess.py master:/root/

In [28]:
# Replace the script at the HDFS root with the copy staged in /root/, then list
# the root to confirm. `-rm -f` does not report an error when the file is
# absent, so the cell also works on a fresh HDFS.
! docker exec master /opt/hadoop/bin/hdfs dfs -rm -f /step1_preprocess.py
! docker exec master /opt/hadoop/bin/hdfs dfs -put /root/step1_preprocess.py /
! docker exec master /opt/hadoop/bin/hdfs dfs -ls /

Deleted /step1_preprocess.py
Found 7 items
drwxr-xr-x   - root supergroup          0 2022-03-26 09:37 /SparkSubmitOperator
drwxr-xr-x   - root supergroup          0 2022-03-18 01:48 /apps
-rw-r--r--   3 root supergroup    2397103 2022-03-26 09:23 /database.csv
drwxr-xr-x   - root supergroup          0 2022-03-18 01:49 /spark-jars
drwxr-xr-x   - root supergroup          0 2022-03-26 09:33 /spark-logs
-rw-r--r--   3 root supergroup       2815 2022-03-26 12:07 /step1_preprocess.py
drwx-wx-wx   - root supergroup          0 2022-03-18 01:41 /tmp


In [31]:
# Print the script as stored in HDFS; this is the file the Airflow DAG submits
# (hdfs://master:9000/step1_preprocess.py).
# NOTE(review): the captured output includes three port settings, one of them
# keyed 'park.driver.blockManager.port', that the %%writefile cell in this
# notebook does not contain. Confirm which version is the intended one.
! docker exec master /opt/hadoop/bin/hdfs dfs -cat /step1_preprocess.py 

# %load 4_MongoDB/step1_preprocess.py
## Load Packages

import pyspark
from pyspark.sql import SparkSession  
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.conf import SparkConf
#from pyspark import SparkConf, SparkContext 
            
### Configure spark session
spark = SparkSession\
   .builder\
   .master('spark://master:7077')\
   .appName('quake_etl')\
   .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1')\
   .config('spark.blockManager.port', '10025')\
   .config('park.driver.blockManager.port', '10026')\
   .config('spark.driver.port', '10027')\
   .config('spark.cores.max', '3')\
   .config('spark.executor.memory', '2g')\
   .config('spark.driver.host', 'airflow-webserver')\
   .getOrCreate()

#spark.sparkContext._conf.getAll()

# Load the dataset 
df_load = spark.read.csv('hdfs://master:9000/database.csv', header=True)

# Drop fields we don't need from df_load
lst_dropped_columns = ['Depth Error', 'Time', 'D

In [20]:
%%writefile ../../dags/dag_basicsparksubmit.py
import airflow
from datetime import timedelta
from airflow import DAG
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator 
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'airflow',    
    #'start_date': airflow.utils.dates.days_ago(2),
    # 'end_date': datetime(),
    # 'depends_on_past': False,
    # 'email': ['airflow@example.com'],
    # 'email_on_failure': False,
    #'email_on_retry': False,
    # If a task fails, retry it once after waiting
    # at least 5 minutes
    #'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag_spark = DAG(
        dag_id = "Spark_MongoDB",
        default_args=default_args,
        # schedule_interval='0 0 * * *',
        schedule_interval='@once',
        dagrun_timeout=timedelta(minutes=60),
        description='use case of sparkoperator in airflow',
        start_date = airflow.utils.dates.days_ago(1)
)

spark_submit_local = SparkSubmitOperator(
    application ='hdfs://master:9000/step1_preprocess.py' ,
    conn_id= 'spark_default', 
    task_id='Spark_MongoDB_preprocess_task', 
    dag=dag_spark
    )

spark_submit_local

if __name__ == "__main__":
    dag_spark.cli()

Overwriting ../../dags/dag_basicsparksubmit.py
