<a href="https://colab.research.google.com/github/jggomez/spark-demo/blob/main/Apache_Spark_neo4j_bigquery.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [None]:
from pyspark.sql.types import StructField, StructType, StringType, LongType
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from datetime import datetime

In [None]:
# Jars handed to Spark through `spark.jars` (Neo4j connector and BigQuery connector).
connector_jars = [
    "/content/neo4j-connector-apache-spark_2.12-4.1.5_for_spark_3.jar",
    "/content/spark-3.3-bigquery-0.29.0-preview.jar",
]

# Build (or reuse) the session with both connector jars configured.
spark_session = (
    SparkSession.builder
    .appName("reading_neo4j_contents_1")
    .config("spark.jars", ", ".join(connector_jars))
    .getOrCreate()
)

# Session-wide credentials file setting; presumably read by the BigQuery connector
# when writing further down - verify against the connector documentation.
spark_session.conf.set("credentialsFile", "/content/wordboxdev-bigquery.json")

# Display the jar list that the session was configured with.
spark_session.conf.get("spark.jars")

'/content/neo4j-connector-apache-spark_2.12-4.1.5_for_spark_3.jar, /content/spark-3.3-bigquery-0.29.0-preview.jar'

# Extract

In [None]:
# Connection settings are taken from the environment so real credentials never have to be
# committed with the notebook. The fallbacks are the placeholder values that were previously
# hard-coded, so behaviour is unchanged when the variables are not set.
# (`os` is imported in the setup cell at the top of the notebook.)
neo4j_url = os.environ.get("NEO4J_URL", "u")
neo4j_username = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "pass")

# Read (series, content) pairs from Neo4j with a Cypher query.
# The query takes two SERIES nodes (LIMIT 2) and follows their HAS_CONTENT relationships,
# returning one row per (series, content) pair.
series_contents = (
    spark_session.read.format("org.neo4j.spark.DataSource")
    .option("url", neo4j_url)
    .option("authentication.type", "basic")
    .option("authentication.basic.username", neo4j_username)
    .option("authentication.basic.password", neo4j_password)
    .option("query", """MATCH (series:SERIES)
                      WITH series
                      LIMIT 2
                      MATCH (series) - [:HAS_CONTENT] -> (content:CONTENT)
                      RETURN series.id as serie_id,
                              content.id as content_id,
                              content.duration as duration,
                              content.n_words as num_words
                      """)
    .load()
)

# Display the resulting DataFrame's schema summary.
series_contents

DataFrame[serie_id: string, content_id: string, duration: double, num_words: bigint]

# Transform

In [None]:
# Register the extracted frame as a temp view so it can be queried with Spark SQL.
series_contents.createOrReplaceTempView("SeriesContent")

# One row per serie_id holding the number of rows for that series.
# Note: the serie_id itself is not selected, so the result shows counts only.
counts_per_series_sql = 'SELECT Count(serie_id) FROM SeriesContent GROUP BY serie_id'
output = spark_session.sql(counts_per_series_sql)
output.show()

+---------------+
|count(serie_id)|
+---------------+
|         107530|
|          74216|
+---------------+



In [None]:
# Total number of (series, content) rows extracted; .count() is a Spark action,
# so this triggers an evaluation of the Neo4j read.
contents_count = series_contents.count()
# Display the total.
contents_count

181746

In [None]:
# Group the extracted rows by series and count the rows in each group.
# The resulting frame has the columns `serie_id` and `count`.
rows_by_series = series_contents.groupBy("serie_id")
series_contents_count = rows_by_series.count()
series_contents_count.show()

# Number of distinct series = number of groups.
series_count = series_contents_count.count()
series_count

+--------------------+------+
|            serie_id| count|
+--------------------+------+
|6e5a96db-9637-11e...|107530|
|6e5a9e2e-9637-11e...| 74216|
+--------------------+------+



2

# Load

In [None]:
# Run timestamp (local time) as month_day_year_hour_minute_second, two digits each;
# it is used below as a suffix for the BigQuery table names.
today = f"{datetime.now():%m_%d_%y_%H_%M_%S}"
today

'08_26_23_06_20_51'

In [None]:
# Schema for the one-row summary table: two nullable 64-bit integer columns.
fields_scheme_basic_data = [
    StructField(column_name, LongType(), True)
    for column_name in ("series_count", "content_count")
]
scheme_basic_data = StructType(fields_scheme_basic_data)

# Start from an empty frame that carries the schema above.
basic_data_counts = spark_session.createDataFrame([], schema=scheme_basic_data)
basic_data_counts

DataFrame[series_count: bigint, content_count: bigint]

In [None]:
# Destination BigQuery table for the summary row, suffixed with the run timestamp.
table_name_contents_basic_data = f"wordboxdev.contents.contents_basic_data_{today}"

# Build the single summary row directly from the schema instead of unioning onto the
# previous value of `basic_data_counts`: the self-referential union appended one more
# duplicate row every time this cell was re-run. On a fresh run the result is the same
# one-row frame with the same schema.
basic_data_counts = spark_session.createDataFrame(
    [(series_count, contents_count)],
    scheme_basic_data,
)

In [None]:
# Persist the summary row to BigQuery using the connector's "direct" write method.
basic_data_writer = (
    basic_data_counts.write.format("bigquery")
    .option("writeMethod", "direct")
    .option("table", table_name_contents_basic_data)
)
basic_data_writer.save()

In [None]:
# Schema for the per-series table: a nullable string id and a nullable 64-bit count.
fields_scheme_serie_content = [
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("serie_id", StringType()),
        ("content_count", LongType()),
    )
]
scheme_serie_content = StructType(fields_scheme_serie_content)

# Empty frame carrying the schema above; rows are added in the next cell.
serie_content_count_insert = spark_session.createDataFrame([], schema=scheme_serie_content)

In [None]:
# Give the grouped counts the column names declared in scheme_serie_content.
# union() matches columns by position and keeps the left-hand frame's names, so the
# `count` column of series_contents_count comes out as `content_count`.
# The empty left-hand frame is rebuilt here rather than reusing the previous value of
# `serie_content_count_insert`, so re-running this cell does not append the rows again.
serie_content_count_insert = (
    spark_session.createDataFrame([], scheme_serie_content)
    .union(series_contents_count)
)
serie_content_count_insert.show()

+--------------------+-------------+
|            serie_id|content_count|
+--------------------+-------------+
|6e5a96db-9637-11e...|       107530|
|6e5a9e2e-9637-11e...|        74216|
+--------------------+-------------+



In [None]:
# Destination BigQuery table for the per-series counts, suffixed with the run timestamp.
table_name = f"wordboxdev.contents.serie_content_count_{today}"

# Write the frame prepared for insertion (columns: serie_id, content_count).
# Previously the raw groupBy result `series_contents_count` was written, whose count column
# is named `count`, leaving `serie_content_count_insert` built but never used.
(
    serie_content_count_insert.write.format("bigquery")
    .option("writeMethod", "direct")
    .option("table", table_name)
    .save()
)