In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from google.cloud import bigquery


In [8]:
spark = SparkSession.builder.appName('geoEDA')\
    .config('spark.jars, "gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.23.2.jar"')\
    .getOrCreate()

#conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','4g')])

spark.sparkContext.getConf().getAll()

[('spark.stage.maxConsecutiveAttempts', '10'),
 ('spark.dynamicAllocation.minExecutors', '1'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.submit.pyFiles',
  '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-

In [9]:
df_2020 = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/2020", inferSchema=True, header=True)
df_2020.printSchema()




root
 |-- Trip ID: string (nullable = true)
 |-- Trip Start Timestamp: string (nullable = true)
 |-- Trip End Timestamp: string (nullable = true)
 |-- Trip Seconds: integer (nullable = true)
 |-- Trip Miles: double (nullable = true)
 |-- Pickup Census Tract: long (nullable = true)
 |-- Dropoff Census Tract: long (nullable = true)
 |-- Pickup Community Area: integer (nullable = true)
 |-- Dropoff Community Area: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Tip: integer (nullable = true)
 |-- Additional Charges: double (nullable = true)
 |-- Trip Total: double (nullable = true)
 |-- Shared Trip Authorized: boolean (nullable = true)
 |-- Trips Pooled: integer (nullable = true)
 |-- Pickup Centroid Latitude: double (nullable = true)
 |-- Pickup Centroid Longitude: double (nullable = true)
 |-- Pickup Centroid Location: string (nullable = true)
 |-- Dropoff Centroid Latitude: double (nullable = true)
 |-- Dropoff Centroid Longitude: double (nullable = true)
 |-- Dropof

                                                                                

In [None]:
#df_weather = spark.read.csv("gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv", inferSchema=True, header=True)
#df_weather.printSchema()
#df_weather = df_weather.withColumn('datetime',F.to_date(df_weather['datetime'], "yyyy-MM-dd"))


In [10]:
# Working with just data that contains full information and check for dupes
df_2020 = df_2020.dropna(how='any', subset=['Trip Start Timestamp','Trip End Timestamp','Fare','Dropoff Community Area','Pickup Community Area'])
df_2020 = df_2020.dropDuplicates()
#df_2020.count()

In [11]:
# Drop columns unlikely to be useful for analysis for speed of computation and rename columns to remove spacing for ease of code writing
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

df_2020 = df_2020.drop('Trips Pooled','Additional Charges','Shared Trip Authorized','Pickup Centroid Location','Dropoff Centroid Location')
df_2020 = df_2020.withColumnRenamed("Trip ID","ID").withColumnRenamed("Trip Start Timestamp","start_timestamp").withColumnRenamed("Trip End Timestamp","end_timestamp").withColumnRenamed("Trip Miles",\
                "miles").withColumnRenamed("Pickup Census Tract","pickup_tract").withColumnRenamed("Dropoff Census Tract","dropoff_tract").withColumnRenamed("Pickup Community Area","pickup_area"\
                ).withColumnRenamed("Dropoff Community Area","dropoff_area").withColumnRenamed("Trip Total","total").withColumnRenamed("Pickup Centroid Latitude","pickup_lat").withColumnRenamed(\
                "Pickup Centroid Longitude","pickup_lon").withColumnRenamed("Pickup Centroid Location","pickup_location").withColumnRenamed("Dropoff Centroid Latitude","dropoff_lat").withColumnRenamed(\
                "Dropoff Centroid Longitude","dropoff_lon").withColumnRenamed("Dropoff Centroid Location","dropoff_location")
# fix datatypes
df_2020 = df_2020.withColumn('start_timestamp', F.to_timestamp(df_2020['start_timestamp'], 'MM/dd/yyyy hh:mm:ss a')).withColumn('end_timestamp', F.to_timestamp(df_2020['end_timestamp'], 'MM/dd/yyyy hh:mm:ss a'))

In [8]:
df_2020.show(10)

[Stage 10:>                                                         (0 + 1) / 1]

+--------------------+-------------------+-------------------+------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+-------------+--------------+
|                  ID|    start_timestamp|      end_timestamp|Trip Seconds|miles|pickup_tract|dropoff_tract|pickup_area|dropoff_area|Fare|Tip|total|   pickup_lat|    pickup_lon|  dropoff_lat|   dropoff_lon|
+--------------------+-------------------+-------------------+------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+-------------+--------------+
|69eea42563294c995...|2020-01-16 07:00:00|2020-01-16 07:30:00|        1579|  9.0| 17031010502|  17031081401|          1|           8|15.0|  0|19.83|42.0045174875|-87.6633278588|41.8950334495|-87.6197106717|
|7d8ebe4a54afaa50a...|2020-01-16 07:00:00|2020-01-16 07:45:00|        2081|  7.3|        null|         null|         58|          28|10.0|  0|12.48|41.8173662079|-87.698860

                                                                                

In [14]:
!scala --version

bad option: '--version'

Usage: scala <options> [<script|class|object|jar> <arguments>]
   or  scala -help

All options to scalac (see scalac -help) are also allowed.



In [15]:
df_2020 \
.write \
.mode('overwrite') \
.format('csv') \
.save('gs://msca-bdp-student-gcs/bdp-rideshare-project/clean/clean2020.csv')

                                                                                

In [13]:
gcs_bucket = 'msca-bdp-student-gcs'

# Update to your BigQuery dataset name you created
bq_dataset = 'chicago_rideshare'

# Enter BigQuery table name you want to create or overwite. 
# If the table does not exist it will be created when you run the write function
bq_table = 'clean2020'

df_2020.write.format("bigquery") \
    .mode('overwrite')\
    .option("table", "msca-bdp-student-ap.chicago_rideshare.clean2020") \
    .option("temporaryGcsBucket", gcs_bucket) \
    .save()

Py4JJavaError: An error occurred while calling o589.save.
: java.lang.ClassNotFoundException: Failed to find data source: bigquery. Please find packages at http://spark.apache.org/third-party-projects.html
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:692)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:746)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:993)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:311)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:301)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.ClassNotFoundException: bigquery.DefaultSource
	at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:666)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:666)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:666)
	... 15 more


In [32]:
from pyspark.conf import SparkConf

spark_conf = SparkConf().setAll([
    ("spark.driver.extraClassPath", "/path/to/bigquery/jar"),
    ("spark.executor.extraClassPath", "/path/to/bigquery/jar"),
    ("google.cloud.auth.service.account.enable", "true"),
    ("google.cloud.auth.service.account.json.keyfile", "/path/to/keyfile.json"),
    ("spark.sql.execution.arrow.enabled", "true")
])

spark = SparkSession.builder \
    .appName("database-read-write") \
    .config(conf=spark_conf) \
    .getOrCreate()

project_id = "<project_id>"
dataset_name = "<dataset_name>"
table_name = "<table_name>"

df = spark.read \
    .format("bigquery") \
    .option("table", f"{project_id}:{dataset_name}.{table_name}") \
    .load()

df.write \
    .format("bigquery") \
    .option("table", f"{project_id}:{dataset_name}.{table_name}") \
    .mode("overwrite") \
    .save()


23/11/14 23:22:23 WARN org.apache.spark.sql.internal.SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


Py4JJavaError: An error occurred while calling o853.load.
: java.lang.ClassNotFoundException: Failed to find data source: bigquery. Please find packages at http://spark.apache.org/third-party-projects.html
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:692)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:746)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:265)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:225)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.ClassNotFoundException: bigquery.DefaultSource
	at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:666)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:666)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:666)
	... 14 more


In [18]:
import os

# Import library to enable some interactive widgets
import ipywidgets as widgets
# Import options from itables separately
#import itables.options as itable_opts
# Import basic Python data science libraries
import numpy as np
import pandas as pd
# Import plotly express for quick plots
#import plotly.express as px
# Import AI Platform library (as Vertex AI) and BigQuery library
#from google.cloud import aiplatform as vertex_ai
from google.cloud import bigquery
# Import Exceptions library to help with error catching
from google.cloud.exceptions import BadRequest
# Import IPython display utilities
from IPython.display import clear_output
# Import particular interactive capabilities
from ipywidgets import interact
# Import itables for interactive tables
#from itables import show

In [38]:
PROJECT_ID='MSCA BDP Student - AP'

In [20]:
bq_client = bigquery.Client(project='msca-bdp-student-ap')


In [59]:
dataset_id = 'geodata'
table_id = 'community'

In [21]:
uri = 'gs://msca-bdp-student-gcs/bdp-rideshare-project/neighborhoods/geojson/boundaries_community_areas_current.geojson'  

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    autodetect=True
)

#dataset_ref = msca-bdp-student-ap.dataset('geodata')
#table_ref = dataset_ref.table('community')

load_job = bq_client.load_table_from_uri(
    uri, 'msca-bdp-student-ap.chicago_rideshare.community', job_config=job_config)

load_job.result()  


BadRequest: 400 Error while reading data, error message: Failed to parse JSON: Unexpected end of string; Unexpected end of string; Expected key File: boundaries_community_areas_current.geojson

In [None]:
uri = 'gs://msca-bdp-student-gcs/bdp-rideshare-project/neighborhoods/geojson/boundaries_community_areas_current.geojson'  
df = spark.read.format("json").load(uri)


In [23]:
!bq load \
 --source_format=NEWLINE_DELIMITED_JSON \
 --json_extension=GEOJSON \
 --autodetect \
 chicago_rideshare.community \
 gs://msca-bdp-student-gcs/bdp-rideshare-project/neighborhoods/geojson/boundaries_community_areas_current.geojson

Waiting on bqjob_r25f2847a430f8596_0000018bd052f1bf_1 ... (0s) Current status: DONE   
BigQuery error in load operation: Error processing job 'msca-bdp-student-
ap:bqjob_r25f2847a430f8596_0000018bd052f1bf_1': Error while reading data, error
message: Failed to parse JSON: Unexpected end of string; Unexpected end of
string; Expected key File: boundaries_community_areas_current.geojson
