In [1]:
import os

import pyspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

In [2]:
# Spark configuration for reading from a GCS bucket.
# This setup is only needed on local/VM machines; it is not required on GCP
# Dataproc clusters.

# Path to the service-account key file. GOOGLE_APPLICATION_CREDENTIALS takes
# precedence when it is set to a non-empty value; otherwise the original local
# path is used, so existing setups keep working unchanged.
credentials_location = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or '/home/jagadish/.gc/finaldatazoomcamp.json'
)

conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName('generate-stats-local')
    # Extra jars loaded into the session: the GCS connector and the
    # Spark BigQuery connector, both expected under ./lib.
    .set("spark.jars", "./lib/gcs-connector-hadoop3-2.2.5.jar,./lib/spark-bigquery-latest_2.12.jar")
    # Authenticate with the service-account key file.
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)
)

In [3]:
# Reuse the active SparkContext if this cell is re-run in the same kernel;
# constructing a second SparkContext directly would raise an error.
# NOTE: when a context already exists, `conf` is not re-applied to it --
# restart the kernel to pick up configuration changes.
sc = SparkContext.getOrCreate(conf=conf)

# Hadoop-level settings so that gs:// paths resolve through the GCS connector.
hadoop_conf = sc._jsc.hadoopConfiguration()

# Implementation classes handling the "gs" scheme.
hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
# Service-account authentication for the connector.
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

23/04/06 14:01:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/06 14:01:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Build (or fetch) the SparkSession using the configuration of the
# SparkContext created above.
spark = (
    SparkSession.builder
    .config(conf=sc.getConf())
    .getOrCreate()
)

In [5]:
# Match-level IPL data (Parquet) stored in the GCS bucket.
ipl_matches_data_gcs_path = "gs://jagadish_data_lake_datazoomcamp-jagadish-final/IPL_Matches_2008_2022.parquet"

# Load the Parquet file into a Spark DataFrame.
ipl_matches = spark.read.format("parquet").load(ipl_matches_data_gcs_path)

                                                                                

In [6]:
# Print the column names, types and nullability of the matches DataFrame.
ipl_matches.printSchema()

root
 |-- id: long (nullable = true)
 |-- city: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- season: long (nullable = true)
 |-- match_number: string (nullable = true)
 |-- team1: string (nullable = true)
 |-- team2: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- toss_winner: string (nullable = true)
 |-- toss_decision: string (nullable = true)
 |-- superover: string (nullable = true)
 |-- winning_team: string (nullable = true)
 |-- won_by: string (nullable = true)
 |-- margin: long (nullable = true)
 |-- method: string (nullable = true)
 |-- player_of_the_match: string (nullable = true)
 |-- team1_players: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- team2_players: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- umpire1: string (nullable = true)
 |-- umpire2: string (nullable = true)



In [7]:
# Ball-by-ball IPL data (Parquet) stored in the GCS bucket.
ipl_ball_by_ball_data_gcs_path = "gs://jagadish_data_lake_datazoomcamp-jagadish-final/IPL_Ball_by_Ball_2008_2022.parquet"

# Load the Parquet file into a Spark DataFrame.
ipl_ball_by_ball = spark.read.format("parquet").load(ipl_ball_by_ball_data_gcs_path)


                                                                                

In [8]:
# Print the column names, types and nullability of the ball-by-ball DataFrame.
ipl_ball_by_ball.printSchema()

root
 |-- id: long (nullable = true)
 |-- innings: long (nullable = true)
 |-- overs: long (nullable = true)
 |-- ball_number: long (nullable = true)
 |-- batter: string (nullable = true)
 |-- bowler: string (nullable = true)
 |-- non_striker: string (nullable = true)
 |-- extra_type: string (nullable = true)
 |-- batsman_run: long (nullable = true)
 |-- extras_run: long (nullable = true)
 |-- total_run: long (nullable = true)
 |-- non_boundary: long (nullable = true)
 |-- is_wicket_delivery: long (nullable = true)
 |-- player_out: string (nullable = true)
 |-- dismissal_type: string (nullable = true)
 |-- fielders_involved: string (nullable = true)
 |-- batting_team: string (nullable = true)



In [11]:
# Write the matches DataFrame to the BigQuery table `ipl_data.matches`,
# replacing any existing contents (mode "overwrite").
(
    ipl_matches.write
    .format("bigquery")
    .mode("overwrite")
    .option("project", "datazoomcamp-jagadish-final")
    .option("writeMethod", "direct")
    .save("ipl_data.matches")
)

                                                                                

In [12]:
# Write the ball-by-ball DataFrame to the BigQuery table `ipl_data.scores`,
# replacing any existing contents (mode "overwrite").
(
    ipl_ball_by_ball.write
    .format("bigquery")
    .mode("overwrite")
    .option("project", "datazoomcamp-jagadish-final")
    .option("writeMethod", "direct")
    .save("ipl_data.scores")
)

                                                                                