## IPL Data Analysis

Dataset from https://data.world/raghu543/ipl-data-till-2017

This dataset contains ball-by-ball data for all Indian Premier League (IPL) matches up to and including the 2017 season.


**Note!!!**

Here we mount an Azure Data Lake Storage (ADLS) Gen2 container into the Databricks File System (DBFS) and read the data files through that mount point.

In [0]:
# Service-principal credentials must never be committed in a notebook.
# They are read at run time from a Databricks secret scope instead.
# NOTE(review): the scope name and key names below are placeholders; create
# the scope (Azure Key Vault-backed or Databricks-backed) and adjust them to
# match. Any credential that was previously stored in plain text in this
# notebook should be treated as compromised and rotated in Azure AD.
secret_scope = 'ipl-data-analysis'

application_id = dbutils.secrets.get(scope=secret_scope, key='application-id')
directory_id = dbutils.secrets.get(scope=secret_scope, key='directory-id')

client_secret_id = dbutils.secrets.get(scope=secret_scope, key='client-secret-id')
client_secret = dbutils.secrets.get(scope=secret_scope, key='client-secret')

# OAuth2 token endpoint of the Azure AD tenant (directory) that owns the app registration
oauth2_client_endpoint = "https://login.microsoftonline.com/{}/oauth2/token".format(directory_id)

# Configuration for accessing ADLS Gen2 with the OAuth client-credentials flow
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": application_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": oauth2_client_endpoint
}

In [0]:
# Azure storage account and the container inside it that holds the IPL files
storage_account_name = 'adslgen2fortrainings'
storage_container_name = 'ipl-data-analysis'

# Root URI of the container: abfss://<container>@<account>.dfs.core.windows.net/
data_source = f"abfss://{storage_container_name}@{storage_account_name}.dfs.core.windows.net/"

In [0]:
# Show the current DBFS mount table
current_mounts = dbutils.fs.mounts()
display(current_mounts)

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/databricks/mlflow-tracking,databricks/mlflow-tracking,sse-s3
/databricks-results,databricks-results,sse-s3
/databricks/mlflow-registry,databricks/mlflow-registry,sse-s3
/,DatabricksRoot,sse-s3


In [0]:
# DBFS path under which the ADLS Gen2 container is exposed
mountPoint = '/mnt/ipl-data-analysis'

# Mount only when this path is not already in the mount table,
# so the mount call is skipped if the container is already mounted.
mounted_paths = {entry.mountPoint for entry in dbutils.fs.mounts()}
if mountPoint not in mounted_paths:
    dbutils.fs.mount(source=data_source, mount_point=mountPoint, extra_configs=configs)

# Show the mount table again after the (possible) mount
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/databricks/mlflow-tracking,databricks/mlflow-tracking,sse-s3
/databricks-results,databricks-results,sse-s3
/databricks/mlflow-registry,databricks/mlflow-registry,sse-s3
/mnt/ipl-data-analysis,abfss://ipl-data-analysis@adslgen2fortrainings.dfs.core.windows.net/,
/,DatabricksRoot,sse-s3


In [0]:
# List the entries directly under the mounted path
mount_root_listing = dbutils.fs.ls('/mnt/ipl-data-analysis')
display(mount_root_listing)

path,name,size,modificationTime
dbfs:/mnt/ipl-data-analysis/input/,input/,0,1714850927000
dbfs:/mnt/ipl-data-analysis/output/,output/,0,1714850935000


In [0]:
# List the files under the input folder (equivalent of the `%fs ls` magic)
display(dbutils.fs.ls('/mnt/ipl-data-analysis/input'))

path,name,size,modificationTime
dbfs:/mnt/ipl-data-analysis/input/Ball_By_Ball.csv,Ball_By_Ball.csv,25099132,1715079818000
dbfs:/mnt/ipl-data-analysis/input/Match.csv,Match.csv,113358,1715079816000
dbfs:/mnt/ipl-data-analysis/input/Player.csv,Player.csv,34614,1715079816000
dbfs:/mnt/ipl-data-analysis/input/Player_match.csv,Player_match.csv,2664692,1715079817000
dbfs:/mnt/ipl-data-analysis/input/Team.csv,Team.csv,343,1715079816000


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Get the active Spark session (or create one) used for the reads below
spark_context = SparkSession.builder.appName('IPL Data Analysis').getOrCreate()

# Location of the 'ball_by_ball' CSV under the mounted container
path_ball_by_ball = 'dbfs:/mnt/ipl-data-analysis/input/Ball_By_Ball.csv'

# One (column name, Spark type, nullable) entry per field, in schema order.
# The per-dismissal flag columns and keeper_catch are declared as strings here.
_ball_by_ball_fields = [
    ("match_id", IntegerType(), False),
    ("over_id", IntegerType(), False),
    ("ball_id", IntegerType(), False),
    ("innings_no", IntegerType(), False),
    ("team_batting", StringType(), False),
    ("team_bowling", StringType(), False),
    ("striker_batting_position", IntegerType(), True),
    ("extra_type", StringType(), True),
    ("runs_scored", IntegerType(), True),
    ("extra_runs", IntegerType(), True),
    ("wides", IntegerType(), True),
    ("legbyes", IntegerType(), True),
    ("byes", IntegerType(), True),
    ("noballs", IntegerType(), True),
    ("penalty", IntegerType(), True),
    ("bowler_extras", IntegerType(), True),
    ("out_type", StringType(), True),
    ("caught", StringType(), True),
    ("bowled", StringType(), True),
    ("run_out", StringType(), True),
    ("lbw", StringType(), True),
    ("retired_hurt", StringType(), True),
    ("stumped", StringType(), True),
    ("caught_and_bowled", StringType(), True),
    ("hit_wicket", StringType(), True),
    ("obstructingfeild", StringType(), True),
    ("bowler_wicket", StringType(), False),
    ("match_date", DateType(), True),
    ("season", IntegerType(), True),
    ("striker", IntegerType(), True),
    ("non_striker", IntegerType(), True),
    ("bowler", IntegerType(), True),
    ("player_out", IntegerType(), True),
    ("fielders", IntegerType(), True),
    ("striker_match_sk", IntegerType(), True),
    ("strikersk", IntegerType(), True),
    ("nonstriker_match_sk", IntegerType(), True),
    ("nonstriker_sk", IntegerType(), True),
    ("fielder_match_sk", IntegerType(), True),
    ("fielder_sk", IntegerType(), True),
    ("bowler_match_sk", IntegerType(), True),
    ("bowler_sk", IntegerType(), True),
    ("playerout_match_sk", IntegerType(), True),
    ("battingteam_sk", IntegerType(), True),
    ("bowlingteam_sk", IntegerType(), True),
    ("keeper_catch", StringType(), True),
    ("player_out_sk", IntegerType(), True),
    ("matchdatesk", StringType(), True),
]

# Explicit schema for the 'ball_by_ball' dataset, built from the table above
schema_ball_by_ball = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in _ball_by_ball_fields
])

# Pattern used by the CSV reader to parse DateType columns
input_file_date_format = "M/d/yyyy"

# Reader options: comma-delimited, first row is a header, no schema inference
# (the explicit schema is supplied to the reader instead).
input_file_options = dict(
    delimiter=',',
    header='True',
    inferSchema='False',
    dateFormat=input_file_date_format,
)

# Load the ball-by-ball CSV with the explicit schema and the options above
df_ball_by_ball = (
    spark_context.read
    .schema(schema_ball_by_ball)
    .options(**input_file_options)
    .csv(path_ball_by_ball)
)

# Columns read as strings that are converted to booleans below
boolean_flag_columns = [
    'caught',
    'bowled',
    'run_out',
    'lbw',
    'retired_hurt',
    'stumped',
    'caught_and_bowled',
    'hit_wicket',
    'obstructingfeild',
    'bowler_wicket',
    'keeper_catch',
]

# Replace each flag column with its boolean cast in a single withColumns call
df_ball_by_ball = df_ball_by_ball.withColumns(
    {flag_name: F.col(flag_name).cast("boolean") for flag_name in boolean_flag_columns}
)

# Preview the first five rows
df_ball_by_ball.show(5)

+--------+-------+-------+----------+------------+------------+------------------------+----------+-----------+----------+-----+-------+----+-------+-------+-------------+--------------+------+------+-------+-----+------------+-------+-----------------+----------+----------------+-------------+----------+------+-------+-----------+------+----------+--------+----------------+---------+-------------------+-------------+----------------+----------+---------------+---------+------------------+--------------+--------------+------------+-------------+-----------+
|match_id|over_id|ball_id|innings_no|team_batting|team_bowling|striker_batting_position|extra_type|runs_scored|extra_runs|wides|legbyes|byes|noballs|penalty|bowler_extras|      out_type|caught|bowled|run_out|  lbw|retired_hurt|stumped|caught_and_bowled|hit_wicket|obstructingfeild|bowler_wicket|match_date|season|striker|non_striker|bowler|player_out|fielders|striker_match_sk|strikersk|nonstriker_match_sk|nonstriker_sk|fielder_match_s

In [0]:
# Unmount the container, but only when mountPoint is present in the mount table
if any(entry.mountPoint == mountPoint for entry in dbutils.fs.mounts()):
    dbutils.fs.unmount(mountPoint)

/mnt/ipl-data-analysis has been unmounted.
