In [1]:
# For Google Colaboratory
!pip install pyspark py4j
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Basics").getOrCreate()



In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
# For Google Colaboratory
import sys, os
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/MyDrive/Big_Data/Practicals' # Please adjust the path accordingly
    os.chdir(path_to_file)
    !pwd

Mounted at /content/gdrive
/content/gdrive/MyDrive/Big_Data/Practicals


## Practical 3: Spark Streaming

This notebook provides a Structured Streaming example using Spark.

Source: https://github.com/databricks/Spark-The-Definitive-Guide

In [4]:
# Use 5 shuffle partitions for this session's Spark SQL shuffles (e.g. the groupBy used later).
spark.conf.set("spark.sql.shuffle.partitions", 5)

In [5]:
# Read the activity JSON files as a static (batch) DataFrame and keep its schema,
# which is passed explicitly to the streaming reader later on.
static = spark.read.json("file:/content/gdrive/MyDrive/Big_Data/Practicals/activity-data/")
dataSchema = static.schema

In [6]:
# Display the schema taken from the static read.
dataSchema

StructType([StructField('Arrival_Time', LongType(), True), StructField('Creation_Time', LongType(), True), StructField('Device', StringType(), True), StructField('Index', LongType(), True), StructField('Model', StringType(), True), StructField('User', StringType(), True), StructField('gt', StringType(), True), StructField('x', DoubleType(), True), StructField('y', DoubleType(), True), StructField('z', DoubleType(), True)])

In [7]:
# Streaming read of the same directory, using the schema captured from the static read.
# maxFilesPerTrigger=1 caps each trigger at one new file.
streaming = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json("file:/content/gdrive/MyDrive/Big_Data/Practicals/activity-data/")
)


In [8]:
# Streaming aggregation: number of rows per value of the `gt` column.
activityCounts = streaming.groupBy("gt").count()


In [9]:
# Query names must be unique among active queries. Stop any leftover query with
# this name first, so that re-running the cell restarts the query instead of raising.
for running_query in spark.streams.active:
    if running_query.name == "activity_counts":
        running_query.stop()

# Start the streaming query that maintains the per-`gt` counts.
# - "memory" sink: results go to an in-memory table named after the query
#   ("activity_counts"), which the later SQL cell selects from.
# - "complete" mode: the whole result table is rewritten after every trigger.
activityQuery = (
    activityCounts.writeStream
    .queryName("activity_counts")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [14]:
# Show the query's most recent progress report (batch id, input row counts, durations, state metrics).
# NOTE(review): the PySpark docs say this is None until the first progress update — confirm for the version in use.
activityQuery.lastProgress

{'id': '8ecf970d-ef9a-4301-868d-4eefc6998ffb',
 'runId': 'a2caf030-ae39-4876-b4ad-5ef93aa77877',
 'name': 'activity_counts',
 'timestamp': '2025-09-02T09:03:20.438Z',
 'batchId': 8,
 'numInputRows': 78012,
 'inputRowsPerSecond': 40631.25,
 'processedRowsPerSecond': 36488.306828811976,
 'durationMs': {'addBatch': 1765,
  'commitOffsets': 81,
  'getBatch': 32,
  'latestOffset': 108,
  'queryPlanning': 86,
  'triggerExecution': 2138,
  'walCommit': 63},
 'stateOperators': [{'operatorName': 'stateStoreSave',
   'numRowsTotal': 7,
   'numRowsUpdated': 7,
   'allUpdatesTimeMs': 336,
   'numRowsRemoved': 0,
   'allRemovalsTimeMs': 0,
   'commitTimeMs': 569,
   'memoryUsedBytes': 4280,
   'numRowsDroppedByWatermark': 0,
   'numShufflePartitions': 5,
   'numStateStoreInstances': 5,
   'customMetrics': {'loadedMapCacheHitCount': 80,
    'loadedMapCacheMissCount': 0,
    'stateOnCurrentVersionSizeBytes': 1864}}],
 'sources': [{'description': 'FileStreamSource[file:/content/gdrive/MyDrive/Big_Data

In [13]:
# Show the query's current status (message, isDataAvailable, isTriggerActive).
activityQuery.status

{'message': 'Getting offsets from FileStreamSource[file:/content/gdrive/MyDrive/Big_Data/Practicals/activity-data]',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [16]:
from time import sleep
# Print the in-memory "activity_counts" table five times, one second apart, so
# successive snapshots show the counts changing as more input is processed.
for x in range(5):  # x is only a repeat counter; its value is not used
    spark.sql("SELECT * FROM activity_counts").show()
    sleep(1)


+----------+------+
|        gt| count|
+----------+------+
|       sit|270775|
|     stand|250477|
|stairsdown|205983|
|      walk|291631|
|  stairsup|230004|
|      null|229846|
|      bike|237543|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|       sit|283083|
|     stand|261861|
|stairsdown|215343|
|      walk|304887|
|  stairsup|240465|
|      null|240292|
|      bike|248340|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|       sit|295391|
|     stand|273245|
|stairsdown|224704|
|      walk|318143|
|  stairsup|250926|
|      null|250737|
|      bike|259138|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|       sit|307699|
|     stand|284629|
|stairsdown|234064|
|      walk|331399|
|  stairsup|261387|
|      null|261182|
|      bike|269935|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|       sit|320007|
|     stand|296013|
|stairsdown|2434