# DSCI 417 – Homework 08
**Lauren Forti**

In [0]:
# Setup: import the Spark session entry point and the column helpers
# (`col`, `expr`) used by the filter and aggregation steps in this notebook.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

# Reuse the already-active SparkSession if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

## Problem 1: Creating Streaming DataFrame

In [0]:
# Load a single step file as a static (batch) DataFrame so that Spark can
# infer the column types from the data itself.
static_reader = spark.read.options(delimiter=',', header=True, inferSchema=True)
ps = static_reader.csv('/FileStore/tables/paysim/step_001.csv')

# Keep the inferred schema; it is handed to the streaming reader later on.
ps_schema = ps.schema

# Preview the first five rows with full (untruncated) column values.
ps.show(5, truncate=False)

In [0]:
# Configure a streaming reader over the whole PaySim directory. It reuses the
# schema captured from the static read and picks up one file per trigger.
stream_reader = (
    spark.readStream
    .schema(ps_schema)
    .options(header=True, delimiter=',', maxFilesPerTrigger=1)
)
paysim_stream = stream_reader.csv('/FileStore/tables/paysim/')

# Sanity check: prints True when the DataFrame is backed by a streaming source.
print(paysim_stream.isStreaming)

## Problem 2: Apply Transformations

In [0]:
# Summary statistics computed for every transaction type.
type_metrics = [
    expr('count(*) as n'),                         # number of records
    expr('round(mean(amount), 4) as avg_amount'),  # mean amount, 4 decimals
    expr('min(amount) as min_amount'),             # smallest amount
    expr('max(amount) as max_amount'),             # largest amount
]

# One row per transaction type, most frequent type first.
type_summary = (
    paysim_stream
    .groupBy('type')
    .agg(*type_metrics)
    .sort('n', ascending=False)
)

In [0]:
# Statistics computed for every transfer destination account.
dest_metrics = [
    expr('count(*) as n'),                         # number of transfers received
    expr('sum(amount) as total_amount'),           # total amount received
    expr('round(mean(amount), 2) as avg_amount'),  # mean transfer, 2 decimals
]

# Restrict the stream to TRANSFER records, then summarise per destination,
# listing the destinations with the most transfers first.
transfers_only = paysim_stream.where(col('type') == 'TRANSFER')
destinations = (
    transfers_only
    .groupBy('nameDest')
    .agg(*dest_metrics)
    .sort('n', ascending=False)
)

## Problem 3: Define Output Sinks

In [0]:
def _memory_writer(stream_df, query_name):
    """Return an unstarted stream writer targeting an in-memory table.

    The writer uses the 'memory' sink registered under `query_name` and the
    'complete' output mode. Nothing runs until `.start()` is called on it.
    """
    return (
        stream_df.writeStream
        .outputMode('complete')
        .queryName(query_name)
        .format('memory')
    )


# Data stream writers for the two aggregated streams.
t_writer = _memory_writer(type_summary, 'type_summary')
d_writer = _memory_writer(destinations, 'destinations')

## Problem 4: Start and Monitor the Streams

In [0]:
# Start both streaming queries and keep the returned handles so the streams
# can be stopped later.
# NOTE(review): start() returns without waiting for data to be processed, so
# the in-memory tables fill in as triggers complete; cells that query them
# right away may see partial (or empty) results until the stream catches up.
t_query = t_writer.start()
d_query = d_writer.start()

In [0]:
# Snapshot the current contents of the in-memory `type_summary` table into a
# static pandas DataFrame.
type_df = spark.sql('SELECT * from type_summary').toPandas()

# Number of observations (rows). On a pandas DataFrame, .count() returns
# per-column non-null counts rather than a single row count, so use len().
t_n = len(type_df)
print(t_n)

# Show the frame as the cell's output. Standard pandas DataFrames have no
# .display() method, so end the cell with the frame itself.
type_df

type,n,avg_amount,min_amount,max_amount
CASH_OUT,250071,186314.0387,0.37,10000000.0
PAYMENT,234839,11587.4637,0.1,115264.68
CASH_IN,152356,171838.341,5.44,1289407.91
TRANSFER,57588,673235.3867,2.6,10000000.0
DEBIT,4971,5982.907,0.87,150053.28


In [0]:
# Snapshot the current contents of the in-memory `destinations` table into a
# static pandas DataFrame.
dest_df = spark.sql('SELECT * from destinations').toPandas()

# Number of observations (rows). On a pandas DataFrame, .count() returns
# per-column non-null counts rather than a single row count, so use len().
d_n = len(dest_df)
print(d_n)

# Show the first 16 rows.
dest_df.head(16)

Unnamed: 0,nameDest,n,total_amount,avg_amount
0,C665576141,26,26150103.9,1005773.23
1,C1286084959,26,17588259.1,676471.5
2,C248609774,24,24711862.8,1029660.95
3,C1360767589,22,21756362.92,988925.59
4,C306206744,22,13048885.07,593131.14
5,C985934102,21,13709050.24,652811.92
6,C1590550415,20,26350395.92,1317519.8
7,C451111351,20,19783480.34,989174.02
8,C97730845,20,28009878.86,1400493.94
9,C1883840933,19,23864765.21,1256040.27


In [0]:
# Shut down both streaming queries, in the order they were started.
for query in (t_query, d_query):
    query.stop()