# Filtering PySpark dataframes
## March 2018
### Dr Jose M Albornoz

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp

In [2]:
# Define SQLContext
# `sc` only exists when the kernel/shell pre-creates it. getOrCreate() returns
# that already-active SparkContext when there is one and starts a new one
# otherwise, so this cell also works when run standalone.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# 2.- Generic function to load data from a csv file

In [3]:
def load_data(filename, schema, columns = None):
    """Read a semicolon-delimited, header-less CSV file into a DataFrame.

    Parameters
    ----------
    filename : path of the CSV file handed to the Spark reader.
    schema   : StructType applied to the file when loading.
    columns  : optional list of column names to keep; when omitted (None),
               every column named in `schema` is selected.

    Returns
    -------
    DataFrame restricted to the requested columns.
    """
    reader = sqlContext.read.format('com.databricks.spark.csv')
    reader = reader.option("delimiter", ";").options(header='false')
    frame = reader.load(filename, schema = schema)

    # Fall back to the full column list declared by the schema.
    wanted = schema.names if columns is None else columns
    return frame.select(wanted)

# 3.- Data schema

In [4]:
# Column layout applied when loading the card-transaction CSV, in file order.
# Every column is declared nullable. The date is read as a plain string
# (the sample rows show values such as '2017/09/01').
schema = (
    StructType()
    .add('store_number', IntegerType(), True)
    .add('terminal_number', IntegerType(), True)
    .add('transaction_date', StringType(), True)
    .add('transaction_time', IntegerType(), True)
    .add('transaction_amount', IntegerType(), True)
    .add('card_scheme', StringType(), True)
    .add('pan_token', StringType(), True)
    .add('empty_field', IntegerType(), True)
)

# 4.- Load data

In [5]:
# Relative path of the input CSV, passed unchanged to load_data.
filename = 'McD_Card_Data/CT_201709_p1.csv'
# No `columns` argument is given, so load_data keeps every column in `schema`.
df_p1 = load_data(filename, schema)

In [6]:
df_p1.show()

+------------+---------------+----------------+----------------+------------------+-----------+--------------------+-----------+
|store_number|terminal_number|transaction_date|transaction_time|transaction_amount|card_scheme|           pan_token|empty_field|
+------------+---------------+----------------+----------------+------------------+-----------+--------------------+-----------+
|         262|              3|      2017/09/01|               0|               437|          S|D08D751E07A1775D5...|       null|
|         262|             24|      2017/09/01|               2|               319|          S|D943C4129260645C5...|       null|
|         262|             23|      2017/09/01|               3|               396|          S|0F07B1D413871FD68...|       null|
|         262|             22|      2017/09/01|               3|               579|          S|528D75D2EE6F72329...|       null|
|         262|             21|      2017/09/01|               3|                99|          S|E9

In [7]:
df_p1.count()

9962423

In [8]:
# Expose the full dataset to SQL queries under the name 'data1'.
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
df_p1.createOrReplaceTempView('data1')

## 4.1.- Unique stores

In [9]:
# Distinct store numbers found in the full dataset (temp table 'data1').
df_stores = sqlContext.sql("SELECT DISTINCT store_number FROM data1")

In [10]:
df_stores.show()

+------------+
|store_number|
+------------+
|         833|
|         496|
|         463|
|         148|
|        1238|
|         471|
|        1342|
|         737|
|         540|
|        1507|
|        1483|
|        1395|
|        1460|
|        1127|
|         623|
|        1084|
|        1025|
|         858|
|         897|
|          31|
+------------+
only showing top 20 rows



In [11]:
df_stores.count()

1256

In [12]:
# Expose the list of distinct stores to SQL queries under the name 'stores'.
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
df_stores.createOrReplaceTempView('stores')

# 5.- Filter data to generate smaller dataset

In [13]:
# Kept so that any other cell relying on `datetime` being in scope still works.
from datetime import datetime


def dt_parse(date_col):
    """Turn a 'yyyy/MM/dd' string column into a date column.

    Uses Spark's built-in expressions rather than a Python UDF, so rows are
    not shipped to Python workers and a NULL input yields NULL instead of
    raising inside a lambda.

    Parameters
    ----------
    date_col : Column (or column name) holding dates such as '2017/09/01'.

    Returns
    -------
    Column of DateType. Values that cannot be parsed become NULL
    (NOTE(review): the previous UDF raised on such values - confirm that
    NULL is acceptable for malformed dates).
    """
    parsed_seconds = unix_timestamp(date_col, 'yyyy/MM/dd')
    # seconds since epoch -> timestamp -> calendar date
    return parsed_seconds.cast('timestamp').cast('date')


# Add the parsed date alongside the original string column.
df_p2 = df_p1.withColumn('datetime', dt_parse(df_p1.transaction_date))

In [14]:
df_p2.show()

+------------+---------------+----------------+----------------+------------------+-----------+--------------------+-----------+----------+
|store_number|terminal_number|transaction_date|transaction_time|transaction_amount|card_scheme|           pan_token|empty_field|  datetime|
+------------+---------------+----------------+----------------+------------------+-----------+--------------------+-----------+----------+
|         262|              3|      2017/09/01|               0|               437|          S|D08D751E07A1775D5...|       null|2017-09-01|
|         262|             24|      2017/09/01|               2|               319|          S|D943C4129260645C5...|       null|2017-09-01|
|         262|             23|      2017/09/01|               3|               396|          S|0F07B1D413871FD68...|       null|2017-09-01|
|         262|             22|      2017/09/01|               3|               579|          S|528D75D2EE6F72329...|       null|2017-09-01|
|         262|      

In [15]:
# Expose the dataset with the parsed 'datetime' column to SQL as 'data2'.
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
df_p2.createOrReplaceTempView('data2')

In [17]:
# Build the smaller dataset: keep only rows whose parsed date falls on
# day-of-month 1, 2 or 3.
df_p3 = sqlContext.sql("SELECT * FROM data2 WHERE DAY(datetime) < 4")

In [18]:
df_p3.count()

3264391

In [19]:
# Expose the reduced dataset to SQL queries under the name 'data3'.
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
df_p3.createOrReplaceTempView('data3')

In [20]:
# Distinct store numbers that still appear in the reduced dataset ('data3').
df_stores_reduced = sqlContext.sql("SELECT DISTINCT store_number FROM data3")

In [21]:
df_stores_reduced.count()

1250

In [22]:
# Expose the reduced store list to SQL queries under the name 'stores_reduced'.
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
df_stores_reduced.createOrReplaceTempView('stores_reduced')

# 6.- Nested SQL queries to filter data

In [26]:
# Stores present in the full dataset ('data1') but missing from the reduced one.
# The subquery names its column explicitly instead of relying on `SELECT *`
# returning exactly one column, and it drops NULLs: with SQL three-valued
# logic a single NULL in a NOT IN list makes the predicate non-true for every
# row, which would silently return an empty result.
df_filtered = sqlContext.sql(
    "SELECT DISTINCT store_number FROM data1 "
    "WHERE store_number NOT IN "
    "(SELECT store_number FROM stores_reduced WHERE store_number IS NOT NULL)"
)

In [27]:
df_filtered.count()

6