In [7]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    TimestampType,
    DecimalType,
    LongType,
    BooleanType,
    DoubleType,
)

In [8]:
session = SparkSession.builder.master("local[1]").appName("LaterProject").getOrCreate()
spark = session.sparkContext
data = range(10000)
distData = spark.parallelize(data)
distData.filter(lambda x: not x & 1).take(10)

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [9]:
# event,timestamp,user_id,platform,hashtag_count
sch_event = StructType(
    [
        StructField("event", StringType(), nullable=False),
        StructField("timestamp", TimestampType(), nullable=False),  # timestamps are required
        StructField("user_id", StringType(), nullable=False),       # user_id is the merge-key and I asssume it is not Nullable.
                                                                    # In cases (like Google Analytics events) that user_id is null,
                                                                    # a stitching strategy should be implemented prior to data ingestion
        StructField("platform", StringType(), nullable=True),
        StructField("hashtag_count", DecimalType(), nullable=True), # hashtag_count is stored as float, better to be defined as Integer
    ]
)
df_events = session.read.options(header=True).schema(sch_event).csv("data/events.csv")
df_events.show(10)
df_events.printSchema()

+--------------------+-------------------+--------------------+--------+-------------+
|               event|          timestamp|             user_id|platform|hashtag_count|
+--------------------+-------------------+--------------------+--------+-------------+
|suggested_hashtag...|2021-11-07 09:56:00| -319166095449930469|    null|            1|
|scheduled_at_best...|2021-11-04 23:28:00|-7923241977147888618|     web|         null|
|suggested_hashtag...|2022-01-22 05:19:00|-2492747712229804050|    null|            9|
|scheduled_at_best...|2022-02-18 21:37:00|-3840912788726453110|    null|         null|
|suggested_hashtag...|2021-11-24 01:57:00| 5545543423372056573|    null|            8|
|suggested_hashtag...|2022-01-02 14:34:00|-2889228288752847947|    null|            6|
|suggested_hashtag...|2021-12-22 22:17:00| 5966878808570092570|    null|            1|
|suggested_hashtag...|2021-11-30 10:54:00| 5661323330809189471|    null|            8|
|suggested_hashtag...|2021-12-09 20:27:00| 

In [10]:
# amount_due,attempt_count,attempted,charge_id,closed,currency,user_id,date,description,discount_id,
# ending_balance,forgiven,id,line_ids,paid,receipt_number,received_at,starting_balance,
# subscription_id,subtotal,tax,tax_percent,total,webhooks_delivered_at
sch_payment = StructType(
    [
        StructField("amount_due", DoubleType(), nullable=True),
        StructField("attempt_count", IntegerType(), nullable=True),
        StructField("attempted", BooleanType(), nullable=True),
        StructField("charge_id", StringType(), nullable=True),
        StructField("closed", BooleanType(), nullable=True),
        StructField("currency", StringType(), nullable=True),
        StructField("user_id", StringType(), nullable=False),       # user_id is the merge-key and I asssume it is not Nullable.
        StructField("date", TimestampType(), nullable=False),       # payment date is required
        StructField("description", StringType(), nullable=True),
        StructField("discount_id", StringType(), nullable=True),
        StructField("ending_balance", DecimalType(), nullable=True),
        StructField("forgiven", BooleanType(), nullable=True),
        StructField("id", StringType(), nullable=False),
        StructField("line_ids", StringType(), nullable=True),
        StructField("paid", BooleanType(), nullable=True),
        StructField("receipt_number", StringType(), nullable=True),
        StructField("received_at", TimestampType(), nullable=True),
        StructField("starting_balance", DecimalType(), nullable=True),
        StructField("subscription_id", StringType(), nullable=True),
        StructField("subtotal", DecimalType(), nullable=True),
        StructField("tax", DecimalType(), nullable=True),
        StructField("tax_percent", DoubleType(), nullable=True),
        StructField("total", DecimalType(), nullable=True),
        StructField("webhooks_delivered_at", StringType(), nullable=True),
    ]
)
df_payment = session.read.options(header=True).schema(sch_payment).csv("data/payments_data.csv")
df_payment.show(10)
df_payment.printSchema()

+------------------+-------------+---------+---------+------+--------+--------------------+-------------------+-----------+--------------------+--------------+--------+--------------------+--------+----+--------------+-------------------+----------------+--------------------+--------+----+-----------+-----+---------------------+
|        amount_due|attempt_count|attempted|charge_id|closed|currency|             user_id|               date|description|         discount_id|ending_balance|forgiven|                  id|line_ids|paid|receipt_number|        received_at|starting_balance|     subscription_id|subtotal| tax|tax_percent|total|webhooks_delivered_at|
+------------------+-------------+---------+---------+------+--------+--------------------+-------------------+-----------+--------------------+--------------+--------+--------------------+--------+----+--------------+-------------------+----------------+--------------------+--------+----+-----------+-----+---------------------+
|5.8100

In [11]:
# id,active,delivered,media_id,media_item_id,posted_at,scheduled_at,social_profile_id,type,user_id

sch_post = StructType(
    [
        StructField("event", StringType(), nullable=False),
        StructField("id", StringType(), nullable=False),
        StructField("active", BooleanType(), nullable=True),
        StructField("delivered", BooleanType(), nullable=True),
        StructField("media_id", StringType(), nullable=True),
        StructField("media_item_id", StringType(), nullable=True),
        StructField("posted_at", TimestampType(), nullable=False),      # posted_at is required
        StructField("scheduled_at", TimestampType(), nullable=True),    # the post could be unscheduled, so Nullable
        StructField("social_profile_id", StringType(), nullable=True),
        StructField("type", StringType(), nullable=True),
        StructField("user_id", StringType(), nullable=False),           # user_id is the merge-key and I asssume it is not Nullable.
    ]
)
df_post = session.read.options(header=True).schema(sch_post).csv("data/posts.csv")
df_post.show(10)
df_post.printSchema()

22/09/17 12:34:55 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 10, schema size: 11
CSV file: file:///mnt/d/Projects/Later/notebooks/data/posts.csv
+--------------------+----+------+---------+--------------------+-------------------+-------------------+------------+-----------------+--------------------+-------+
|               event|  id|active|delivered|            media_id|      media_item_id|          posted_at|scheduled_at|social_profile_id|                type|user_id|
+--------------------+----+------+---------+--------------------+-------------------+-------------------+------------+-----------------+--------------------+-------+
|-2644227440758438680|True|  true|     null| 8702159562118430509|2021-11-14 00:05:00|2021-11-11 00:04:59|        null|    InstagramPost|-8530761503088838316|   null|
| 5054078446395994529|True| false|     null| 4010273432643314211|2021-12-09 22:33:00|2021-12-05 10:32:54|        nul

In [12]:
spark.stop()