### Connect spark and deltalake

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder \
    .appName("DeltaLake with Hive Integration") \
    .master("local[*]") \
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0,io.delta:delta-spark_2.13:4.0.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.sql.warehouse.dir", "hive") \
    .config("spark.python.worker.timeout", "1200") \
    .config("spark.network.timeout", "1200s") \
    .config("spark.executor.heartbeatInterval", "60s") \
    .config("spark.python.worker.reuse", "true") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "false") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .enableHiveSupport() \
    .getOrCreate()

df_twcs = spark.read.format("delta").load("hdfs://localhost:9000/delta_twcs")
print(df_twcs)

DataFrame[message: string, timestamp_kafka: timestamp]


### Preprocessing tweets data

In [4]:
import pandas as pd 
import numpy as np 
import json

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import StopWordsRemover

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

Map delta format data to dataframe

In [5]:
df_parsed = df_twcs.select(
    col("timestamp_kafka"),
    from_json(
        col("message"),
        "tweet_id string, author_id string, inbound string, created_at string, text string, response_tweet_id string, in_response_to_tweet_id string"
    ).alias("data"))


df_column = df_parsed.select(
    col("timestamp_kafka"),
    col("data.tweet_id").alias("tweet_id"),
    col("data.author_id").alias("author_id"),
    (col("data.inbound") == "True").alias("inbound"),
    col("data.created_at").alias("created_at"),
    col("data.text").alias("text"),
    col("data.response_tweet_id").alias("response_tweet_id"),
    col("data.in_response_to_tweet_id").alias("in_response_to_tweet_id")
).filter(col("tweet_id").isNotNull())


In [5]:
df_parsed.show(10, truncate=False)
df_column.show(10, truncate=False)
df_parsed.printSchema()

+-----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|timestamp_kafka        |data                                                                                                                                                                                                             |
+-----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|2025-06-27 12:57:26.818|{1317615, TMobileHelp, False, Fri Oct 27 08:13:27 +0000 2017, @427759 What's going on, Hadas? *NaeJ, , 1317616}                                                                                                  |
|2025-06-27 12:57:26.818|{NULL, NULL, NULL, NULL, NULL, 

Convert to lowercase

In [6]:
df_twcs_clean = df_column.withColumn("text_clean", lower(col("text")))
df_twcs_clean.show(10, truncate=False)

+-----------------------+--------+-----------+-------+------------------------------+-------------------------------------------------------------------------------------------------------------------------------------+-----------------+-----------------------+-------------------------------------------------------------------------------------------------------------------------------------+
|timestamp_kafka        |tweet_id|author_id  |inbound|created_at                    |text                                                                                                                                 |response_tweet_id|in_response_to_tweet_id|text_clean                                                                                                                           |
+-----------------------+--------+-----------+-------+------------------------------+---------------------------------------------------------------------------------------------------------------------------

Remove punctuation

In [7]:
df_twcs_clean = df_twcs_clean.withColumn(
    "text_clean", regexp_replace(col("text"), r"[^\w\s]", "")
)

df_twcs_clean.show(10, truncate=False)

+-----------------------+--------+-----------+-------+------------------------------+-------------------------------------------------------------------------------------------------------------------------------------+-----------------+-----------------------+---------------------------------------------------------------------------------------------------------------------------------+
|timestamp_kafka        |tweet_id|author_id  |inbound|created_at                    |text                                                                                                                                 |response_tweet_id|in_response_to_tweet_id|text_clean                                                                                                                       |
+-----------------------+--------+-----------+-------+------------------------------+-----------------------------------------------------------------------------------------------------------------------------------

Remove stopwords

In [8]:
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

df_twcs_clean = df_twcs_clean.withColumn("words", split(col("text_clean"), " ")) \
                  .transform(stop_words_remover.transform) \
                  .withColumn("text_clean", concat_ws(" ", col("filtered_words"))) \
                  .drop("words", "filtered_words")

In [21]:
df_twcs_clean.show(10, truncate=False)

+-----------------------+--------+-----------+-------+------------------------------+-------------------------------------------------------------------------------------------------------------------------------------+-----------------+-----------------------+--------------------------------------------------------------------------------+
|timestamp_kafka        |tweet_id|author_id  |inbound|created_at                    |text                                                                                                                                 |response_tweet_id|in_response_to_tweet_id|text_clean                                                                      |
+-----------------------+--------+-----------+-------+------------------------------+-------------------------------------------------------------------------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------

Tokenization

In [9]:
df_twcs_clean = df_twcs_clean.withColumn(
    "tokens", 
    expr("filter(split(text_clean, ' '), x -> NOT x RLIKE '^[0-9]+$')")
)

df_twcs_clean.select("text_clean", "tokens").show(10, truncate=False)

+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+
|text_clean                                                                      |tokens                                                                                |
+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+
|427759 Whats going Hadas NaeJ                                                   |[Whats, going, Hadas, NaeJ]                                                           |
|427761 httpstcoxKtJVNqc2V RileyRed                                              |[httpstcoxKtJVNqc2V, RileyRed]                                                        |
|115911 Hi spoke rep place iPhone X order received sort confirmation email normal|[Hi, spoke, rep, place, iPhone, X, order, received, sort, confirmati

In [9]:
df_twcs_clean.select("text_clean").show(10, truncate=False)

+--------------------------------------------------------------------------------+
|text_clean                                                                      |
+--------------------------------------------------------------------------------+
|427759 Whats going Hadas NaeJ                                                   |
|427761 httpstcoxKtJVNqc2V RileyRed                                              |
|115911 Hi spoke rep place iPhone X order received sort confirmation email normal|
|427761 confirm account jump right RileyRed                                      |
|426997 see DM responding shortly Thanks hanging us JeromyBingham                |
|426997 want make sure information order Lets take look send us DM JeromyBingham |
|426997 nah going store need 3rd                                                 |
|426998 bruh keep trying                                                         |
|426998 Got bro dunno go back sleep wait go work early lol                       |
|426

## Sentiment Analysis

Create label

In [10]:
nltk.download('opinion_lexicon')
from nltk.corpus import opinion_lexicon

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [11]:
positive_keywords = list(opinion_lexicon.positive())
negative_list = list(opinion_lexicon.negative())
custom_negative = ["sucks", "bad", "hate", "worst", "terrible", "awful", "disappointing", "fuck", "shit"]
negative_keywords = negative_list + custom_negative

df_twcs_clean = df_twcs_clean.withColumn(
    "sentiment",
    when(
        size(array_intersect(col("tokens"), array([lit(x) for x in positive_keywords]))) > 0, "positive"
    ).when(
        size(array_intersect(col("tokens"), array([lit(x) for x in negative_keywords]))) > 0, "negative"
    ).otherwise("neutral")
)

In [27]:
df_twcs_clean.select("author_id","text_clean", "sentiment").show(1000, truncate=False)

+---------------+------------------------------------------------------------------------------------------------------------------------------------------+---------+
|author_id      |text_clean                                                                                                                                |sentiment|
+---------------+------------------------------------------------------------------------------------------------------------------------------------------+---------+
|TMobileHelp    |427759 Whats going Hadas NaeJ                                                                                                             |neutral  |
|TMobileHelp    |427761 httpstcoxKtJVNqc2V RileyRed                                                                                                        |neutral  |
|427762         |115911 Hi spoke rep place iPhone X order received sort confirmation email normal                                                          |neutral  

### Save to Data Warehouse

In [14]:
from os.path import abspath

from pyspark.sql import SparkSession
from pyspark.sql import Row

warehouse_location = abspath('spark-warehouse')

spark_hive = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", "hive") \
    .enableHiveSupport() \
    .getOrCreate()

Create database

In [None]:
spark.sql("CREATE DATABASE IF NOT EXISTS db_tgp2")

DataFrame[]

Schema-on-Write

In [13]:
spark.sql("USE db_tgp2")
spark.sql("CREATE TABLE IF NOT EXISTS customer_tweets (id BIGINT GENERATED ALWAYS AS IDENTITY, author_id STRING, inbound STRING, created_at STRING, text STRING, text_clean STRING, response_tweet_id STRING, in_response_to_tweet_id STRING, sentiment STRING) USING hive")

DataFrame[]

Write dataframe to Hive Table

In [14]:
df_twcs_clean.write.mode("overwrite").format("hive").saveAsTable("db_tgp2.customer_tweets")

Test inserted data

In [15]:
df = spark.sql("SELECT * FROM customer_tweets LIMIT 1000")
df.show()

+--------------------+--------+-----------+-------+--------------------+--------------------+-----------------+-----------------------+--------------------+--------------------+---------+
|     timestamp_kafka|tweet_id|  author_id|inbound|          created_at|                text|response_tweet_id|in_response_to_tweet_id|          text_clean|              tokens|sentiment|
+--------------------+--------+-----------+-------+--------------------+--------------------+-----------------+-----------------------+--------------------+--------------------+---------+
|2025-06-27 12:57:...| 1317615|TMobileHelp|  false|Fri Oct 27 08:13:...|@427759 What's go...|                 |                1317616|427759 Whats goin...|[Whats, going, Ha...|  neutral|
|2025-06-27 12:57:...| 1317618|TMobileHelp|  false|Fri Oct 27 08:13:...|@427761 https://t...|                 |                1317619|427761 httpstcoxK...|[httpstcoxKtJVNqc...|  neutral|
|2025-06-27 12:57:...| 1317621|     427762|   true|Fri Oct 2

### Save cleaned data to Data Lake

In [16]:
df_twcs_clean.write.mode("overwrite").format("parquet").save("hdfs://localhost:9000/hdfs_twcs_clean")

In [18]:
df2 = spark.read.parquet("hdfs://localhost:9000/hdfs_twcs_clean")
df2.show(10, truncate=False)


+-----------------------+--------+-----------+-------+------------------------------+-------------------------------------------------------------------------------------------------------------------------------------+-----------------+-----------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+---------+
|timestamp_kafka        |tweet_id|author_id  |inbound|created_at                    |text                                                                                                                                 |response_tweet_id|in_response_to_tweet_id|text_clean                                                                      |tokens                                                                                |sentiment|
+-----------------------+--------+-----------+-------+------------------------------+-----------------------------------