In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import PipelineModel

import requests
import json
import re  # Import the regular expressions module
import csv
import pandas as pd
import datetime
import time
import uuid
from kafka import KafkaConsumer
from kafka import KafkaProducer

In [2]:
# In a shell, create the Kafka topics (the loop below uses bash syntax)

# for topic in BusinessNewsTopic EntertainmentNewsTopic MalaysiaNewsTopic PoliticsNewsTopic SportsNewsTopic
# do
#   echo "Creating topic: $topic"
#   bin/kafka-topics.sh --create \
#     --topic $topic \
#     --bootstrap-server localhost:9092 \
#     --partitions 1 \
#     --replication-factor 1
# done


In [3]:
# Run once from ZW's code
# Keyword lists used to route a tweet to a Kafka topic.
# Each key is a topic name. Insertion order is significant because
# categorize_tweet returns the first category that has a matching keyword.
CATEGORY_KEYWORDS = dict(
    PoliticsNewsTopic=[
        'parliament', 'minister', 'government', 'election', 'policy',
        'vote', 'cabinet', 'PM', 'democracy', 'corruption',
        'political', 'politician', 'law', 'bill', 'constitution',
        'amendment', 'opposition', 'campaign', 'UMNO', 'PAS',
        'PKR', 'DAP', 'Bersatu', 'Pakatan', 'Barisan',
        'budget',
    ],
    BusinessNewsTopic=[
        'economy', 'market', 'stock', 'investment', 'company',
        'business', 'trade', 'finance', 'bank', 'ringgit',
        'profit', 'revenue', 'CEO', 'entrepreneur', 'startup',
        'commerce', 'industry', 'economic', 'inflation', 'recession',
        'growth', 'GST', 'tax', 'BURSA', 'FDI',
    ],
    SportsNewsTopic=[
        'football', 'badminton', 'hockey', 'athlete', 'tournament',
        'championship', 'league', 'match', 'player', 'coach',
        'team', 'sport', 'medal', 'win', 'game',
        'score', 'FIFA', 'Olympic', 'Petronas', 'stadium',
        'final', 'competition', 'record', 'JDT', 'Selangor',
        'Perak', 'Malaysia Super League',
    ],
    EntertainmentNewsTopic=[
        'movie', 'music', 'concert', 'celebrity', 'actor',
        'actress', 'film', 'entertainment', 'drama', 'show',
        'artist', 'singer', 'star', 'TV', 'Netflix',
        'performance', 'premiere', 'award', 'festival', 'viral',
        'album', 'song', 'talent', 'meme', 'trending',
        'Astro', 'Media Prima',
    ],
)

# Read the sentiment-labelled tweets from the CSV export into a pandas frame
df = pd.read_csv('tweets_output_with_sentiment.csv')


def _encode_json(payload):
    """Serialize a message payload to UTF-8 encoded JSON bytes."""
    return json.dumps(payload).encode('utf-8')


# Kafka producer for the local broker; each message value is JSON-encoded
# by the serializer before it is sent.
producer = KafkaProducer(
    value_serializer=_encode_json,
    bootstrap_servers=['localhost:9092'],
)

# Define function to categorize tweets based on keywords
def categorize_tweet(tweet_text, category_keywords=None):
    """Return the Kafka topic name whose keyword list matches ``tweet_text``.

    Categories are checked in mapping order and the first category with a
    matching keyword wins.

    Matching rules:
      * Matching is case-insensitive: both the tweet and the keyword are
        lower-cased. Previously only the tweet was lower-cased, so keywords
        containing capitals ('PM', 'UMNO', 'Netflix', ...) could never match.
      * All-uppercase keywords are treated as acronyms and must appear as a
        whole word, so 'PM' does not fire on 'development' nor 'PAS' on 'past'.
      * Every other keyword keeps the original substring semantics.

    Args:
        tweet_text: The tweet body. Non-string values (e.g. NaN for a missing
            CSV cell) fall through to the default topic.
        category_keywords: Optional mapping of topic name -> list of keywords.
            Defaults to the module-level CATEGORY_KEYWORDS.

    Returns:
        The matching topic name, or 'MalaysiaNewsTopic' when nothing matches
        or the input is not a string.
    """
    if category_keywords is None:
        category_keywords = CATEGORY_KEYWORDS

    # Invalid / missing tweet text goes straight to the default topic
    if not isinstance(tweet_text, str):
        return 'MalaysiaNewsTopic'

    text = tweet_text.lower()
    for category, keywords in category_keywords.items():
        for keyword in keywords:
            needle = keyword.lower()
            if keyword.isupper():
                # Acronym: require word boundaries to avoid matching inside
                # ordinary words once the case is folded.
                if re.search(r'\b' + re.escape(needle) + r'\b', text):
                    return category
            elif needle in text:
                return category
    return 'MalaysiaNewsTopic'  # Default topic if no category is found

# Iterate and send data to the appropriate Kafka topic.
# The try/finally guarantees the producer is closed even if building or
# sending a message raises part-way through the loop; previously an error
# left the producer open.
try:
    for index, row in df.iterrows():
        # Message payload: one JSON object per CSV row
        tweet_data = {
            'user_id': row['User ID'],
            'name': row['Name'],
            'followers_count': row['Followers Count'],
            'tweet_text': row['Tweet'],
            'sentiment': row['Sentiment']
        }

        # Categorize the tweet into the correct topic
        topic = categorize_tweet(row['Tweet'])

        # Send each row to the appropriate Kafka topic
        producer.send(topic, value=tweet_data)

        # Print confirmation (optional)
        print(f"Sent tweet from {row['Name']} to {topic}")

    # Wait for all buffered messages to be sent before reporting success
    producer.flush()
finally:
    # Close the producer on both the success and the failure path
    producer.close()
print("All tweets have been sent to Kafka")


Sent tweet from The Star to BusinessNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to EntertainmentNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to BusinessNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to MalaysiaNewsTopic
Sent tweet from The Star to BusinessNewsTopic
Sent tweet from The Star to EntertainmentNewsTopic
Sent tweet from The Star

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StringType
from pyspark.ml.tuning import CrossValidatorModel
from classes.TextPreprocessor import TextPreprocessor
from pyspark.sql.utils import StreamingQueryException
import logging

# Named logger for this notebook; root logging is configured at INFO level
logger = logging.getLogger("SentimentAnalysisStream")
logging.basicConfig(level=logging.INFO)

In [5]:
# 1. Initialize Spark Session
# The spark-sql-kafka package is requested via spark.jars.packages so the
# "kafka" stream source used later in the notebook is available.
spark = (
    SparkSession.builder
    .appName("Real-Time Sentiment Analysis")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.1")
    .getOrCreate()
)

# Reduce Spark's own log output to warnings and above
spark.sparkContext.setLogLevel("WARN")

25/04/09 21:52:55 WARN Utils: Your hostname, Ck. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/09 21:52:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/hduser/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/student/.ivy2/cache
The jars for the packages stored in: /home/student/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1cc8e5bd-66f8-48cb-b919-22e0d7ecf939;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.scala-lang.modules#scala-parallel-collections_2.13;1.0.4 in central
	found org.apache.commons#commons-pool2

In [6]:
try:
    # 2. Fit preprocessing pipeline on static data
    # Keep only the tweet text and its label, dropping null / empty tweets.
    static_df = (
        spark.read.csv(
            "tweets_output_with_sentiment_1.csv",
            header=True, inferSchema=True
        )
        .selectExpr("Tweet", "Sentiment as sentiment")
        .filter(col("Tweet").isNotNull())
        .filter(col("Tweet") != "")
    )

    # TextPreprocessor is a project class; get_pipeline() presumably returns a
    # pyspark.ml Pipeline (it is fitted with .fit below) - TODO confirm.
    text_preprocessor = TextPreprocessor(input_col="Tweet", label_col="sentiment")
    preprocessing_pipeline = text_preprocessor.get_pipeline()
    pipeline_model = preprocessing_pipeline.fit(static_df)

except StreamingQueryException as e:
    logger.error("Streaming Query Exception: %s", e)
    # Re-raise: pipeline_model is required by the streaming cells, so
    # swallowing the error only defers the failure to a confusing NameError.
    raise
except Exception as e:
    logger.error("General Exception: %s", e)
    raise

                                                                                

In [7]:
from pyspark.sql.types import StructType, StringType


try:
    # 3. Define Kafka tweet schema
    # One string-typed field per key of the JSON payload sent by the producer.
    tweet_schema = StructType()
    for field_name in ("user_id", "name", "followers_count", "tweet_text", "sentiment"):
        tweet_schema = tweet_schema.add(field_name, StringType())

except StreamingQueryException as e:
    logger.error("Streaming Query Exception: %s", e)
except Exception as e:
    logger.error("General Exception: %s", e)

In [8]:
# 4. Read from Kafka
# Subscribe to all five news topics on the local broker, starting from the
# earliest available offsets.
raw_kafka = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option(
        "subscribe",
        ",".join([
            "BusinessNewsTopic",
            "EntertainmentNewsTopic",
            "MalaysiaNewsTopic",
            "PoliticsNewsTopic",
            "SportsNewsTopic",
        ]),
    )
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", "false")
    .load()
)

# raw_kafka.selectExpr("CAST(value AS STRING)").writeStream.outputMode("append").format("console").start().awaitTermination()

# 4B. Convert raw Kafka JSON to structured DataFrame
# The Kafka value is cast to a string, parsed with tweet_schema, and the
# resulting struct is flattened into top-level columns.
json_strings = raw_kafka.selectExpr("CAST(value AS STRING) as json_string")
tweet_struct = from_json(col("json_string"), tweet_schema).alias("data")
parsed_data = json_strings.select(tweet_struct).select("data.*")

# Rename the text column to "Tweet" and drop rows where it is null
renamed_data = parsed_data.withColumnRenamed("tweet_text", "Tweet")
json_data = renamed_data.filter(col("Tweet").isNotNull())

In [10]:
# 5. Apply preprocessing: run the fitted preprocessing pipeline over the stream
preprocessed_data = pipeline_model.transform(json_data)

# 6. Load trained model (a saved CrossValidatorModel stored on HDFS)
model = CrossValidatorModel.load("hdfs://localhost:9000/user/student/random_forest")

# 7. Predict sentiment using the best model selected by cross-validation
best_model = model.bestModel
predicted_data = best_model.transform(preprocessed_data)

In [11]:
# # 8A. Output to console
# # Run only console output first
# console_query = predicted_data.select("Tweet", "prediction", "sentiment", "name") \
#     .writeStream \
#     .outputMode("append") \
#     .format("console") \
#     .option("truncate", False) \
#     .start()

# console_query.awaitTermination()

# Write to memory table instead of console
memory_query = (
    predicted_data.select("Tweet", "prediction", "sentiment", "name")
    .writeStream
    .queryName("sentimentStream")
    .outputMode("append")
    .format("memory")
    .start()
)

# Wait until data appears
import time

# Poll the in-memory table once per second. The try/finally makes sure the
# streaming query is stopped on every exit path (success, error, interrupt).
try:
    while True:
        df = spark.sql("SELECT * FROM sentimentStream")
        if df.count() > 0:
            df.show(truncate=False)
            break
        if not memory_query.isActive:
            # The query terminated without producing rows; surface its error
            # instead of polling an empty table forever.
            failure = memory_query.exception()
            raise RuntimeError(
                "sentimentStream stopped before producing any rows"
            ) from failure
        time.sleep(1)
finally:
    memory_query.stop()

25/04/10 01:02:36 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-c1a5eba7-9699-48a1-87a6-13ef2f8c5b68. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/04/10 01:02:36 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/04/10 01:02:36 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/04/10 01:02:37 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/10 01:02:37 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Uninterrupti

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+---------+-------------------+
|Tweet                                                                                                                                                                                                                                                    |prediction|sentiment|name               |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+---------+-------------------+
|#Singapore Lee: “This is going to affect our trade, it’s going to affect our economy, it’s going to affect our region, a

In [12]:
# 8B. Output to HDFS in Parquet format
# Append the selected prediction columns to a Parquet directory on HDFS,
# using an explicit checkpoint location for the query.
prediction_columns = predicted_data.select("Tweet", "prediction", "sentiment", "name")
parquet_query = (
    prediction_columns.writeStream
    .outputMode("append")
    .format("parquet")
    .option("path", "hdfs://localhost:9000/user/student/processed_sentiments")
    .option("checkpointLocation", "hdfs://localhost:9000/user/student/checkpoints/sentiments")
    .start()
)

25/04/10 01:10:38 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_json, struct
import pymongo

In [1]:
from de_classes.data_loader import data_loader
import json

# Path of the input file handed to data_loader.from_json_lines.
# NOTE(review): the name suggests JSON Lines content - confirm against data_loader.
file = "full_sentiments.json"

# Load the records with the project's data_loader helper. Only AttributeError
# is handled; any other failure propagates to the caller.
try:
    data = data_loader.from_json_lines(file)
    print(f"Successfully loaded {len(data)} records")
except AttributeError:
    # Method 2: Alternative approach if the import is problematic
    # (presumably `data_loader` did not expose `from_json_lines` - verify).
    print("Import failed. Trying alternative approach...")
    
    # Import the specific file directly
    import sys
    import os
    
    # Add the parent directory to the path if needed.
    # Despite its name, current_dir is the PARENT of the working directory
    # (dirname of getcwd).
    current_dir = os.path.dirname(os.getcwd())
    if current_dir not in sys.path:
        sys.path.append(current_dir)
    
    # Re-import with the full path (must happen after the sys.path change)
    import de_classes.data_loader
    # Force reload in case it was previously imported incorrectly
    import importlib
    importlib.reload(de_classes.data_loader)
    
    # Try again, this time through the fully qualified module attribute
    data = de_classes.data_loader.data_loader.from_json_lines(file)
    print(f"Successfully loaded {len(data)} records with alternative method")


Successfully loaded 300 records


In [2]:
from de_classes.pymongo_utils import PyMongoUtils

# Database and collection names for the sentiment results
mongo_db = "twitter_sentiment"
mongo_collection = "sentiment_analysis"

# Create the project's Mongo helper, then fetch the target collection
# through it (get_collection is a project helper).
mongo_obj = PyMongoUtils()
collection = mongo_obj.get_collection(mongo_db, mongo_collection)


In [3]:

from de_classes.sentiment_insertion import sentiment_insertion

try:
    # Wrap the collection in the project's insertion helper and bulk-insert
    # the records held in `data`.
    # NOTE(review): nothing in this cell guards against inserting the same
    # records again when the cell is re-run - confirm whether that is intended.
    inserter = sentiment_insertion(collection)
    inserter.insert_many(data)
    print("Data inserted successfully to MongoDB Atlas!")

except Exception as exc:
    # Any failure in the block above is reported with these two lines
    print(
        f"Error connecting to MongoDB Atlas: {exc}",
        "Please check your internet connection and verify the connection string.",
        sep="\n",
    )



Data inserted successfully to MongoDB Atlas!


In [7]:
from de_classes.sentiment_query import sentiment_query

try:
    # Project query helper bound to the sentiment collection
    query = sentiment_query(collection)

    # Get tweets mentioning the keyword
    tourism_tweets = query.search_tweets("tourism")
    match_count = len(tourism_tweets)
    print(f"Found {match_count} tweets mentioning 'tourism'")

    # Print a sample (at most the first 99 matches)
    sample = tourism_tweets[:99]
    for tweet in sample:
        print(tweet)

except Exception as exc:
    # Any failure in the block above is reported with these two lines
    print(f"Error connecting to MongoDB Atlas: {exc}")
    print("Unable to query data")

Found 3 tweets mentioning 'tourism'
{'_id': ObjectId('67f7c665c824564f2443bd95'), 'Tweet': 'According to Widiyanti, the tourism sector, classified as a service industry, is not subject to such tariffs and continues to generate foreign exchange.', 'prediction': 0.0, 'sentiment': 'Neutral', 'name': 'The Star'}
{'_id': ObjectId('67f7d1253fb5842680afba02'), 'Tweet': 'According to Widiyanti, the tourism sector, classified as a service industry, is not subject to such tariffs and continues to generate foreign exchange.', 'prediction': 0.0, 'sentiment': 'Neutral', 'name': 'The Star'}
{'_id': ObjectId('67f7d1b4879e5494807a5c9b'), 'Tweet': 'According to Widiyanti, the tourism sector, classified as a service industry, is not subject to such tariffs and continues to generate foreign exchange.', 'prediction': 0.0, 'sentiment': 'Neutral', 'name': 'The Star'}
