#### Author: WEE LING HUE

In [1]:
#Import Necessary Libraries

from pyspark.sql import SparkSession
from classes.TextPreprocessor import TextPreprocessor
from classes.SentimentModelTrainer import SentimentModelTrainer
from classes.TextCleaner import TextCleaner
from classes.Lemmatizer import Lemmatizer
from classes.MalayStemmer import MalayStemmer
from pyspark.sql.functions import col, sum


In [2]:
# Create the Spark session for this notebook, or attach to one that already exists
spark = (
    SparkSession.builder
    .appName("TwitterSentimentAnalysis")
    .getOrCreate()
)

25/04/11 21:34:11 WARN Utils: Your hostname, LAPTOP-NU7VPFG6. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/11 21:34:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/11 21:34:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 56154)
Traceback (most recent call last):
  File "/usr/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 360, in finish_request
    se

## Load Data

In [3]:
# Load the labelled tweets from the local CSV export.
# (Alternative source, currently disabled: the raw tweets stored as JSON on HDFS
#  at hdfs://localhost:9000/user/hduser/raw_tweets, readable with spark.read.json.)
csv_path = "file:///home/student/de-prj/tweets_output_with_sentiment.csv"
csv_options = {
    "header": True,       # first row supplies the column names
    "inferSchema": True,  # let Spark detect the column data types
    "multiLine": True,    # a tweet may span several physical lines
    "escape": '"',        # quote characters inside quoted fields are escaped by a quote
}
df = spark.read.csv(csv_path, **csv_options)

# Report the row count, then preview the first five rows without truncation
print(f" Total rows : {df.count()}")
df.show(5, truncate=False)

 Total rows : 300
+--------+--------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+------------------------------+-------------+---------+
|User ID |Name    |Followers Count|Tweet                                                                                                                                                                                   |Location              |Tweet Time                    |Friends Count|Sentiment|
+--------+--------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+------------------------------+-------------+---------+
|22594051|The Star|1903936        |Prime Minister Datuk Seri Anwar Ibrahim said the m

## Check Data

In [4]:
# Data quality pass: report nulls, drop incomplete rows, then drop exact duplicates.
# Note: `sum` here is pyspark.sql.functions.sum (imported above), not the builtin.

# Count null values per column (one output column per input column)
null_counts = df.select(
    [sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
)
null_counts.show()

# Remove rows with a null in any column
df = df.dropna()
rows_before_dedup = df.count()

# Remove duplicate rows once and reuse the result. The original built and
# counted the de-duplicated frame twice, which re-ran the same Spark job.
df = df.dropDuplicates()
rows_after_dedup = df.count()

duplicate_count = rows_before_dedup - rows_after_dedup
print(f"Duplicate Rows: {duplicate_count}")

# Show first 5 rows after cleaning
df.show(5, truncate=False)

# Print total number of records after cleaning (reuses the count taken above)
print(f"Total Records After Cleaning: {rows_after_dedup}")


+-------+----+---------------+-----+--------+----------+-------------+---------+
|User ID|Name|Followers Count|Tweet|Location|Tweet Time|Friends Count|Sentiment|
+-------+----+---------------+-----+--------+----------+-------------+---------+
|      0|   0|              0|    1|       0|         0|            0|        0|
+-------+----+---------------+-----+--------+----------+-------------+---------+

Duplicate Rows: 0
+---------+-------------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+------------------------------+-------------+---------+
|User ID  |Name               |Followers Count|Tweet                                                                                                                                                                                               |Location     |Twee

## Preprocessing Data

In [5]:
# Split the cleaned tweets 80/20, fit the preprocessing pipeline on the training
# split only, and apply the fitted pipeline to both splits.

# Cache the cleaned frame before splitting. randomSplit builds every split as a
# separate sample over its input, so an uncached input (here the result of
# dropna + dropDuplicates on the CSV read) is recomputed once per split.
# Caching keeps both splits drawn from the same materialized rows.
df = df.cache()

# Split dataset into training (80%) and testing (20%); fixed seed for repeatability
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Initialize the text preprocessing pipeline
text_preprocessor = TextPreprocessor(input_col="Tweet", label_col="Sentiment")

# Get the preprocessing pipeline
pipeline = text_preprocessor.get_pipeline()

# Fit on the training split only, so nothing learned from the test split
# leaks into the fitted stages
pipeline_model = pipeline.fit(train_data)
processed_train_data = pipeline_model.transform(train_data)

# Transform the test data using the same fitted pipeline model
processed_test_data = pipeline_model.transform(test_data)

# Show the original tweet next to every intermediate column the pipeline adds
preview_columns = [
    "Tweet",
    "clean_text",
    "words",
    "filtered_words",
    "lemmatized_words",
    "stemmed_words",
    "features",
    "label",
]
processed_train_data.select(*preview_columns).show(10, truncate=False)


[Stage 38:>                                                         (0 + 1) / 1]

+-------------------------------------------------------------------------------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------+-------------------------------------------------------------------------+-------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|Tweet                                                                                |clean_text                                                                        |words                                                                               

                                                                                

## Save Preprocessed Training & Test Data

In [6]:
# Write both preprocessed splits to HDFS as Parquet, replacing any earlier output
train_output_path = "hdfs://localhost:9000/user/student/processed_train_data"
test_output_path = "hdfs://localhost:9000/user/student/processed_test_data"

# Training split
train_writer = processed_train_data.write.mode("overwrite")
train_writer.parquet(train_output_path)

# Testing split
test_writer = processed_test_data.write.mode("overwrite")
test_writer.parquet(test_output_path)


                                                                                

In [13]:
# List each distinct (numeric label, sentiment text) pair present in the training data
label_pairs = processed_train_data.select("label", "sentiment").distinct()
label_pairs.show()

+-----+---------+
|label|sentiment|
+-----+---------+
|  2.0| Positive|
|  1.0|  Neutral|
|  0.0| Negative|
+-----+---------+



## Save features and label for model building

In [8]:
# HDFS destination for the model-ready columns
hdfs_path = "hdfs://localhost:9000/user/student/preprocessed_data.parquet"

# Keep only the feature vector and its label, then write them as Parquet
# (overwriting any previous output at the same path)
model_input = processed_train_data.select("features", "label")
model_input.write.mode("overwrite").parquet(hdfs_path)

print(f"Preprocessed data saved to {hdfs_path}")

[Stage 53:>                                                         (0 + 1) / 1]

Preprocessed data saved to hdfs://localhost:9000/user/student/preprocessed_data.parquet


                                                                                

# Start Here: Model Training (loads the preprocessed data saved above)

In [1]:
# Step 1: Import Necessary Libraries
from pyspark.sql import SparkSession
from classes.TextPreprocessor import TextPreprocessor
from classes.SentimentModelTrainer import SentimentModelTrainer
from classes.TextCleaner import TextCleaner
from classes.Lemmatizer import Lemmatizer
from classes.MalayStemmer import MalayStemmer

In [2]:
# Create (or reuse) the Spark session for the modelling part of the notebook
spark = (
    SparkSession.builder
    .appName("TwitterSentimentAnalysis")
    .getOrCreate()
)

# Features + label only, as written by the "Save features and label" step
loaded_data = spark.read.parquet(
    "hdfs://localhost:9000/user/student/preprocessed_data.parquet"
)

# Full preprocessed training and testing splits
train_data = spark.read.parquet(
    "hdfs://localhost:9000/user/student/processed_train_data"
)
test_data = spark.read.parquet(
    "hdfs://localhost:9000/user/student/processed_test_data"
)

# Quick sanity check of what was loaded
loaded_data.show(5)

25/04/11 23:54:25 WARN Utils: Your hostname, LAPTOP-NU7VPFG6. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/11 23:54:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/11 23:54:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(10000,[955,1206,...|  0.0|
|(10000,[758,1526,...|  0.0|
|(10000,[777,4821,...|  1.0|
|(10000,[777,2679,...|  0.0|
|(10000,[758,1049,...|  2.0|
+--------------------+-----+
only showing top 5 rows



# Model Building

## 1. Naive Bayes

In [3]:
# Naive Bayes: fit on the training split, then evaluate on both splits
model_type = "naive_bayes"
trainer = SentimentModelTrainer()

# Fit the model
trained_model = trainer.train(train_data, model_type)

# Evaluate with both splits passed in; the recorded output below reports
# training-set and testing-set metrics
evaluation_results = trainer.evaluate(trained_model, train_data, test_data)

# Persist the fitted model to HDFS, replacing any previously saved copy
model_writer = trained_model.write().overwrite()
model_writer.save("hdfs://localhost:9000/user/student/nb_model")

# Confirm where the model was written
print("Model successfully saved at hdfs://localhost:9000/user/student/nb_model")


 Training Set Performance: 
  Accuracy: 0.9921
  Precision: 0.9921
  Recall: 0.9921
  F1: 0.9921

 Testing Set Performance: 
  Accuracy: 0.6596
  Precision: 0.6577
  Recall: 0.6596
  F1: 0.6583

!Possible Overfitting Detected! 
Training accuracy is 0.9921, but testing accuracy is 0.6596.
Model successfully saved at hdfs://localhost:9000/user/student/nb_model


                                                                                

## 2. Random Forest

In [4]:
# Random Forest: fit on the training split, then evaluate on both splits
model_type = "random_forest"
trainer = SentimentModelTrainer()

# Fit the model
trained_model = trainer.train(train_data, model_type)

# Evaluate with both splits passed in; the recorded output below reports
# training-set and testing-set metrics
evaluation_results = trainer.evaluate(trained_model, train_data, test_data)

# Persist the fitted model to HDFS, replacing any previously saved copy
model_writer = trained_model.write().overwrite()
model_writer.save("hdfs://localhost:9000/user/student/rf_model")

# Confirm where the model was written
print("Model successfully saved at hdfs://localhost:9000/user/student/rf_model")

                                                                                

 Training Set Performance: 
  Accuracy: 0.5198
  Precision: 0.7605
  Recall: 0.5198
  F1: 0.4788

 Testing Set Performance: 
  Accuracy: 0.4894
  Precision: 0.6722
  Recall: 0.4894
  F1: 0.4176

 No significant overfitting detected. Model generalizes well! 
Model successfully saved at hdfs://localhost:9000/user/student/rf_model


## 3. Logistic Regression

In [6]:
# Logistic Regression: fit on the training split, then evaluate on both splits
model_type = "logistic_regression"
trainer = SentimentModelTrainer()

# Fit the model
trained_model = trainer.train(train_data, model_type)

# Evaluate with both splits passed in; the recorded output below reports
# training-set and testing-set metrics
evaluation_results = trainer.evaluate(trained_model, train_data, test_data)

# Persist the fitted model to HDFS, replacing any previously saved copy
model_writer = trained_model.write().overwrite()
model_writer.save("hdfs://localhost:9000/user/student/lr_model")

# Confirm where the model was written
print("Model successfully saved at hdfs://localhost:9000/user/student/lr_model")

 Training Set Performance: 
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1: 1.0000

 Testing Set Performance: 
  Accuracy: 0.5745
  Precision: 0.5848
  Recall: 0.5745
  F1: 0.5720

!Possible Overfitting Detected! 
Training accuracy is 1.0000, but testing accuracy is 0.5745.
Model successfully saved at hdfs://localhost:9000/user/student/lr_model


## 4. Decision Tree

In [3]:
# Decision Tree: fit on the training split, then evaluate on both splits
model_type = "decision_tree"
trainer = SentimentModelTrainer()

# Fit the model
trained_model = trainer.train(train_data, model_type)

# Evaluate with both splits passed in; the recorded output below reports
# training-set and testing-set metrics
evaluation_results = trainer.evaluate(trained_model, train_data, test_data)

# Persist the fitted model to HDFS, replacing any previously saved copy
model_writer = trained_model.write().overwrite()
model_writer.save("hdfs://localhost:9000/user/student/dt_model")

# Confirm where the model was written
print("Model successfully saved at hdfs://localhost:9000/user/student/dt_model")

 Training Set Performance: 
  Accuracy: 0.6190
  Precision: 0.6805
  Recall: 0.6190
  F1: 0.6098

 Testing Set Performance: 
  Accuracy: 0.5532
  Precision: 0.5800
  Recall: 0.5532
  F1: 0.5194

 No significant overfitting detected. Model generalizes well! 
Model successfully saved at hdfs://localhost:9000/user/student/dt_model
