# Sentiment Analysis of Customer Feedback - PySpark Starter Code
This notebook demonstrates how to process, train, and evaluate a machine learning model for sentiment analysis of customer feedback using PySpark.

In [1]:
import sys
IN_COLAB = 'google.colab' in sys.modules
print("Running in Colab:", IN_COLAB)

Running in Colab: False


In [2]:
# !pip install pyspark  # Already preinstalled on Dataproc
# gcsfs not needed in Dataproc

In [3]:
#from google.colab import auth
#auth.authenticate_user()

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Sentiment Analysis") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/23 17:29:45 INFO SparkEnv: Registering MapOutputTracker
25/06/23 17:29:45 INFO SparkEnv: Registering BlockManagerMaster
25/06/23 17:29:45 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/06/23 17:29:45 INFO SparkEnv: Registering OutputCommitCoordinator


In [5]:
# Load the CSV directly from GCS (no need to download locally on Dataproc)
bucket_name = "cis-415-project-jedwardr"
# file_name = "sentiment_small_dataset.csv"
file_name = "sentiment_big_dataset.csv"
gcs_path = f"gs://{bucket_name}/{file_name}"

df = spark.read.csv(gcs_path, header=True, inferSchema=True)
df.show(5)
df.printSchema()

[Stage 2:>                                                          (0 + 1) / 1]

+---------------+--------------------+---------------+-------------+----------------+-------------------+--------------------+-------------------+-------+
|Sentiment_Score|       Feedback_Text|Feedback_Length|Response_Time|Customer_Segment|Interaction_Channel|           Survey_ID|         Time_Stamp|Country|
+---------------+--------------------+---------------+-------------+----------------+-------------------+--------------------+-------------------+-------+
|              1|Excellent experience|             20|        56.85|         Premium|               Chat|668688ad-9965-4cb...|2024-03-13 03:26:41|  India|
|              1|       Great service|             13|        55.55|         Premium|              Phone|1be3fb37-03c8-488...|2024-04-27 12:46:59| Canada|
|              2|    Agent was polite|             16|        33.67|         Premium|              Phone|0efd4a67-ab8c-4c0...|2024-03-26 02:02:44| Canada|
|              1|          Quick help|             10|        12.65|  

                                                                                

In [6]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

In [7]:
# Exploratory Data Analysis (EDA)
df.describe().show()
# Check for missing values
df.select([col(c).isNull().alias(c) for c in df.columns]).show()

25/06/23 17:32:43 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+------------------+------------------+-----------------+------------------+----------------+-------------------+--------------------+-------+
|summary|   Sentiment_Score|     Feedback_Text|  Feedback_Length|     Response_Time|Customer_Segment|Interaction_Channel|           Survey_ID|Country|
+-------+------------------+------------------+-----------------+------------------+----------------+-------------------+--------------------+-------+
|  count|           1000000|           1000000|          1000000|            989998|         1000000|            1000000|             1000000|1000000|
|   mean|          1.300932|              NULL|       117.620502|30.494507645470332|            NULL|               NULL|                NULL|   NULL|
| stddev|0.7807307736451352|              NULL|996.3299336929379| 17.02967640897917|            NULL|               NULL|                NULL|   NULL|
|    min|                 0|  Agent was polite|               10|               1.0|          

In [8]:
# Data Preprocessing
df = df.na.drop()  # Drop rows with null values
df = df.withColumn('Sentiment_Score', col('Sentiment_Score').cast('int'))  # Ensure target is int
# Tokenizing the 'Feedback_Text' column
tokenizer = Tokenizer(inputCol='Feedback_Text', outputCol='words')
# Vectorizing the words column
vectorizer = CountVectorizer(inputCol='words', outputCol='features')
# Indexing the target variable 'Sentiment_Score'
indexer = StringIndexer(inputCol='Sentiment_Score', outputCol='label')

In [9]:
# Train/Test Split
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
train_df.show(5)

[Stage 7:>                                                          (0 + 1) / 1]

+---------------+----------------+---------------+-------------+----------------+-------------------+--------------------+-------------------+-------+
|Sentiment_Score|   Feedback_Text|Feedback_Length|Response_Time|Customer_Segment|Interaction_Channel|           Survey_ID|         Time_Stamp|Country|
+---------------+----------------+---------------+-------------+----------------+-------------------+--------------------+-------------------+-------+
|              0|Agent was polite|             16|         1.01|        Standard|              Phone|c96bd065-dc67-430...|2024-02-21 01:53:19|     UK|
|              0|Agent was polite|             16|         1.02|         Premium|               Chat|ec83b2d2-8941-4b2...|2024-01-11 10:21:44|  India|
|              0|Agent was polite|             16|         1.02|        Standard|              Phone|731888f4-b02b-4b1...|2024-02-05 14:14:00|  India|
|              0|Agent was polite|             16|         1.03|           Basic|             

                                                                                

In [10]:
# Train Logistic Regression Model
lr = LogisticRegression(maxIter=10, regParam=0.01)
lr_pipeline = Pipeline(stages=[tokenizer, vectorizer, indexer, lr])

lr_model = lr_pipeline.fit(train_df)
lr_predictions = lr_model.transform(test_df)

lr_predictions.select('Feedback_Text', 'Sentiment_Score', 'prediction').show(5)

[Stage 27:>                                                         (0 + 1) / 1]

+----------------+---------------+----------+
|   Feedback_Text|Sentiment_Score|prediction|
+----------------+---------------+----------+
|Agent was polite|              0|       0.0|
|Agent was polite|              0|       0.0|
|Agent was polite|              0|       0.0|
|Agent was polite|              0|       0.0|
|Agent was polite|              0|       0.0|
+----------------+---------------+----------+
only showing top 5 rows



                                                                                

In [11]:
# Train Naive Bayes Model
nb = NaiveBayes(modelType='multinomial', labelCol='label', featuresCol='features')
nb_pipeline = Pipeline(stages=[tokenizer, vectorizer, indexer, nb])

nb_model = nb_pipeline.fit(train_df)
nb_predictions = nb_model.transform(test_df)

                                                                                

In [12]:
# Model Evaluation (Accuracy + F1)

# Accuracy
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
lr_accuracy = evaluator.evaluate(lr_predictions)
nb_accuracy = evaluator.evaluate(nb_predictions)

print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")

# F1 Score
f1_eval = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='f1')
lr_f1 = f1_eval.evaluate(lr_predictions)
nb_f1 = f1_eval.evaluate(nb_predictions)

print(f"Logistic Regression F1 Score: {lr_f1:.4f}")
print(f"Naive Bayes F1 Score: {nb_f1:.4f}")

                                                                                

Logistic Regression Accuracy: 0.5000
Naive Bayes Accuracy: 0.5000




Logistic Regression F1 Score: 0.3333
Naive Bayes F1 Score: 0.3333


                                                                                

# Next Steps
Once the model is validated with the small dataset, you can scale this pipeline to the big dataset using Dataproc.