In [1]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import avg
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF

In [3]:
# Initialize Spark Session
spark = SparkSession.builder.appName("Sentiment Analysis").getOrCreate()

24/12/01 13:54:19 WARN Utils: Your hostname, Kowshiks-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.20.10.105 instead (on interface en0)
24/12/01 13:54:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/01 13:54:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load Dataset
reviews_df = spark.read.csv("Dataset-SA.csv", header=True, inferSchema=True)

# Display Dataset Schema
reviews_df.printSchema()

[Stage 1:>                                                          (0 + 8) / 8]

root
 |-- product_name: string (nullable = true)
 |-- product_price: string (nullable = true)
 |-- Rate: string (nullable = true)
 |-- Review: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Sentiment: string (nullable = true)




                                                                                

In [5]:
reviews_df.head(5)

[Row(product_name='Candes 12 L Room/Personal Air Cooler??????(White, Black, Elegant High Speed-Honey Comb Cooling Pad & Ice Chamber, Blower)', product_price='3999', Rate='5', Review='super!', Summary='great cooler excellent air flow and for this price its so amazing and unbelievablejust love it', Sentiment='positive'),
 Row(product_name='Candes 12 L Room/Personal Air Cooler??????(White, Black, Elegant High Speed-Honey Comb Cooling Pad & Ice Chamber, Blower)', product_price='3999', Rate='5', Review='awesome', Summary='best budget 2 fit cooler nice cooling', Sentiment='positive'),
 Row(product_name='Candes 12 L Room/Personal Air Cooler??????(White, Black, Elegant High Speed-Honey Comb Cooling Pad & Ice Chamber, Blower)', product_price='3999', Rate='3', Review='fair', Summary='the quality is good but the power of air is decent', Sentiment='positive'),
 Row(product_name='Candes 12 L Room/Personal Air Cooler??????(White, Black, Elegant High Speed-Honey Comb Cooling Pad & Ice Chamber, Blower

In [13]:
# Text Preprocessing
# Tokenize
tokenizer = Tokenizer(inputCol="Review", outputCol="words")
tokenized_df = tokenizer.transform(reviews_df)

# Remove Stop Words
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_df = stop_words_remover.transform(tokenized_df)

In [14]:
# Count Vectorizer
vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="raw_features")
vectorizer_model = vectorizer.fit(filtered_df)
vectorized_df = vectorizer_model.transform(filtered_df)

                                                                                

In [15]:
# IDF Transformation
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(vectorized_df)
idf_df = idf_model.transform(vectorized_df)

                                                                                

In [19]:
from pyspark.sql.functions import when, col

# Map sentiment values to integers (adjust mappings as needed)
idf_df = idf_df.withColumn(
    "Sentiment",
    when(col("Sentiment") == "positive", 1)
    .when(col("Sentiment") == "negative", 0)
    .otherwise(None)  # Handle any unexpected values
)

# Alternatively, if already numeric strings, cast directly
# idf_df = idf_df.withColumn("Sentiment", col("Sentiment").cast("int"))

In [23]:
# Split Data
train_df, test_df = idf_df.randomSplit([0.8, 0.2], seed=42)

# Train Logistic Regression Model
lr = LogisticRegression(featuresCol="features", labelCol="Sentiment")
model = lr.fit(train_df)

24/12/01 14:18:04 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/12/01 14:18:04 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

In [25]:
# Evaluate Model
test_predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="Sentiment", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(test_predictions)
print(f"Model Accuracy: {accuracy}")



Model Accuracy: 0.9418757914868322



                                                                                

In [27]:
# Sentiment Scoring
# Predict Sentiments for All Data
predictions = model.transform(idf_df)

# Calculate Aggregate Sentiment Score per Product
sentiment_scores = predictions.groupBy("product_name").agg(avg("prediction").alias("average_sentiment_score"))
sentiment_scores.show()



+--------------------+-----------------------+
|        product_name|average_sentiment_score|
+--------------------+-----------------------+
|BLUECHIP BL43Smar...|     0.9072164948453608|
|CEAT Hitman Full ...|     0.8770053475935828|
|Butterfly Rapid K...|      0.868421052631579|
|ArrowMax Professi...|     0.9787234042553191|
|7SEVEN Airtel DTH...|                    1.0|
|SAMSUNG DW60M5042...|     0.7692307692307693|
|Candes Florence 1...|     0.8659217877094972|
|BAJAJ Crest Neo 1...|     0.8615819209039548|
|D&V ENGINEERING 2...|                    1.0|
|PUHBRHY Steel Ele...|     0.8928571428571429|
|Hindware Ripple 9...|     0.9007352941176471|
|Flipkart SmartBuy...|     0.8686708860759493|
|HAVELLS convenio ...|                    0.4|
|WONDERCHEF Nutri ...|     0.8235294117647058|
|D&V ENGINEERING 2...|                    1.0|
|BLUECHIP BL55Smar...|     0.9440559440559441|
|Prestige Atlas De...|     0.6419753086419753|
|SportSoul Cotton ...|     0.8279569892473119|
|Atomberg Ren


                                                                                

In [28]:
# Save Sentiment Scores to CSV
sentiment_scores.write.csv("sentiment_scores.csv", header=True)

# Stop Spark Session
spark.stop()

                                                                                