In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840634 sha256=aadd05d63b83400aca798711633ba047bafcf8be2e418e6b2ae82870f8fc638f
  Stored in directory: /Users/kowshikmosalakanti/Library/Caches/pip/wheels/2e/d2/18/6f4f20e8332359f7fffceb6828edcc80ef96f86744192a7bb9
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.5.3

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, expr, lit, udf, explode
from pyspark.sql.types import StringType, FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Initialize Spark Session
# getOrCreate() hands back the already-running session when this cell is executed again.
spark = (
    SparkSession.builder
    .appName("Sentiment Analysis")
    .getOrCreate()
)

# Load Dataset
# The first row supplies the column names; column types are inferred from the data.
file_path = "amazon_reviews.csv"
df = spark.read.csv(
    file_path,
    inferSchema=True,
    header=True,
)

24/11/30 20:01:11 WARN Utils: Your hostname, Kowshiks-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.20.10.105 instead (on interface en0)
24/11/30 20:01:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/30 20:01:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Preprocessing
def preprocess_data(df):
    """Label reviews by their overall rating and attach TF-IDF features.

    Steps, in order:
      1. Keep only the "reviewText" and "overall" columns and drop rows where
         either one is null.
      2. Add an integer "label" column: 1 when "overall" >= 4, otherwise 0.
      3. Fit and apply a pipeline of Tokenizer -> StopWordsRemover ->
         CountVectorizer -> IDF.

    Args:
        df: DataFrame that contains at least "reviewText" and "overall".

    Returns:
        The filtered, labelled DataFrame with the pipeline's output columns
        added: "words", "filtered", "rawFeatures" and "features".

    Note:
        The pipeline is fit on the same rows it transforms.
        NOTE(review): the call below passes the full dataset and the train/test
        split happens in a later cell, so the vocabulary and IDF weights are
        learned from rows that end up in the test set. Consider fitting on the
        training split only.
    """
    # Narrow to the two columns in use before dropping nulls, so that a null in
    # an unrelated column does not discard a usable review.
    reviews = df.select("reviewText", "overall").na.drop()

    # Binary sentiment target (Positive: >=4, Negative: <4).
    is_positive = col("overall") >= 4
    labelled = reviews.withColumn("label", when(is_positive, 1).otherwise(0))

    # Each stage reads the column written by the stage before it.
    stages = [
        Tokenizer(inputCol="reviewText", outputCol="words"),
        StopWordsRemover(inputCol="words", outputCol="filtered"),
        CountVectorizer(inputCol="filtered", outputCol="rawFeatures"),
        IDF(inputCol="rawFeatures", outputCol="features"),
    ]

    fitted_pipeline = Pipeline(stages=stages).fit(labelled)
    return fitted_pipeline.transform(labelled)

df_preprocessed = preprocess_data(df)

                                                                                

In [5]:
# Split data into training and test sets
# 80% of the rows go to training and 20% to testing; the fixed seed keeps the split repeatable.
train, test = df_preprocessed.randomSplit(weights=[0.8, 0.2], seed=42)

# Train Sentiment Classification Model
# Logistic regression on the "features" column against the binary "label" column.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
)
model = lr.fit(train)

24/11/30 20:01:57 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/11/30 20:01:57 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

In [6]:
# Evaluate Model
# Score the held-out rows; transform() adds the model's "prediction" column.
predictions = model.transform(test)

# Accuracy = share of rows whose "prediction" equals "label".
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.2f}")

# Sentiment Scoring by Rating
# Turn the numeric prediction into a readable sentiment name. A native column
# expression is used instead of a Python UDF: the result is the same (1 -> "Positive",
# anything else, including null -> "Negative") without sending rows through Python.
df_with_sentiment = predictions.withColumn(
    "sentiment",
    when(col("prediction") == 1, "Positive").otherwise("Negative"),
)

# Average predicted label for each value of the "overall" rating.
# Preprocessing keeps only "reviewText" and "overall", so there is no product
# identifier available to group on at this point.
df_sentiment_scores = (
    df_with_sentiment.groupBy("overall")
    .agg(expr("AVG(prediction) AS avg_sentiment_score"))
)

Test Accuracy: 0.89


In [None]:
# Save results to CSV
# Spark creates a directory of part files at this path rather than a single CSV file.
output_path = "path_to_output.csv"

# The default save mode raises an error when the path already exists, which makes this
# cell fail on every run after the first. Overwrite replaces the previous results instead.
df_sentiment_scores.write.mode("overwrite").csv(output_path, header=True)

print("Sentiment analysis completed and results saved to output.")

[Stage 49:>                                                         (0 + 1) / 1]