#### DiC Assignment 2

Group 6
Members:
 Theresa Mayer
 Theresa Bruckner
 Jan Tölken
 Can Kenan Kandil 
 Thomas Klar


## Part 1

## Part 2

In [80]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [81]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, IDF, ChiSqSelector, IndexToString, StringIndexer, CountVectorizer, Normalizer
from pyspark.sql import SparkSession

In [82]:
spark = SparkSession.builder.appName("Assignment2").getOrCreate()

In [83]:
path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
input_file = spark.read.format("json").load(path).select("category", "reviewText")

In [84]:
input_file.show(n=5)

+--------------------+--------------------+
|            category|          reviewText|
+--------------------+--------------------+
|Patio_Lawn_and_Garde|This was a gift f...|
|Patio_Lawn_and_Garde|This is a very ni...|
|Patio_Lawn_and_Garde|The metal base wi...|
|Patio_Lawn_and_Garde|For the most part...|
|Patio_Lawn_and_Garde|This hose is supp...|
+--------------------+--------------------+
only showing top 5 rows



### Label Encoding

In [85]:
indexer = StringIndexer(inputCol="category", outputCol="label")
indexModel = indexer.fit(input_file)
input_file_1 = indexModel.transform(input_file)

In [86]:
reindexer = IndexToString(inputCol=indexer.getOutputCol(), outputCol="category_reindexed")
reindexer.transform(input_file_1).show(n=5)

+--------------------+--------------------+-----+--------------------+
|            category|          reviewText|label|  category_reindexed|
+--------------------+--------------------+-----+--------------------+
|Patio_Lawn_and_Garde|This was a gift f...| 18.0|Patio_Lawn_and_Garde|
|Patio_Lawn_and_Garde|This is a very ni...| 18.0|Patio_Lawn_and_Garde|
|Patio_Lawn_and_Garde|The metal base wi...| 18.0|Patio_Lawn_and_Garde|
|Patio_Lawn_and_Garde|For the most part...| 18.0|Patio_Lawn_and_Garde|
|Patio_Lawn_and_Garde|This hose is supp...| 18.0|Patio_Lawn_and_Garde|
+--------------------+--------------------+-----+--------------------+
only showing top 5 rows



### Tokenization

In [87]:
tokenizer = RegexTokenizer(inputCol='reviewText', outputCol='tokens', pattern=r"[ \t\d(){}\[\].!?;:,\-=\"~#@&*%€$§\\'\n\r\/]+", minTokenLength=2, toLowercase=True)

In [88]:
input_2 = tokenizer.transform(input_file_1)
input_2.show(n=5)

+--------------------+--------------------+-----+--------------------+
|            category|          reviewText|label|              tokens|
+--------------------+--------------------+-----+--------------------+
|Patio_Lawn_and_Garde|This was a gift f...| 18.0|[this, was, gift,...|
|Patio_Lawn_and_Garde|This is a very ni...| 18.0|[this, is, very, ...|
|Patio_Lawn_and_Garde|The metal base wi...| 18.0|[the, metal, base...|
|Patio_Lawn_and_Garde|For the most part...| 18.0|[for, the, most, ...|
|Patio_Lawn_and_Garde|This hose is supp...| 18.0|[this, hose, is, ...|
+--------------------+--------------------+-----+--------------------+
only showing top 5 rows



### Stopword Removal

In [89]:
stopword_file = "stopwords.txt"
with open(stopword_file, 'r', encoding='utf-8') as f:
    # Strip whitespace and convert to lowercase
    stopwords = [line.strip() for line in f]

In [90]:
stopword_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), 
                                    outputCol="tokens_nostop",
                                    stopWords=stopwords)

In [91]:
input_3 = stopword_remover.transform(input_2)
input_3.select("tokens", "tokens_nostop").show(n=5)

+--------------------+--------------------+
|              tokens|       tokens_nostop|
+--------------------+--------------------+
|[this, was, gift,...|[gift, husband, m...|
|[this, is, very, ...|[nice, spreader, ...|
|[the, metal, base...|[metal, base, hos...|
|[for, the, most, ...|[part, works, pre...|
|[this, hose, is, ...|[hose, supposed, ...|
+--------------------+--------------------+
only showing top 5 rows



### TF-IDF Calculation

In [92]:
tf = CountVectorizer(inputCol=stopword_remover.getOutputCol(), 
                      outputCol="tf_output", 
                      vocabSize=40_000)

In [93]:
idf = IDF(inputCol=tf.getOutputCol(), 
          outputCol="tfidf_output",
          minDocFreq=4)

In [94]:
tfmodel = tf.fit(input_3)
input_4 = tfmodel.transform(input_3)
input_4.select("tokens_nostop", "tf_output").show(n=5)

                                                                                

+--------------------+--------------------+
|       tokens_nostop|           tf_output|
+--------------------+--------------------+
|[gift, husband, m...|(40000,[2,3,7,8,3...|
|[nice, spreader, ...|(40000,[0,1,3,21,...|
|[metal, base, hos...|(40000,[4,10,29,1...|
|[part, works, pre...|(40000,[1,3,4,9,1...|
|[hose, supposed, ...|(40000,[12,32,42,...|
+--------------------+--------------------+
only showing top 5 rows



In [95]:
idfModel = idf.fit(input_4)
input_5 = idfModel.transform(input_4)
input_5.select("tf_output", "tfidf_output").show(n=5)

                                                                                

+--------------------+--------------------+
|           tf_output|        tfidf_output|
+--------------------+--------------------+
|(40000,[2,3,7,8,3...|(40000,[2,3,7,8,3...|
|(40000,[0,1,3,21,...|(40000,[0,1,3,21,...|
|(40000,[4,10,29,1...|(40000,[4,10,29,1...|
|(40000,[1,3,4,9,1...|(40000,[1,3,4,9,1...|
|(40000,[12,32,42,...|(40000,[12,32,42,...|
+--------------------+--------------------+
only showing top 5 rows



### Selection of top 2000 features

In [96]:
chisq = ChiSqSelector(featuresCol=idf.getOutputCol(),
                      labelCol="label",
                      outputCol="features",
                      numTopFeatures=2000)

In [97]:
chisqModel = chisq.fit(input_5)
input_6 = chisqModel.transform(input_5)
input_6.select("features").show(n=5)

                                                                                

+--------------------+
|            features|
+--------------------+
|(2000,[2,3,7,8,35...|
|(2000,[0,1,3,21,3...|
|(2000,[4,10,174,3...|
|(2000,[1,3,4,9,10...|
|(2000,[12,29,101,...|
+--------------------+
only showing top 5 rows



### Pipeline Creation

In [98]:
def get_pipeline(n_features=2000):
    chisq.setNumTopFeatures(n_features)
    pipeline = Pipeline(stages=[
        indexer,
        tokenizer,
        stopword_remover,
        tf,
        idf,
        chisq
    ])
    return pipeline

In [99]:
pipeline = get_pipeline(n_features=2000)
preprocessing_pipeline = pipeline.fit(input_file)
preprocessing_pipeline.transform(input_file).select("label", "features").show(n=5)

                                                                                

+-----+--------------------+
|label|            features|
+-----+--------------------+
| 18.0|(2000,[2,3,7,8,35...|
| 18.0|(2000,[0,1,3,21,3...|
| 18.0|(2000,[4,10,174,3...|
| 18.0|(2000,[1,3,4,9,10...|
| 18.0|(2000,[12,29,101,...|
+-----+--------------------+
only showing top 5 rows



### Export most important tokens to file

In [100]:
def get_top_terms_from_pipeline(pipeline):
    n = len(pipeline.stages[5].selectedFeatures)


    vocab = pipeline.stages[3].vocabulary.copy()
    top_words = " ".join(sorted([vocab[i] for i in pipeline.stages[5].selectedFeatures]))
    
    with open("output_ds.txt", "w") as f:
        f.write(top_words)
        
    return n

In [101]:
get_top_terms_from_pipeline(preprocessing_pipeline)

2000

# Part 3

For this part we create a svm classifier to predict the categories based on their review text. 

First we import the necessary libraries from pyspark,...

In [102]:
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

import time 
from datetime import datetime

#to ignore the warning messages coming from the parameter grid search when training tvs
spark.sparkContext.setLogLevel("ERROR")

Now we add the needed steps for our pipeline, starting with a normalizer with L2 norm by using p=2

In [None]:
#Normalizer using features from the ChiSquare step and output should be the normalized features
normalizer = Normalizer(inputCol="features", outputCol="norm_features", p =2)


Next we create the classifier and add One-vs-Rest since SVM is only binary and we have a multi-class-problem. For further explanation on why OVR is used see the report. 

In [None]:
#SVM uses the normalized features and the created label from the indexer
svm = LinearSVC(labelCol="label", featuresCol="norm_features")
ovr = OneVsRest(classifier=svm, labelCol="label", featuresCol="norm_features")

Now we create the get_full_pipeline function to be able to switch between used features. 

In [None]:
#Pipeline with additional normalization and SVM with OVR 
def get_full_pipeline(n_features=2000):
    chisq.setNumTopFeatures(n_features)
    full_pipeline = Pipeline(stages=[
    indexer,
    tokenizer,
    stopword_remover,
    tf,
    idf,
    chisq,
    normalizer,
    ovr
    ])
    return full_pipeline

Now we create the parameter grid to compare the input parameters for SVM to get the best performing model based on F1-Score. 

In [None]:
paramGrid = (ParamGridBuilder()
    .addGrid(svm.regParam, [0.01, 0.1, 1.0])
    .addGrid(svm.maxIter, [10, 100])
    .addGrid(svm.standardization, [True, False])
    .build()
)

Now we split the input data into training and test (we also had to desample the input file, since otherwise the training of the classifier in combination with the parameter grid would take too long). We decided to use 30% of the input data, since with this split we get a good ratio between computing time and sample size. 

Here we dont split the data into training and validation since this will be done automaticall in the function TrainValidationSplit. 

In [115]:
# Splitting dataset 
devset_sample, devset_rest = input_file.randomSplit([0.3,0.7], seed = 1234)
train_data, test_data = devset_sample.randomSplit([0.8, 0.2], seed=1234)

In [116]:
print(f"used data for modeling has {devset_sample.count()} rows")
print(f"training data has {train_data.count()} rows")
print(f" so {train_data.count() * 0.2} rows will be used for validation")

used data for modeling has 23665 rows
training data has 18905 rows


Lastly we need an evaluator with F1-metric.

In [117]:
# Evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")

## Model with 2000 features 

Now we train the model using the top 2000 features. We start with getting the full pipeline with n_features = 2000.

In [118]:
#Full pipeline with 2000 features
full_pipeline = get_full_pipeline(n_features=2000)


Now we set up the TrainValidationSplit setup for the computation later on. Here we predefined that we use 80% of the data for training and the remaining 20% for validation for parameter decision.

In [119]:
# TrainValidationSplit Setup
tvs = TrainValidationSplit(
    estimator=full_pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    trainRatio=0.8,
    parallelism=50
)


In [120]:
# Training with 2000 features 
start_time = time.time()
start_time_readable = datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S')
print(f"starting at {start_time_readable}") 
tvs_model = tvs.fit(train_data)
fit_time = time.time() - start_time
print(f'fit_time={fit_time}')

starting at 2025-05-12 13:42:44


                                                                                

fit_time=3165.2696483135223


In [None]:
#fit_time=3726.6424901485443 bei 20 parallelism
#fit_time=2063.811951637268 bei 50 parallelism 
#fit_time=2226.436852455139 bei 100 
#fit_time=3263.985917568207 bei 50 parallelism und 30% von devset 

In [121]:
# Evaluation
test_predictions = tvs_model.transform(test_data)

In [122]:
best_model = tvs_model.bestModel

In [123]:
ovr_model = best_model.stages[-1] 
best_svm_model = ovr_model.getClassifier()

# Show Parameters
print("Best model:")
print(f"  regParam:        {best_svm_model.getRegParam()}")
print(f"  maxIter:         {best_svm_model.getMaxIter()}")
print(f"  standardization: {best_svm_model.getStandardization()}")

Best model:
  regParam:        0.1
  maxIter:         100
  standardization: True


In [124]:
test_f1 = evaluator.evaluate(test_predictions)

print(f"Test f1: {test_f1:.4f}")



Validation f1: 0.5712
Test f1: 0.5680


                                                                                

Validation f1: 0.5712
Test f1: 0.5680

## Model with 500 features

Now we do the same with only 500 features

In [125]:
full_pipeline_500 = get_full_pipeline(n_features=500)

In [126]:
# TrainValidationSplit Setup
tvs_500 = TrainValidationSplit(
    estimator=full_pipeline_500,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    trainRatio=0.8,
    parallelism=50
)


In [127]:
# Training with 500 features 
start_time = time.time()
start_time_readable = datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S')
print(f"starting at {start_time_readable}") 
tvs_model_500 = tvs_500.fit(train_data)
fit_time = time.time() - start_time
print(f'fit_time={fit_time}')

starting at 2025-05-12 15:03:45


                                                                                

fit_time=3314.2471628189087


In [128]:
# Evaluation
test_predictions_500 = tvs_model_500.transform(test_data)

In [129]:
best_model_500 = tvs_model_500.bestModel

In [130]:
ovr_model_500 = best_model_500.stages[-1] 
best_svm_model_500 = ovr_model_500.getClassifier()

# Show best parameters
print("Best model:")
print(f"  regParam:        {best_svm_model_500.getRegParam()}")
print(f"  maxIter:         {best_svm_model_500.getMaxIter()}")
print(f"  standardization: {best_svm_model_500.getStandardization()}")

Best model:
  regParam:        0.01
  maxIter:         10
  standardization: True


In [131]:
test_f1_500 = evaluator.evaluate(test_predictions_500)

print(f"Test f1: {test_f1_500:.4f}")

[Stage 282330:>                                                     (0 + 2) / 2]

Test f1: 0.4944


                                                                                

### Additional part only for Report 

This part is only used for comparison of different parameters, in finished code this should be commented 

In [None]:
#for protocoll to see how the different settings perform 
#evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
#results = []
# Loop über alle Parameterkombinationen
#for params in paramGrid:
    # Setze die Parameter im SVM-Objekt
#    for param, value in params.items():
#        svm._set(**{param.name: value})

    # Pipeline neu aufbauen mit aktuellem SVM
#    current_pipeline = full_pipeline.copy()
#    current_pipeline.setStages(full_pipeline.getStages()[:-1] + [OneVsRest(classifier=svm)])

    # Trainiere das Modell
#    model = current_pipeline.fit(train_data)

    # Vorhersage auf Validierungsdaten
#    predictions = model.transform(val_data)

    # F1-Score berechnen
#    f1 = evaluator.evaluate(predictions)

    # Parameter & Score speichern
#    results.append((params, f1))

    # Ausgabe
#    print("Param-Kombi:")
#    for p, v in params.items():
#        print(f"  {p.name}: {v}")
#    print(f"  → F1-Score: {f1:.4f}\n")

                                                                                

Param-Kombi:
  regParam: 0.01
  maxIter: 10
  standardization: True
  → F1-Score: 0.5183



                                                                                

Param-Kombi:
  regParam: 0.01
  maxIter: 10
  standardization: False
  → F1-Score: 0.5179



                                                                                

Param-Kombi:
  regParam: 0.01
  maxIter: 100
  standardization: True
  → F1-Score: 0.5135



[Stage 295360:>                                                     (0 + 2) / 2]

In [None]:
#best_params, best_f1 = max(results, key=lambda x: x[1])
#print(f"Best combination → F1: {best_f1:.4f}")
#for p, v in best_params.items():
#    print(f"{p.name}: {v}")