# Libraries

In [None]:
import re

import pandas as pd
from pyspark import SparkConf
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import (
    PCA,
    ChiSqSelector,
    CountVectorizer,
    HashingTF,
    IDF,
    Normalizer,
    RegexTokenizer,
    StandardScaler,
    StopWordsRemover,
    StringIndexer,
    UnivariateFeatureSelector,
)
from pyspark.ml.linalg import Vectors
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.tuning import (
    CrossValidator,  # provided by pyspark.ml.tuning, not pyspark.ml.feature
    ParamGridBuilder,
    TrainValidationSplit,
    TrainValidationSplitModel,
)
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, udf
from pyspark.sql.types import IntegerType, StringType

# File Paths

In [3]:
# Local files: the stop-word list and the name of the output file.
stop_words_path = "stopwords.txt"
output_file_path = "output_assignment2.txt"

# Amazon review data on HDFS: the development set and the combined full set.
dev_file_path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
full_file_path = "hdfs:///user/dic25_shared/amazon-reviews/full/reviewscombined.json"

# Spark Session

In [4]:
# Obtain the session for this notebook (getOrCreate reuses an active one).
spark = (
    SparkSession.builder
    .appName("Assignment_2_session")
    .getOrCreate()
)
# SparkContext belonging to that session.
sc = spark.sparkContext

SLF4J: Class path contains multiple SLF4J bindings.

25/05/04 10:34:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/04 10:34:32 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/05/04 10:34:32 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/05/04 10:34:32 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
25/05/04 10:34:32 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
25/05/04 10:34:32 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
25/05/04 10:34:32 WARN Utils: Service 'SparkUI' could not bind on port 4046. Attempting port 4047.


# Loading the data

In [5]:
# Read the dev-set JSON, then keep only the review text (model input) and
# the product category (label).
df_full = spark.read.json(dev_file_path)
df = df_full.select(col("reviewText"), col("category"))

# Code Taken from Part 2

In [None]:
# UDF that casefolds a string value and passes nulls through unchanged
@udf(returnType=StringType())
def casefold_text(text):
    """Return ``text.casefold()``, or ``None`` when ``text`` is ``None``."""
    return None if text is None else text.casefold()


# Create a custom casefolder
class CasefoldTransformer(
    Transformer,
    HasInputCol,
    HasOutputCol,
    DefaultParamsReadable,
    DefaultParamsWritable,
):
    """Pipeline stage that writes a casefolded copy of a string column.

    The column names are stored as Spark ML ``Param``s (through the
    ``HasInputCol`` / ``HasOutputCol`` mixins) instead of plain Python
    attributes. ``DefaultParamsWritable`` / ``DefaultParamsReadable`` only
    persist ``Param`` values, so this is what lets a saved pipeline that
    contains this stage be loaded with the same column names.

    Args:
        inputCol: Name of the string column to casefold.
        outputCol: Name of the column that receives the casefolded text.
    """

    def __init__(self, inputCol="text", outputCol="casefolded_text"):
        super(CasefoldTransformer, self).__init__()
        # _set() registers the values in the param map (copied and persisted).
        self._set(inputCol=inputCol, outputCol=outputCol)

    def _transform(self, df):
        """Return ``df`` with the output column set to the casefolded input."""
        return df.withColumn(
            self.getOutputCol(), casefold_text(df[self.getInputCol()])
        )

In [7]:
# Each line of the stop-word file becomes one list entry; splitlines()
# already returns a list with the line terminators removed.
with open(stop_words_path, "r") as handle:
    stop_words = handle.read().splitlines()

In [None]:
# Label: map the category string to a numeric index.
indexer = StringIndexer(outputCol="category_index", inputCol="category")

# Text: casefold the raw review text.
casefolder = CasefoldTransformer(
    outputCol="reviewText_casefolded", inputCol="reviewText"
)

# Split the casefolded text at delimiter runs (gaps=True means the pattern
# matches the separators, not the tokens). Lower-casing is disabled here
# because the text has already been casefolded.
tokenizer = RegexTokenizer(
    toLowercase=False,
    gaps=True,
    pattern="[ \t\d\(\)\[\]\{\}\.\!\?,;:+=\-_'\"`~#@&\*\%€\$§\\/]+",
    outputCol="tokens",
    inputCol="reviewText_casefolded",
)

# Drop the stop words loaded from the stop-word file.
remover = StopWordsRemover(
    stopWords=stop_words, outputCol="filtered_tokens", inputCol="tokens"
)

# Term counts followed by IDF weighting -> TF-IDF vectors in "features".
tf = CountVectorizer(outputCol="rawFeatures", inputCol="filtered_tokens")
idf = IDF(outputCol="features", inputCol="rawFeatures")

# Part 3

 In this part, you will train a text classifier from the features extracted in Part 2. The goal is to train a model that
 can predict the product category from a review's text.

 To this end, extend the pipeline from Part 2 such that a Support Vector Machine classifier is trained. Since
 we are dealing with multi-class problems, make sure to put a strategy in place that allows binary classifiers to
 be applicable. Apply vector length normalization before feeding the feature vectors into the classifier (use
 Normalizer with L2 norm).

 Follow best practices for ML experiment design and investigate the effects of parameter settings using the
 functions provided by Spark:
 - Split the review data into training, validation, and test set.
 - Make experiments reproducible.
 - Use a grid search for parameter optimization:
   - Compare chi square overall top 2000 filtered features with another, heavier filtering with much
     less dimensionality (see Spark ML documentation for options).
   - Compare different SVM settings by varying the regularization parameter (choose 3 different
     values), standardization of training features (2 values), and maximum number of iterations (2
     values).
 - Use the MulticlassClassificationEvaluator to estimate performance of your trained
   classifiers on the test set, using F1 measure as criterion.

In [None]:
# The grid search runs on a 10% sample of the dev data (drawn without
# replacement, fixed seed) so that the server is not overloaded.
df_sampled = df.sample(fraction=0.1, seed=42, withReplacement=False)
df_sampled.count()

In [None]:
# Linear SVM trained on the length-normalized feature vectors.
svm = LinearSVC(labelCol="category_index", featuresCol="normalizedFeatures")

# LinearSVC is a binary classifier; the one-vs-rest wrapper makes it usable
# for the multi-class category label.
ova = OneVsRest(
    parallelism=4,  # kept at 4 so the server is not overloaded
    labelCol="category_index",
    featuresCol="normalizedFeatures",
    classifier=svm,
)

# Vector length normalization (L2 norm) of the selected features.
normalizer = Normalizer(outputCol="normalizedFeatures", inputCol="selectedFeatures")

# Scaler stage.
# NOTE(review): its output column "scaledFeatures" is not read by the
# normalizer or the SVM defined in this cell - confirm whether it is needed.
scaler = StandardScaler(
    withMean=False, outputCol="scaledFeatures", inputCol="selectedFeatures"
)

# F1 evaluator comparing "prediction" with the indexed category label.
f1 = MulticlassClassificationEvaluator(
    metricName="f1", predictionCol="prediction", labelCol="category_index"
)

# 60/20/20 train/validation/test splits; the fixed seed makes them reproducible.
# Sampled subset: used for the grid search.
(
    train_data_sampled,
    validation_data_sampled,
    test_data_sampled,
) = df_sampled.randomSplit([0.6, 0.2, 0.2], seed=1)

# Complete dev set: used when the final model is applied.
(
    train_data,
    validation_data,
    test_data,
) = df.randomSplit([0.6, 0.2, 0.2], seed=1)

# Two feature selectors are compared; both write to "selectedFeatures".

# 1) Selector from task 2: chi-square, top 2000 features.
chi2_top_2000 = ChiSqSelector(
    labelCol="category_index",
    outputCol="selectedFeatures",
    featuresCol="features",
    numTopFeatures=2000,
)

# 2) Heavier filtering: only the top 100 features are kept.
univariate_top_100 = UnivariateFeatureSelector(
    outputCol="selectedFeatures",
    featuresCol="features",
    labelCol="category_index",
    selectionMode="numTopFeatures",
)
univariate_top_100.setFeatureType("categorical")
univariate_top_100.setLabelType("categorical")
univariate_top_100.setSelectionThreshold(100)  # number of top features

selectors = [chi2_top_2000, univariate_top_100]

# Grid search:
- feature selection: chi square overall top 2000 filtered features vs. a heavier filtering (top 100 features)
- different SVM settings by varying the regularization parameter (3 different values)
- standardization of training features (2 values)
- maximum number of iterations (2 values)

In [None]:
# SVM hyper-parameter grid: 3 regularization strengths x 2 iteration limits
# x standardization on/off = 12 combinations.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(svm.regParam, [0.01, 0.1, 1])
grid_builder.addGrid(svm.maxIter, [100, 500])
grid_builder.addGrid(svm.standardization, [True, False])
paramGrid = grid_builder.build()

To avoid overloading the server, we actually ran this loop unrolled, applying the different combinations one after another.
To keep the notebook structured, however, we hand in the looped version of the code.

In [None]:
%%time

best_f1_score = 0
best_model = None

for selector in selectors: #iterate through selectors
    print(f"\nEvaluating selector with top {selector.getNumTopFeatures()} features\n" + "-"*50)

    pipeline = Pipeline(
        stages=[
            casefolder,
            tokenizer,
            remover,
            indexer,
            tf,
            idf,
            selector,
            scaler,
            normalizer,
            ova,
        ]
    )

    for params in paramGrid:
        model = pipeline.copy(params).fit(train_data_sampled)
        predictions = model.transform(validation_data_sampled)
        f1_score = f1.evaluate(predictions)

        print(f"Selector top {selector.getNumTopFeatures()}, Params: {params}, F1 Score: {f1_score:.4f}")

        if f1_score > best_f1_score:
            best_f1_score = f1_score
            best_model = model
            best_selector = selector
            best_params = params

print(f"\nBest Validation F1 Score: {best_f1_score:.4f}")

Params: {Param(parent='LinearSVC_36bf267fccea', name='regParam', doc='regularization parameter (>= 0).'): 0.1, Param(parent='LinearSVC_36bf267fccea', name='maxIter', doc='max number of iterations (>= 0).'): 5, Param(parent='StandardScaler_5680ee9c641b', name='withMean', doc='Center data with mean'): False}, F1 Score: 0.6028949592378424
Params: {Param(parent='LinearSVC_36bf267fccea', name='regParam', doc='regularization parameter (>= 0).'): 0.1, Param(parent='LinearSVC_36bf267fccea', name='maxIter', doc='max number of iterations (>= 0).'): 5, Param(parent='StandardScaler_5680ee9c641b', name='withMean', doc='Center data with mean'): True}, F1 Score: 0.6025472627023611


# Applying the best model on the full dev set:

In [None]:
%%time

print("Running: regParam=0.1, maxIter=100, standardization=True")
final_pipeline = Pipeline(
    stages=[
        casefolder,
        tokenizer,
        remover,
        indexer,
        tf,
        idf,
        best_selector,
        scaler,
        normalizer,
        ova,
    ]
)

final_model = final_pipeline.copy(best_params).fit(train_data)
pred = final_model.transform(validation_data)
f1_score = f1.evaluate(pred)
print(f"Final model F1 Score: {f1_score}")

In [None]:
# Persist the final model trained with the best configuration. `model` is
# only the last pipeline fitted inside the grid-search loop, not the best one.
final_model.write().overwrite().save("best_svm_model")