In [1]:
from pyspark.sql import SparkSession as ss
from pyspark.sql import functions as f

spark = (
    ss.builder.appName("ModelTraining")
    .config("spark.executor.memory", "16G")
    .getOrCreate()
)

In [3]:
number_regex = "[^a-zA-Z']"
double_space_regex = " +"

In [4]:
IN_PATH = "CLEAN"
OUT_PATH = "MODEL"
schema="polarity FLOAT, id LONG, date_time TIMESTAMP, query string, user STRING, text STRING"

spark_reader = spark.read.schema(schema)

df = spark_reader.parquet(IN_PATH)
df = (
df
#Remove all numbers
.withColumn("text", f.regexp_replace(f.col("text"), number_regex, " "))
.withColumn("text", f.regexp_replace(f.col("text"), double_space_regex, " "))
.withColumn("text", f.trim(f.col("text")))
.filter(f.col("text") != " ")
)

In [5]:
data = df.select("text", "polarity").coalesce(3).cache()

In [6]:
seed = 2021
(training, validation, test) = data.randomSplit([0.98, 0.1, 0.1], seed=seed)

In [7]:
%%time
from pyspark.ml.feature import (
    StopWordsRemover, 
    Tokenizer,
    HashingTF,
    IDF
)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline


tokenizer = Tokenizer(inputCol="text", outputCol="words1")
stopwordsrem = StopWordsRemover(
    inputCol = tokenizer.getOutputCol(),
    outputCol = "words2",
    stopWords = StopWordsRemover.loadDefaultStopWords('english')
)
hashingtf = HashingTF(
    inputCol=stopwordsrem.getOutputCol(),
    outputCol="term_freq")
idf = IDF(
    inputCol=hashingtf.getOutputCol(),
    outputCol="features",
    minDocFreq=5
)

lr = LogisticRegression(labelCol="polarity")

sem_analysis_pipeline = Pipeline(stages=[tokenizer, stopwordsrem, hashingtf, idf, lr])

sem_analysis_model = sem_analysis_pipeline.fit(training)

CPU times: user 311 ms, sys: 175 ms, total: 486 ms
Wall time: 4min 17s


In [20]:
%%time 

trained_df = sem_analysis_model.transform(training)
valid_df = sem_analysis_model.transform(validation)
test_df = sem_analysis_model.transform(test)

CPU times: user 192 ms, sys: 50.1 ms, total: 243 ms
Wall time: 436 ms


In [21]:
%%time 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator as MCE

evaluator = MCE(labelCol="polarity", metricName="accuracy")

accuracy_val = evaluator.evaluate(valid_df)
accuracy_test = evaluator.evaluate(test_df)
print("Validation Accuracy:", accuracy_val * 100)
print("Testing Accuracy:", accuracy_val * 100)
print("Testing Accuracy : {accuracy_val * 100:.5f}%"")


Validation Accuracy: 77.17547624842356
Testing Accuracy: 77.17547624842356
CPU times: user 21.3 ms, sys: 7.61 ms, total: 28.9 ms
Wall time: 36.7 s


In [12]:
%%time
final_model = sem_analysis_pipeline.fit(data)
#accuracy_full = evaluator.evaluate(final_model.transform(test))
print("Accuracy : {accuracy_full * 100:.5f}%")

Accuracy : {accuracy_full * 100:.5f}%
CPU times: user 492 ms, sys: 465 ms, total: 957 ms
Wall time: 21min 17s


In [13]:
final_model.save(OUT_PATH)

Py4JJavaError: An error occurred while calling o951.save.
: java.io.IOException: Path MODEL already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:702)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:179)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
