In [None]:
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
from collections import deque
import pandas as pd
from io import StringIO
from pyspark.sql.functions import col, lower, regexp_replace, trim
import os, re
from pyspark.sql import SparkSession
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator



In [None]:
class Model_Inference:
    """Load a saved Spark ML pipeline and score DataFrames of raw text with it.

    Args:
        pipeline_path: Location of the saved ``PipelineModel``. Defaults to
            '/content/nb_pipeline', the path this notebook originally hard-coded.
        text_col: Name of the raw-text column in DataFrames passed to
            ``predict``. Defaults to "texts".

    Attributes are set in ``__init__``:
        pipeline_path / text_col: the values given above.
        nb_pipeline: the object returned by ``load_pipeline()``.
    """

    def __init__(self, pipeline_path='/content/nb_pipeline', text_col="texts"):
        self.pipeline_path = pipeline_path
        self.text_col = text_col
        # Loaded eagerly, so construction fails right away if the path is wrong.
        self.nb_pipeline = self.load_pipeline()

    def load_pipeline(self):
        """Return the ``PipelineModel`` stored at ``self.pipeline_path``."""
        return PipelineModel.load(self.pipeline_path)

    def _clean_text(self, df):
        """Return ``df`` with a ``text_clean`` column derived from ``self.text_col``.

        Steps, in order: lowercase; strip http/https URLs; replace every run of
        characters that are not letters, digits or whitespace with one space;
        collapse whitespace runs and trim both ends.
        """
        # NOTE(review): the class [^\p{L}\p{N}\s] does not keep combining marks
        # (\p{M}). The sample output in this notebook shows accented text coming
        # out split ("na p bă ng"), which looks like decomposed-Unicode input
        # losing its diacritics here. Left unchanged because the saved pipeline
        # was presumably fitted on text cleaned the same way - confirm against
        # the training code before altering this pattern.
        lowered = lower(col(self.text_col))
        df = df.withColumn("text_clean", lowered)
        df = df.withColumn(
            "text_clean", regexp_replace(col("text_clean"), r"https?://\S+", "")
        )
        df = df.withColumn(
            "text_clean", regexp_replace(col("text_clean"), r"[^\p{L}\p{N}\s]+", " ")
        )
        df = df.withColumn(
            "text_clean", trim(regexp_replace(col("text_clean"), r"\s+", " "))
        )
        return df

    def predict(self, df):
        """Clean the text column of ``df`` and run the loaded pipeline on it.

        Args:
            df: Spark DataFrame containing the column named by ``self.text_col``.

        Returns:
            The DataFrame produced by ``self.nb_pipeline.transform`` on the
            cleaned input (the input columns plus ``text_clean`` and whatever
            columns the pipeline appends).
        """
        cleaned = self._clean_text(df)
        pred_new = self.nb_pipeline.transform(cleaned)

        return pred_new




In [None]:
# Spark settings for this session, applied one by one to the builder below.
_spark_settings = {
    "spark.driver.memory": "8g",          # 8 GB for the driver (just enough, avoids OOM)
    "spark.executor.memory": "2g",        # 2 GB for the executor (local mode, so only 1 executor)
    "spark.driver.maxResultSize": "2g",   # Cap on results returned to the driver
    "spark.sql.shuffle.partitions": "8",  # Fewer shuffle partitions to cut overhead
    "spark.default.parallelism": "8",     # Keep parallelism at a reasonable level
}

_builder = (
    SparkSession.builder
    .appName("Colab_Spark_TextML")
    .master("local[*]")                   # Use every available CPU
)
for _key, _value in _spark_settings.items():
    _builder = _builder.config(_key, _value)

spark = _builder.getOrCreate()

In [None]:
# Build the inference wrapper; its constructor loads the saved pipeline.
inference = Model_Inference()


In [None]:
file_path = "/content/data.csv"
n_rows = 500_000
cols_needed = ["texts", "labels"]

# Single pass over the file: take the header line first, then let a bounded
# deque keep only the last n_rows lines that follow it.
# NOTE(review): the tail is cut by physical lines, so if a quoted field can
# contain a newline the first kept line may start mid-record - confirm the
# data has one record per line.
with open(file_path, mode="r", encoding="utf-8") as f:
    header = f.readline()
    tail_lines = deque(f, maxlen=n_rows)

# Reassemble header + kept lines into one CSV string.
data_str = "".join([header, *tail_lines])

# Parse with pandas (only the needed columns), then hand over to Spark.
df_tail_pd = pd.read_csv(StringIO(data_str), usecols=cols_needed)
df_tail = spark.createDataFrame(df_tail_pd)
df_tail = df_tail.repartition(8)
df_tail.rdd.getNumPartitions()


8

In [None]:
# Clean the text column and run the loaded pipeline over the sampled rows.
df_pred = inference.predict(df_tail)


In [None]:
# Preview the first 5 prediction rows.
df_pred.show(5)

+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+
|               texts|labels|          text_clean|              tokens|     tokens_filtered|         rawFeatures|            features|       rawPrediction|         probability|prediction_nb|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+
|tiến sĩ mà để nợ ...|     0|tiến sĩ mà để nợ ...|[ti, n, s, m, n, ...|[ti, n, m, n, c, ...|(262144,[1303,141...|(262144,[1303,141...|[-249.18945028910...|[0.68728653424899...|          0.0|
|em nạp bằng mom...|     0|em na p bă ng mom...|[em, na, p, b, ng...|[em, na, p, b, ng...|(262144,[3386,141...|(262144,[3386,141...|[-813.63414883714...|[0.99999999979572...|          0.0|
|Thì đúng mà, crus...|     0|thì đúng mà crus

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Binary metrics computed against the 'labels' column, with the pipeline's
# 'probability' column supplied as the raw-prediction input.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='probability',
    labelCol='labels',
)

# The first call uses the evaluator's default metric (reported here as area
# under ROC); the second overrides the metric for that call only.
print(
    'Area Under ROC:',
    evaluator.evaluate(df_pred),
)
print(
    'Area Under PR:',
    evaluator.evaluate(df_pred, {evaluator.metricName: "areaUnderPR"}),
)


Area Under ROC: 0.8479859149746666
Area Under PR: 0.3154070809367029


In [None]:
# Label-vs-prediction metrics, comparing 'labels' with the pipeline's
# 'prediction_nb' column.
multi_evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction_nb',
    labelCol='labels',
)

# Evaluate each metric by overriding metricName per call, in the same order
# and with the same printed labels as before.
for _label, _metric in (('Accuracy:', "accuracy"), ('F1:', "f1")):
    print(_label, multi_evaluator.evaluate(df_pred, {multi_evaluator.metricName: _metric}))

Accuracy: 0.678964
F1: 0.7664033381436142
