In [None]:
!pip install spark-nlp
!pip install pyspark cassandra-driver

Collecting spark-nlp
  Downloading spark_nlp-6.0.0-py2.py3-none-any.whl.metadata (19 kB)
Downloading spark_nlp-6.0.0-py2.py3-none-any.whl (684 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/684.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.2/684.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m684.9/684.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-6.0.0
Collecting cassandra-driver
  Downloading cassandra_driver-3.29.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Collecting geomet<0.3,>=0.1 (from cassandra-driver)
  Downloading geomet-0.2.1.post1-py3-none-any.whl.metadata (1.0 kB)
Downloading cassandra_driver-3.29.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━

In [None]:
# Upload the Astra DB secure connect bundle into the Colab runtime.
# `uploaded` is kept because a later cell reads its keys (the uploaded file
# names) to locate the bundle.
from google.colab import files
uploaded = files.upload()  # Choose the secure-connect-bundle.zip file

Saving secure-connect-moocassandra.zip to secure-connect-moocassandra.zip


In [None]:
import json

# Read the Astra DB application token (client id / secret) from the JSON file
# that is expected to exist in the Colab runtime.
with open('/content/MOOCASSANDRA-token.json', encoding="utf-8") as f:
    secrets = json.load(f)

CLIENT_ID = secrets["clientId"]
CLIENT_SECRET = secrets["secret"]

# Pick the secure connect bundle by extension instead of blindly taking the
# first uploaded file: the upload dialog accepts several files, and an empty
# upload would otherwise surface as an opaque IndexError.
_bundle_candidates = [name for name in uploaded if name.lower().endswith(".zip")]
if not _bundle_candidates:
    raise FileNotFoundError(
        "No secure connect bundle (.zip) was uploaded; re-run the upload cell "
        "and choose the secure-connect-*.zip file."
    )
ASTRA_SECURE_BUNDLE_PATH = _bundle_candidates[0]
KEYSPACE_NAME = "BIG_MOOC"

In [None]:
from pyspark.sql import SparkSession

# Maven coordinates fetched when the session starts: Spark NLP and the
# DataStax Spark-Cassandra connector (both built for Scala 2.12).
_spark_packages = ",".join((
    "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.3",
    "com.datastax.spark:spark-cassandra-connector_2.12:3.5.1",
))

spark = (
    SparkSession.builder
    .appName("AstraDB-Spark-Integration")
    .config("spark.driver.memory", "8g")
    .config("spark.kryoserializer.buffer.max", "2000m")
    .config("spark.jars.packages", _spark_packages)
    .getOrCreate()
)

# Ship the secure connect bundle with the job, then point the connector at it
# and supply the token credentials.
spark.sparkContext.addFile(ASTRA_SECURE_BUNDLE_PATH)
for _conf_key, _conf_value in (
    ("spark.cassandra.connection.config.cloud.path", ASTRA_SECURE_BUNDLE_PATH),
    ("spark.cassandra.auth.username", CLIENT_ID),
    ("spark.cassandra.auth.password", CLIENT_SECRET),
):
    spark.conf.set(_conf_key, _conf_value)

In [None]:
# Connectivity check: load the `courses` table from the configured keyspace
# and preview a few rows plus the schema.
try:
    df = (
        spark.read
        .format("org.apache.spark.sql.cassandra")
        # Use the keyspace constant from the configuration cell instead of
        # repeating the literal here.
        .options(table="courses", keyspace=KEYSPACE_NAME)
        .load()
    )

    print("Kết nối thành công! Dữ liệu mẫu:")
    df.show(5)
    df.printSchema()

except Exception as e:
    print(f"Lỗi khi kết nối: {str(e)}")
    # Every later cell needs `df`; re-raise so the run stops here with the
    # real cause instead of failing later with an unrelated NameError.
    raise

Kết nối thành công! Dữ liệu mẫu:
+---------+-------------------------------------+--------------------+-----+---------------------------+--------------------+
|course_id|                                about|            about_vn|field|                       name|             name_vn|
+---------+-------------------------------------+--------------------+-----+---------------------------+--------------------+
|C_1824931|网络攻防是网络空间这个看不见硝烟的...|hành vi phạm tội ...|   []|网络安全-应用技术与工程实践|công nghệ và thực...|
|C_1771164|                 Causes of environ...|nguyên nhân của c...|   []|       Environmental Pol...|các sự kiện ô nhi...|
|C_1789680|                 Enjoy Chinese med...|thưởng thức chế đ...|   []|       A Bite of Chinese...|một miếng medica ...|
|C_1751256|本课程是一门硕士研究生专业基础课程...|khóa học này là m...|   []|核反应堆工程（Nuclear Re...|kỹ thuật lò phản ...|
| C_943258|  通过学习你可以回答下列问题：\n如...|thông qua học tập...|   []|           物理学与世界进步|vật lý và sự tiến...|
+---------+-------------------------

# TopicClustering

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import PCA, CountVectorizer, IDF
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import SQLTransformer
from sparknlp.base import DocumentAssembler, EmbeddingsFinisher
from sparknlp.annotator import XlmRoBertaSentenceEmbeddings, Tokenizer, StopWordsCleaner
import numpy as np
from pyspark.sql import Row
import pyspark.sql.functions as F
import jieba
import pyspark.sql.types as T

class TopicClustering:
    """Cluster documents into topics and describe each topic with keywords.

    Workflow implemented by ``fit``:

    1. Embed the ``about`` column with the pretrained ``sent_xlm_roberta_base``
       sentence-embedding model and reduce the vectors with PCA.
    2. Fit one KMeans model per ``k`` in ``[min_k, max_k]`` and keep the one
       with the highest evaluator score.
    3. Tokenize ``about`` with jieba and fit CountVectorizer + IDF so that each
       topic can be labelled with its highest ``count * idf`` words.

    Input DataFrames must provide an ``about`` string column; ``transform``
    additionally selects ``course_id``.
    """

    # Tokens dropped after jieba segmentation (English function words and
    # ASCII / full-width punctuation). Compared against the lower-cased token.
    _STOP_WORDS = frozenset([
        "the", "a", "an", "of", "for", "and", "-",
        "on", "in", "with", "to", "at", "by", "from", "：", ",", "!", "《", ":", ";", "”", "“",
        "（", '）', "·", "+", "&", "--", "?", "，", "•", ".", ' ', '丨', '_', '【', '】', '、', '。', '？', '；'
    ])

    def __init__(self, spark_session,
                 pca_dim=15,
                 random_state=42,
                 k_method="silhouette",
                 min_k=2,
                 max_k=15):
        """Store the hyper-parameters and create the clustering evaluator.

        Args:
            spark_session: Active SparkSession, used to build result frames.
            pca_dim: Number of PCA components kept from the sentence embedding.
            random_state: Seed passed to KMeans.
            k_method: Metric name handed to ``ClusteringEvaluator``
                (lower-cased before use).
            min_k: Smallest number of clusters to try (inclusive, at least 2).
            max_k: Largest number of clusters to try (inclusive).

        Raises:
            ValueError: If ``min_k`` is below 2 or ``max_k`` is below ``min_k``.
        """
        # Validate up front: an empty k range would otherwise only fail deep
        # inside the model search with an unrelated error.
        if min_k < 2:
            raise ValueError(f"min_k must be at least 2, got {min_k}")
        if max_k < min_k:
            raise ValueError(f"max_k ({max_k}) must be >= min_k ({min_k})")

        self.spark = spark_session
        self.pca_dim = pca_dim
        self.random_state = random_state
        self.k_method = k_method.lower()
        self.min_k = min_k
        self.max_k = max_k

        # Fitted state, populated by fit().
        self.embedding_model = None
        self.kmeans_model = None
        self.text_pipeline_model = None
        self.k = None
        self.transformed_df = None
        self.cv_model = None
        self.idf_model = None

        # NOTE(review): the evaluator scores clusters with cosine distance while
        # KMeans below runs with its default distance measure — confirm that
        # this mismatch is intended.
        self.evaluator = ClusteringEvaluator(
            featuresCol="pca_features",
            predictionCol="topic",
            metricName=self.k_method,
            distanceMeasure="cosine"
        )

    @staticmethod
    def _filter_tokens(tokens, stop_words):
        """Return ``tokens`` without stop words and whitespace-only entries."""
        return [t for t in tokens if t.lower() not in stop_words and t.strip() != ""]

    def _tokenize_with_jieba(self, df):
        """Add a ``tokens_array`` column holding the jieba tokens of ``about``."""
        # Bind plain objects (not ``self``) so the UDF closure stays small and
        # does not drag the SparkSession-holding instance into serialization.
        stop_words = self._STOP_WORDS
        filter_tokens = self._filter_tokens

        def jieba_tokenize(text):
            # Null / empty descriptions produce an empty token list.
            tokens = list(jieba.cut(text)) if text else []
            return filter_tokens(tokens, stop_words)

        jieba_udf = F.udf(jieba_tokenize, T.ArrayType(T.StringType()))
        return df.withColumn("tokens_array", jieba_udf(F.col("about")))

    def _build_embedding_pipeline(self):
        """Build the (unfitted) pipeline: document -> embedding -> PCA features."""
        document_assembler = DocumentAssembler() \
            .setInputCol("about") \
            .setOutputCol("document")

        embeddings = XlmRoBertaSentenceEmbeddings.pretrained("sent_xlm_roberta_base", "xx") \
            .setInputCols(["document"]) \
            .setOutputCol("sentence_embeddings")

        finisher = EmbeddingsFinisher() \
            .setInputCols(["sentence_embeddings"]) \
            .setOutputCols(["finished_embedding"]) \
            .setOutputAsVector(True) \
            .setCleanAnnotations(False)

        # The finisher emits an array of vectors; keep the first one as the
        # document vector fed to PCA.
        vector_extractor = SQLTransformer(
            statement="SELECT *, finished_embedding[0] AS embedding_vector FROM __THIS__"
        )

        pca = PCA(
            k=self.pca_dim,
            inputCol="embedding_vector",
            outputCol="pca_features"
        )

        return Pipeline(stages=[
            document_assembler,
            embeddings,
            finisher,
            vector_extractor,
            pca
        ])

    def _build_text_analysis_pipeline(self):
        """Build the (unfitted) CountVectorizer -> IDF pipeline over ``tokens_array``."""
        count_vectorizer = CountVectorizer(
            inputCol="tokens_array",
            outputCol="raw_features"
        )

        idf = IDF(
            inputCol="raw_features",
            outputCol="features"
        )

        return Pipeline(stages=[
            count_vectorizer,
            idf
        ])

    def _find_optimal_k(self, df_with_pca):
        """Fit KMeans for each k in ``[min_k, max_k]`` and return the best.

        Returns:
            Tuple ``(best_k, best_model)`` where ``best_k`` is a plain ``int``.
            On ties the smallest k wins.
        """
        best_k = None
        best_model = None
        best_score = None

        for k in range(self.min_k, self.max_k + 1):
            kmeans = KMeans(
                k=k,
                seed=self.random_state,
                featuresCol="pca_features",
                predictionCol="topic"
            )
            model = kmeans.fit(df_with_pca)
            predictions = model.transform(df_with_pca)
            score = self.evaluator.evaluate(predictions)
            print(f"  - k={k}: {self.k_method} score = {score:.4f}")

            # Keep only the current best model instead of holding every
            # candidate in driver memory; strict ">" keeps the first maximum.
            if best_score is None or score > best_score:
                best_k, best_model, best_score = k, model, score

        return best_k, best_model

    def fit(self, df):
        """Fit the embedding, clustering and TF-IDF models on ``df``.

        Args:
            df: DataFrame with an ``about`` column.

        Returns:
            ``self``, so calls can be chained.
        """
        print("Pipeline embedding...")
        embedding_pipeline = self._build_embedding_pipeline()
        self.embedding_model = embedding_pipeline.fit(df)
        # Cached because every candidate k re-reads the PCA features.
        df_with_pca = self.embedding_model.transform(df).cache()
        clustered_df = None

        try:
            print(f"Cluster: {self.min_k} to {self.max_k}...")
            self.k, self.kmeans_model = self._find_optimal_k(df_with_pca)
            print(f"Best k: {self.k}")

            clustered_df = self.kmeans_model.transform(df_with_pca).cache()
            tokenized_df = self._tokenize_with_jieba(clustered_df)

            print("TF-IDF pipeline...")
            text_pipeline = self._build_text_analysis_pipeline()
            self.text_pipeline_model = text_pipeline.fit(tokenized_df)

            self.cv_model = self.text_pipeline_model.stages[0]
            self.idf_model = self.text_pipeline_model.stages[1]

            self.transformed_df = self.text_pipeline_model.transform(tokenized_df).cache()
            # cache() is lazy: materialize the result while the upstream caches
            # still exist, otherwise the first later action would recompute
            # the embeddings from scratch.
            self.transformed_df.count()
        finally:
            # Release the intermediate caches even when fitting fails.
            if clustered_df is not None:
                clustered_df.unpersist()
            df_with_pca.unpersist()

        return self

    def transform(self, df):
        """Assign a topic to every row of ``df`` using the fitted models.

        Also replaces ``self.transformed_df`` with the fully transformed frame,
        which ``get_topics`` / ``get_topic_info`` read afterwards.

        Returns:
            DataFrame with the columns ``course_id`` and ``topic``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if (self.embedding_model is None or self.kmeans_model is None
                or self.text_pipeline_model is None):
            raise RuntimeError("TopicClustering.transform() called before fit()")

        df_with_pca = self.embedding_model.transform(df)
        clustered_df = self.kmeans_model.transform(df_with_pca)
        tokenized_df = self._tokenize_with_jieba(clustered_df)
        self.transformed_df = self.text_pipeline_model.transform(tokenized_df).cache()

        return self.transformed_df.select("course_id", "topic")

    def get_topics(self, n_words=10):
        """Return the top ``n_words`` keywords of every topic.

        A word's score within a topic is its occurrence count in that topic's
        documents multiplied by the fitted IDF weight. Words outside the
        CountVectorizer vocabulary are ignored.

        Returns:
            Dict mapping each topic id in ``range(self.k)`` to a list of words,
            best first (ties broken alphabetically for reproducibility).
        """
        vocab = self.cv_model.vocabulary
        idf = self.idf_model.idf.toArray()
        # O(1) vocabulary lookups instead of list.index() per word.
        word_index = {word: idx for idx, word in enumerate(vocab)}

        # One aggregation for all topics instead of one Spark job per topic.
        count_rows = (
            self.transformed_df
            .select("topic", F.explode("tokens_array").alias("word"))
            .groupBy("topic", "word")
            .count()
            .collect()
        )

        # Pre-seed every topic id so empty topics still appear in the result.
        scores_by_topic = {topic_id: [] for topic_id in range(self.k)}
        for row in count_rows:
            idx = word_index.get(row["word"])
            topic_scores = scores_by_topic.get(row["topic"])
            if idx is None or topic_scores is None:
                continue
            topic_scores.append((row["word"], row["count"] * idf[idx]))

        topic_words = {}
        for topic_id, word_scores in scores_by_topic.items():
            word_scores.sort(key=lambda pair: (-pair[1], pair[0]))
            topic_words[topic_id] = [word for word, _ in word_scores[:n_words]]

        return topic_words

    def get_topic_info(self, n_words=10):
        """Summarize every topic as a Spark DataFrame.

        Returns:
            DataFrame ordered by ``Topic`` with the columns ``Topic`` (id),
            ``Count`` (documents assigned to the topic) and ``Name``
            (``"<id>_<kw1>_<kw2>..."`` built from the top ``n_words`` keywords).
        """
        topic_keywords = self.get_topics(n_words=n_words)

        topic_counts = {
            row["topic"]: row["count"]
            for row in self.transformed_df.groupBy("topic").count().collect()
        }

        topic_infos = []
        for topic_id in sorted(topic_keywords.keys()):
            keywords = topic_keywords[topic_id]
            name = f"{topic_id}_" + "_".join(keywords)
            count = topic_counts.get(topic_id, 0)
            topic_infos.append(Row(Topic=topic_id, Count=count, Name=name))

        return self.spark.createDataFrame(topic_infos).orderBy("Topic")

In [None]:
# Search the number of topics in [100, 150], then label every course.
topicPipeline = TopicClustering(
    spark_session=spark,
    min_k=100,
    max_k=150,
)
topicPipeline.fit(df)
result_df = topicPipeline.transform(df)

Pipeline embedding...
sent_xlm_roberta_base download started this may take some time.
Approximate size to download 619.5 MB
[OK!]
Cluster: 100 to 150...
  - k=100: silhouette score = 0.1665
  - k=101: silhouette score = 0.1606
  - k=102: silhouette score = 0.1621
  - k=103: silhouette score = 0.1641
  - k=104: silhouette score = 0.1612
  - k=105: silhouette score = 0.1672
  - k=106: silhouette score = 0.1666
  - k=107: silhouette score = 0.1672
  - k=108: silhouette score = 0.1606
  - k=109: silhouette score = 0.1609
  - k=110: silhouette score = 0.1625
  - k=111: silhouette score = 0.1599
  - k=112: silhouette score = 0.1579
  - k=113: silhouette score = 0.1626
  - k=114: silhouette score = 0.1683
  - k=115: silhouette score = 0.1590
  - k=116: silhouette score = 0.1610
  - k=117: silhouette score = 0.1703
  - k=118: silhouette score = 0.1589
  - k=119: silhouette score = 0.1707
  - k=120: silhouette score = 0.1564
  - k=121: silhouette score = 0.1668
  - k=122: silhouette score = 0.1

In [None]:
# Print up to 150 topics (the upper bound of the k search) without truncating names.
topic_info_df = topicPipeline.get_topic_info()
topic_info_df.show(n=150, truncate=False)

+-----+-----+---------------------------------------------------------------------------------+
|Topic|Count|Name                                                                             |
+-----+-----+---------------------------------------------------------------------------------+
|0    |39   |0_医学_遗传学_疾病_研究_预防_治疗_循证_诊断_临床_药物                            |
|1    |19   |1_is_course_students_Economics_engineering_English_basic_academic_help_compulsory|
|2    |14   |2_创业_总监_创新_时代_凭_互联网_知识产权_医药_创业者_人工智能                    |
|3    |9    |3_我们_选择_花卉_你_为什么_大学_“_音乐_可能_吗                                   |
|4    |51   |4_学生_计算机_和_数据库_分析_能力_软件测试_设计_学习_技术                        |
|5    |35   |5_工程_构造_基础_及_和_油气_的_结构_与_方程                                      |
|6    |27   |6_你_创业_！_创新_说法_以案_成为_一起_加入_申报                                  |
|7    |37   |7_俄语_摄影_你_沟通_和_学习_语言_的_也_高等数学                                  |
|8    |8    |8_试验_设计_神经网络_分布式_因子_人工智能_模拟_adams_专家系统_样机               |
|9    |17  

In [None]:
import shutil
import glob
import os

# Export the (course_id, topic) assignments as a single tab-separated file.
output_dir = "/content/result_txt"
output_file = "/content/result.txt"

# saveAsTextFile refuses to write into an existing directory, so remove the
# output of any previous run to keep this cell re-runnable.
shutil.rmtree(output_dir, ignore_errors=True)

# Coalesce to a single partition so Spark writes one part file.
result_df.coalesce(1).rdd \
    .map(lambda row: "\t".join([str(c) for c in row])) \
    .saveAsTextFile(output_dir)

part_files = sorted(glob.glob(os.path.join(output_dir, "part-*")))
if not part_files:
    # Fail with a clear message instead of an IndexError on an empty glob.
    raise FileNotFoundError(f"Spark wrote no part file under {output_dir}")
shutil.move(part_files[0], output_file)

'/content/result.txt'