In [32]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import * 
from pyspark.sql.window import Window 
from pyspark.sql import functions as F

In [33]:
import os
import sys

# Point both the Spark driver and the Python workers at the interpreter that is
# running this kernel. A hard-coded absolute path only works on one machine and
# can silently select a different Python version than the kernel uses.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

from pyspark.sql import SparkSession

# Request 8 GB for the driver and 8 GB for the executor. getOrCreate() returns
# an already-running session if one exists in this process.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)


In [34]:
# Load the two search-log folders (presumably the 2022-06-01 and 2022-07-01
# snapshots, judging by the folder names -- confirm against the dataset).
data, data_1 = (
    spark.read.parquet(snapshot_path)
    for snapshot_path in (
        "D:/study/dataset/log_search/20220601",
        "D:/study/dataset/log_search/20220701",
    )
)

def process_log_search(data, alias_name="Most_Search"):
    """Return the single most-searched keyword of every user.

    Parameters
    ----------
    data : pyspark.sql.DataFrame
        Search log. Only the ``user_id`` and ``keyword`` columns are read;
        rows where either of them is null are ignored.
    alias_name : str, default "Most_Search"
        Name given to the keyword column in the returned DataFrame.

    Returns
    -------
    pyspark.sql.DataFrame
        Two columns, ``user_id`` and ``alias_name``, with one row per user.
        When several keywords share the highest count for a user, the
        keyword that sorts first in ascending order is returned.
    """
    searches = data.select("user_id", "keyword").filter(
        F.col("user_id").isNotNull() & F.col("keyword").isNotNull()
    )

    # Number of searches per (user, keyword) pair.
    keyword_counts = (
        searches.groupBy("user_id", "keyword")
        .count()
        .withColumnRenamed("count", "TotalSearch")
    )

    # No global orderBy here: the window below repartitions by user_id and
    # sorts inside each partition, so a prior full sort would be discarded.
    # The secondary sort on keyword makes row_number() deterministic on ties.
    ranking = Window.partitionBy("user_id").orderBy(
        F.col("TotalSearch").desc(), F.col("keyword").asc()
    )
    top_keyword = (
        keyword_counts.withColumn("Rank", F.row_number().over(ranking))
        .filter(F.col("Rank") == 1)
    )

    return top_keyword.withColumnRenamed("keyword", alias_name).select(
        "user_id", alias_name
    )



In [35]:
# Most-searched keyword per user, computed separately for each loaded log.
df = process_log_search(data, "Most_Search_t6")
df_1 = process_log_search(data_1, "Most_Search_t7")

# Inner join: only users present in both results are kept.
result = df.join(df_1, "user_id", "inner")

In [36]:
# Stack the two top-keyword columns into one column and de-duplicate, giving
# the set of distinct keywords that were anyone's top search in either log.
most_search_values = (
    result.select("Most_Search_t6")
    .union(result.select("Most_Search_t7"))
    .distinct()
)

# Disabled: optional CSV export of `df` through Spark.
# df.coalesce(1).write.mode("overwrite") \
#     .option("header", True) \
#     .csv("D:/study/output/Most_Search_t6")


In [38]:
import pandas as pd

# Collect the distinct keywords into a Python list. DataFrame.collect() is used
# directly instead of .rdd.map(lambda ...): the RDD route ships every row
# through Python worker processes only to read its first field.
unique_values = [row[0] for row in most_search_values.collect()]

# Build a one-column pandas DataFrame from the collected values.
df = pd.DataFrame({"most_search": unique_values})

# Write the CSV as UTF-8 with a BOM ("utf-8-sig").
df.to_csv("D:/study/output/most_search_values_inner.csv", index=False, encoding="utf-8-sig")

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# 1. Read the hand-labelled training file (it must be supplied by the user).
#    Expected columns: content, category. Rows missing either value are
#    dropped because TfidfVectorizer raises on NaN documents.
train_df = pd.read_csv("D:/study/output/labeled_data.csv").dropna(
    subset=["content", "category"]
)

# 2. Build the model pipeline: word uni-/bi-gram TF-IDF features feeding a
#    logistic-regression classifier.
model = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=20000)),
    ("clf", LogisticRegression(max_iter=1000)),
])

# 3. Train the model. astype(str) guards against non-string cells, which the
#    vectorizer cannot lower-case.
model.fit(train_df["content"].astype(str), train_df["category"])

# 4. Read the keyword list written by the export cell of this notebook
#    (most_search_values_inner.csv, column "most_search"). The column is
#    renamed to "content" so the output schema stays the same; the rename is a
#    no-op when the file already has a "content" column.
df_input = pd.read_csv("D:/study/output/most_search_values_inner.csv").rename(
    columns={"most_search": "content"}
)

# 5. Predict the category of every keyword.
df_input["predicted_category"] = model.predict(
    df_input["content"].fillna("").astype(str)
)

# 6. Write the result file (UTF-8 with BOM).
output_path = "D:/study/output/most_search_with_predicted_category.csv"
df_input.to_csv(output_path, index=False, encoding="utf-8-sig")

print("✅ Done! File đã lưu:", output_path)


✅ Done! File đã lưu: D:/study/output/most_search_with_predicted_category.csv
