# Competition 1: Predicting News Popularity


## Load Packages


In [46]:
import datetime
import os
import re
from dataclasses import dataclass, asdict
import joblib
import warnings
import mmap

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    HalvingRandomSearchCV,
)
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
    HashingVectorizer,
)
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

import lightgbm as lgb
import xgboost as xgb
import catboost as cb


In [16]:
# Shared configuration used by every cell below
RANDOM_STATE = 42
INPUT_DIR = "./input/"
OUTPUT_DIR = "./output/"
MODEL_SAVE_DIR = "./model_saves/"

# Make sure each working directory exists before anything reads or writes it
for _required_dir in (INPUT_DIR, OUTPUT_DIR, MODEL_SAVE_DIR):
    if not os.path.exists(_required_dir):
        os.makedirs(_required_dir)

# Fetch the NLTK stop-word corpus and keep the English word list
nltk.download("stopwords")
STOP = stopwords.words("english")

# Install a global "ignore" filter so no warnings are displayed
warnings.filterwarnings("ignore")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/huangmorris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Input & Output module


In [17]:
def input(chunksize: int = 1000, val_size: float = 0.2, stream: bool = True):
    """
    Load the training data (streamed or in one piece) together with the test set.

    Reads ``train.csv`` and ``test.csv`` from ``INPUT_DIR``.

    Args:
        chunksize: Number of rows per chunk when ``stream`` is True. When
            ``stream`` is False it is replaced by the line count of the
            training file, so a single chunk is expected to cover the file.
        val_size: Forwarded unchanged to ``get_stream``.
        stream: Whether to return a chunk generator (out-of-core learning)
            instead of a single loaded chunk.

    Returns:
        stream == True:
            (stream_generator, testset), where ``stream_generator`` is the
            generator produced by ``get_stream``.

        stream == False:
            (trainset, testset), where ``trainset`` is the first item yielded
            by ``get_stream`` (an object with ``x`` and ``y`` attributes).

        In both cases ``testset`` is a dataclass whose ``x`` is the
        "Page content" column and whose ``id`` is the "Id" column of the
        test file.
    """

    @dataclass
    class TestSet:
        x: pd.Series
        id: pd.Series

    train_path = os.path.join(INPUT_DIR, "train.csv")
    test_path = os.path.join(INPUT_DIR, "test.csv")

    if not stream:
        # Use the file's line count as the chunk size for a one-shot load
        chunksize = get_file_len(os.path.join(INPUT_DIR, "train.csv"))

    test_frame = pd.read_csv(test_path)
    testset = TestSet(test_frame["Page content"], test_frame["Id"])

    train_stream = get_stream(train_path, chunksize, val_size)
    if stream:
        return train_stream, testset

    # Non-streaming mode: hand back only the first chunk
    return next(train_stream), testset


# Stream generator used for out-of-core learning
def get_stream(train_path, chunksize, val_size=0.2):
    """
    Yield the training CSV chunk by chunk.

    Args:
        train_path: Path of the training CSV file.
        chunksize: Number of rows read per chunk.
        val_size: Accepted for interface compatibility; not used here, so
            no train/validation split is performed.

    Yields:
        Dataset: ``x`` holds the chunk's "Page content" column and ``y``
        holds its "Popularity" column.
    """

    @dataclass
    class Dataset:
        x: pd.Series
        y: pd.Series

    reader = pd.read_csv(train_path, chunksize=chunksize)
    yield from (
        Dataset(part["Page content"], part["Popularity"]) for part in reader
    )


def get_file_len(path):
    """
    Count the lines in a file.

    Every line is counted, including a header row and a final line that has
    no trailing newline.

    Args:
        path: Path of the file to inspect.

    Returns:
        int: Number of lines; 0 for an empty file.
    """
    with open(path, "rb") as f:
        # mmap refuses to map a zero-length file, so answer that case directly
        if os.fstat(f.fileno()).st_size == 0:
            return 0

        # The context manager releases the mapping even if reading fails
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as buf:
            # readline() returns b"" once the end of the mapping is reached
            return sum(1 for _ in iter(buf.readline, b""))


In [18]:
def output(id_test: pd.Series, y_pred: np.ndarray):
    """
    Write the predictions to a CSV file in ``OUTPUT_DIR``, named after the
    current time (``output_<YYYYMMDD-HHMMSS>.csv``).

    Args:
        id_test: Ids of the test rows (pandas Series or any array-like).
        y_pred: Predictions, one per id.

    Returns:
        None
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    output_filename = f"output_{timestamp}.csv"
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    # np.asarray(...).ravel() avoids Series.ravel(), which is deprecated in
    # recent pandas releases, and also accepts plain arrays and lists
    ids = np.asarray(id_test).ravel()

    output_df = pd.DataFrame({"Id": ids, "Popularity": y_pred})
    output_df.to_csv(output_path, index=False)


## Feature Engineering


### Data cleaning

目前基本上是教學 Notebook 中的內容。


In [19]:
def text_cleaner(text: str):
    """
    Clean a raw HTML document: drop the markup, lowercase the text, collapse
    non-word characters into single spaces, and move emoticons to the end.

    Args:
        text: Raw text (HTML) to clean.

    Returns:
        str: The cleaned text, followed by a space and the space-separated
        emoticons found in it (with any "-" nose removed).
    """
    # Matches emoticons such as :), :-P, :-D
    emoticon_pattern = r"(?::|;|=|X)(?:-)?(?:\)|\(|D|P)"

    # Strip the HTML tags
    plain = BeautifulSoup(text, "html.parser").text

    # Pull the emoticons out before the punctuation is removed
    found = re.findall(emoticon_pattern, plain)
    without_emoticons = re.sub(emoticon_pattern, "", plain)

    # Lowercase and replace every run of non-word characters by one space
    words = re.sub(r"[\W]+", " ", without_emoticons.lower())

    # Re-attach the emoticons at the end, without their noses
    noseless = " ".join(found).replace("-", "")
    return words + " " + noseless


def tokenizer_stem_nostop(text):
    """
    Tokenize a text, drop stop words and tokens that do not start with an
    ASCII letter, and stem the rest with the Porter stemmer.

    Args:
        text: str, the text to tokenize.

    Returns:
        list: The stems of the kept tokens, in order of appearance.
    """
    stemmer = PorterStemmer()
    stems = []

    for token in word_tokenize(text, preserve_line=True):
        if token in STOP:
            continue
        # re.match anchors at the start, so only the first character matters
        if not re.match("[a-zA-Z]+", token):
            continue
        stems.append(stemmer.stem(token))

    return stems


In [38]:
def hash_vectorize(feature_params, texts):
    """
    Turn texts into a dense DataFrame of hashed token features.

    Args:
        feature_params: Object providing ``hash_feature_num``, the number of
            hashed feature columns.
            NOTE(review): confirm the caller's parameter object actually
            defines ``hash_feature_num``.
        texts: Iterable of raw documents.

    Returns:
        pd.DataFrame: One row per document, one column per hash bucket.
    """
    vectorizer = HashingVectorizer(
        n_features=feature_params.hash_feature_num,
        preprocessor=text_cleaner,
        tokenizer=tokenizer_stem_nostop,
    )

    hashed = vectorizer.fit_transform(texts)

    # The sparse result is densified before being wrapped in a DataFrame
    dense = hashed.toarray()
    return pd.DataFrame(dense)


def get_title(texts: pd.Series):
    """
    Extract the lowercased, stripped text of the first ``<h1>`` of each
    document.

    Documents without an ``<h1>`` yield an empty title instead of raising.

    Args:
        texts: Series of raw HTML documents.

    Returns:
        pd.DataFrame: A single "title" column, indexed like ``texts``.
    """

    def extract_title(html):
        soup = BeautifulSoup(html, "html.parser")

        # Look inside <body> when there is one, else in the whole document
        scope = soup.body if soup.body is not None else soup
        heading = scope.find("h1")
        if heading is None:
            return ""

        # get_text() also works when the heading contains nested markup,
        # where .string would be None
        return heading.get_text().strip().lower()

    return pd.DataFrame(texts.apply(extract_title).rename("title"))


def get_topic(texts: pd.Series):
    """
    Extract the topic links found in the ``<footer>`` of each document.

    Each link text is stripped, lowercased, and has its whitespace runs
    replaced by "-", so every topic becomes a single token; the topics of a
    document are joined with spaces. Documents without a ``<footer>`` yield
    an empty string instead of raising.

    Args:
        texts: Series of raw HTML documents.

    Returns:
        pd.DataFrame: A single "topic" column, indexed like ``texts``.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape
    whitespace = re.compile(r"\s+")

    def helper(text):
        footer = BeautifulSoup(text, "html.parser").footer
        if footer is None:
            return ""

        # get_text() also works for anchors with nested markup, where
        # .string would be None
        topics = [
            whitespace.sub("-", a.get_text().strip().lower())
            for a in footer.find_all("a")
        ]
        return " ".join(topics)

    return pd.DataFrame(texts.apply(helper).rename("topic"))


def get_datetime(texts: pd.Series):
    """
    Extract the publication timestamp of each document as numeric columns.

    The timestamp is read from the ``datetime`` attribute of the first
    ``<time>`` tag and parsed with the format
    ``"%a, %d %b %Y %H:%M:%S %z"``. Documents whose timestamp is missing or
    cannot be parsed fall back to 2014-01-01 00:00:00 +0000.

    Args:
        texts: Series of raw HTML documents.

    Returns:
        pd.DataFrame: Columns "year", "month", "day", "hour", "minute" and
        "second", indexed like ``texts``.
    """
    fields = ["year", "month", "day", "hour", "minute", "second"]
    fallback = datetime.datetime(2014, 1, 1, tzinfo=datetime.timezone.utc)

    def parse(text):
        try:
            raw = BeautifulSoup(text, "html.parser").time["datetime"]
            return datetime.datetime.strptime(raw, "%a, %d %b %Y %H:%M:%S %z")
        except (AttributeError, KeyError, TypeError, ValueError):
            # No <time> tag, no datetime attribute, non-text input, or an
            # unexpected timestamp format
            return fallback

    rows = []
    for text in texts:
        moment = parse(text)
        rows.append([getattr(moment, field) for field in fields])

    # Building from a list of rows avoids creating one Series per document
    return pd.DataFrame(rows, index=texts.index, columns=fields)


def vectorize_texts(df: pd.DataFrame, vec_idx: list, vectorizer):
    """
    Vectorize several text columns and place the results side by side.

    The vectorizer is re-fitted on each column in turn via
    ``fit_transform``.

    Args:
        df: DataFrame holding the text columns.
        vec_idx: Labels of the columns to vectorize.
        vectorizer: Object exposing ``fit_transform`` that returns a scipy
            sparse matrix.

    Returns:
        pd.DataFrame: Sparse-backed frame with the per-column results
        concatenated along the column axis.
    """
    frames = []
    for column in vec_idx:
        matrix = vectorizer.fit_transform(df.loc[:, column])
        frames.append(pd.DataFrame.sparse.from_spmatrix(matrix))

    return pd.concat(frames, axis=1)


### 調用函式

用於更輕鬆的調用之後新增的比如：加新特徵或其他前處理的 Function。

還有很多要調整。


In [21]:
@dataclass
class FeatureParams:
    """Switches and settings that control feature extraction."""

    # Which extracted features to include
    do_get_title: bool = True
    do_get_topic: bool = True
    do_get_datetime: bool = True

    # Vectorizer applied to the extracted text columns; "count" is the only
    # value acted upon by get_features
    vectorizer: str = "count"

    # Number of hashed feature columns read by hash_vectorize
    hash_feature_num: int = 2**10


def get_features(feature_params: "FeatureParams", texts: pd.Series):
    """
    Build the feature DataFrame for a series of raw documents.

    Depending on ``feature_params``, the title, topic and datetime features
    are extracted. When ``feature_params.vectorizer`` is "count", the text
    features (title/topic) are turned into token counts. The raw text
    columns are always dropped from the result.

    Args:
        feature_params: FeatureParams, selects the features to generate.
        texts: pd.Series of raw documents.

    Returns:
        pd.DataFrame: One row per document, indexed like ``texts``. When
        every feature is disabled, the frame has no columns.
    """
    frames = []
    vec_idx = []

    # TODO: add further feature engineering

    if feature_params.do_get_title:
        frames.append(get_title(texts))
        vec_idx.append("title")

    if feature_params.do_get_topic:
        frames.append(get_topic(texts))
        vec_idx.append("topic")

    if feature_params.do_get_datetime:
        frames.append(get_datetime(texts))

    if not frames:
        # Nothing was requested: return an empty frame instead of failing
        return pd.DataFrame(index=texts.index)

    df = pd.concat(frames, axis=1)

    if feature_params.vectorizer == "count" and vec_idx:
        vectorized = vectorize_texts(
            df, vec_idx, CountVectorizer(tokenizer=tokenizer_stem_nostop)
        )
        # The vectorized frame is numbered from 0; give it the documents'
        # index so the column-wise concat lines rows up by position even
        # when ``texts`` is not indexed 0..n-1 (e.g. a later stream chunk)
        vectorized.index = df.index
        df = pd.concat([df, vectorized], axis=1)

    return df.drop(columns=vec_idx)


## Training


In [31]:
def training(model, x_train, y_train):
    """
    Cross-validate ``model`` with 10 folds (ROC AUC) and return the fold
    estimator that achieved the highest validation score.

    The train and validation scores of that fold are printed.

    Args:
        model: Estimator to cross-validate.
        x_train: Training features; its ``.values`` are passed on.
        y_train: Training labels; its ``.values`` are passed on.

    Returns:
        The fitted estimator of the best-scoring fold.
    """
    cv_options = dict(
        cv=10,
        scoring="roc_auc",
        return_train_score=True,
        return_estimator=True,
        n_jobs=-1,
    )
    folds = cross_validate(model, x_train.values, y_train.values, **cv_options)

    # Index of the fold with the highest validation score
    winner = np.argmax(folds["test_score"])
    train_score = folds["train_score"][winner]
    val_score = folds["test_score"][winner]

    print(f"Train score: {train_score:.4f}")
    print(f"Validation score: {val_score:.4f}")

    return folds["estimator"][winner]


In [34]:
# Keyword arguments for the CatBoost model
cb_params = dict(
    n_estimators=300,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=RANDOM_STATE,
    logging_level="Silent",
    thread_count=4,
)


In [41]:
def pipeline(Model, model_params, model_name, feature_params):
    """
    Run the whole flow: load data, build features, train, save, predict.

    Args:
        Model: Estimator class, instantiated as ``Model(**model_params)``.
        model_params: Keyword arguments for the estimator.
        model_name: Prefix of the saved model's file name.
        feature_params: Feature settings passed to ``get_features``.

    Returns:
        None. The fitted model is saved under ``MODEL_SAVE_DIR`` and the
        test predictions are written through ``output``.
    """
    # The file name carries the time at which the run started
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_{stamp}.joblib")

    train, test = input(stream=False)

    train.x = get_features(feature_params, train.x)
    # Map the -1 label to 0
    train.y = train.y.replace(-1, 0)
    # NOTE(review): features are built separately for the test set — confirm
    # its columns match those of the training set
    test.x = get_features(feature_params, test.x)

    fitted = training(Model(**model_params), train.x, train.y)
    joblib.dump(fitted, save_path)

    output(test.id, fitted.predict(test.x.values))


In [50]:
def xgb_training(x_train, y_train):
    """
    Tune an XGBoost classifier with a successive-halving random search,
    save the best estimator, and return it.

    Args:
        x_train: Training features.
        y_train: Training labels.

    Returns:
        The best estimator found by the search. It is also saved under
        ``MODEL_SAVE_DIR`` as ``xgb_<YYYYMMDD-HHMMSS>.joblib``.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    model_filename = f"xgb_{timestamp}.joblib"
    model_save_path = os.path.join(MODEL_SAVE_DIR, model_filename)

    # No early_stopping_rounds here: early stopping needs an evaluation set
    # at fit time, and the search below calls fit(X, y) without one, which
    # makes XGBoost raise
    params = {
        "n_estimators": 300,
        "objective": "binary:logistic",
        "colsample_bytree": 0.3,
        "random_state": RANDOM_STATE,
    }

    # Candidate values sampled by the search
    params_distribution = {
        "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
        "max_depth": [3, 5, 7, 9, 11],
        "alpha": [0, 1, 3, 5, 7, 9, 10],
    }

    model = xgb.XGBClassifier(**params)

    cv = HalvingRandomSearchCV(
        model,
        params_distribution,
        factor=3,
        n_jobs=1,
        scoring="roc_auc",
        random_state=RANDOM_STATE,
    )
    cv.fit(x_train, y_train)

    print(f"Best validation score: {cv.best_score_:.4f}")

    model = cv.best_estimator_
    joblib.dump(model, model_save_path)

    return model


In [None]:
# Driver cell: build topic (but not title) features and tune XGBoost on them
feature_params = FeatureParams(do_get_title=False, do_get_topic=True)

# Load the training data in one piece (no streaming) together with the test set
dataset, testset = input(stream=False)
dataset.x = get_features(feature_params, dataset.x)
# Map the -1 label to 0
dataset.y = dataset.y.replace(-1, 0)
# NOTE(review): test features are built by a separate get_features call —
# confirm the resulting columns match the training features
testset.x = get_features(feature_params, testset.x)

xgb_model = xgb_training(dataset.x.values, dataset.y.values)
