# Competition 1 Predicting News Popularity

Team Name: 窩不知道誒

Team Members: 112501533 黃思誠 111062632 曾靖驊 112065527 劉承瑋


## Load Package


In [1]:
import datetime
import mmap
import os
import re
from dataclasses import dataclass

import catboost as cb
import lightgbm as lgb
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from optuna.integration.lightgbm import LightGBMTuner


In [2]:
# Some CONSTANTS
RANDOM_STATE = 42  # seed reused wherever this notebook needs reproducibility
INPUT_DIR = "./input/"
OUTPUT_DIR = "./output/"
MODEL_SAVE_DIR = "./model_saves/"

# Create the working directories. `exist_ok=True` makes the call idempotent
# and avoids the check-then-create race of `os.path.exists` + `os.makedirs`.
for directory in (INPUT_DIR, OUTPUT_DIR, MODEL_SAVE_DIR):
    os.makedirs(directory, exist_ok=True)

# Download the WordNet corpus used by WordNetLemmatizer
nltk.download("wordnet")


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/matcha0714/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Input & Output module

設定輸入及輸出的格式


In [3]:
def input():
    """Read ``train.csv`` and ``test.csv`` from ``INPUT_DIR``.

    NOTE: this function intentionally keeps its name even though it shadows
    the ``input`` builtin, because other cells call it by this name.

    Returns:
        A ``(dataset, testset)`` tuple. ``dataset.x`` / ``dataset.y`` are the
        "Page content" and "Popularity" columns of the training file;
        ``testset.x`` / ``testset.id`` are the "Page content" and "Id"
        columns of the test file.
    """

    @dataclass
    class Dataset:
        x: pd.Series
        y: pd.Series

    @dataclass
    class TestSet:
        x: pd.Series
        id: pd.Series

    train_frame = pd.read_csv(os.path.join(INPUT_DIR, "train.csv"))
    test_frame = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"))

    return (
        Dataset(x=train_frame["Page content"], y=train_frame["Popularity"]),
        TestSet(x=test_frame["Page content"], id=test_frame["Id"]),
    )


In [4]:
def output(id_test: pd.Series, y_pred: np.ndarray, info: str = None):
    """Write the predictions to ``OUTPUT_DIR`` as ``output_<tag>.csv``.

    Args:
        id_test: Ids of the test rows; written to the "Id" column.
        y_pred: Predicted values; written to the "Popularity" column.
        info: Tag used in the file name. When ``None``, the current local
            time formatted as ``%Y%m%d-%H%M%S`` is used instead.
    """
    # Identity comparison is the correct test for None.
    if info is None:
        tag = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    else:
        tag = info

    output_path = os.path.join(OUTPUT_DIR, f"output_{tag}.csv")

    # np.asarray(...).ravel() yields the same flat array as Series.ravel(),
    # which is deprecated in recent pandas releases.
    ids = np.asarray(id_test).ravel()
    output_df = pd.DataFrame({"Id": ids, "Popularity": y_pred})
    output_df.to_csv(output_path, index=False)


## Feature Engineering


以下是我們在這次 Competition 中所使用的 Data extraction 及 cleaning 的方式：

- title: 文章標題
- topic: 文章所屬的章節主題（footer 中的 article-topics）
- channel: 文章所屬的頻道（article 中的 data-channel）
- datetime: 文章發表的時間（年、月、日、時、分、秒），若有缺失值則用 "Thu, 01 Jan 2014 00:00:00 +0000" 填補，並且將文字如 Mon, Tue, etc. 及 Jan, Feb, etc. 轉換為數字
- content_length: 文章的長度

最後再對 Title, Topic, Channel 等文字特徵進行處理：

1. 通過 Tokenizer 提取單字
2. 通過 WordNetLemmatizer 還原單字原型
3. 通過 CountVectorizer 計算單字的數量作為 Features


In [5]:
def tokenizer(text):
    """Split *text* into a list of WordNet-lemmatized tokens.

    The text is normalized with three regex passes and then split on
    whitespace; each resulting piece is passed through
    ``WordNetLemmatizer.lemmatize``.
    """
    # Keep only the part before an in-word apostrophe ("don't" -> "don").
    without_suffix = re.sub(r"([\w]+)'[\w]+", lambda m: m.group(1), text)
    # Delete periods outright ("u.s." -> "us").
    without_dots = re.sub(r"\.", "", without_suffix)
    # Collapse every run of non-word characters into one space.
    normalized = re.sub(r"[^\w]+", " ", without_dots)

    lemmatizer = WordNetLemmatizer()
    tokens = []
    for word in re.split(r"\s+", normalized.strip()):
        tokens.append(lemmatizer.lemmatize(word))
    return tokens


def get_title(soup_texts):
    """Return a one-column DataFrame ("title") with each page's headline.

    For every parsed page the ``string`` of ``body.h1`` is stripped and
    lower-cased.
    """

    def extract(soup):
        headline = soup.body.h1.string
        return headline.strip().lower()

    titles = soup_texts.apply(extract)
    titles = titles.rename("title")
    return pd.DataFrame(titles)


def get_topic(soup_texts):
    """Return a one-column DataFrame ("topic") with each page's topic tags.

    The ``<a>`` elements inside the ``footer`` with class ``article-topics``
    are read; each link text is stripped, lower-cased, has its internal
    whitespace runs replaced by "-", and the results are joined with spaces.
    """

    def helper(text):
        footer = text.body.find("footer", {"class": "article-topics"})
        slugs = []
        for anchor in footer.find_all("a"):
            label = anchor.string.strip().lower()
            slugs.append(re.sub(r"\s+", "-", label))
        return " ".join(slugs)

    topics = soup_texts.apply(helper)
    return pd.DataFrame(topics.rename("topic"))


def get_datetime(soup_texts):
    """Return a DataFrame with the publication time split into components.

    For every parsed page the ``datetime`` attribute of its ``time`` element
    is parsed with the format ``"%a, %d %b %Y %H:%M:%S %z"``. Pages without a
    usable ``time`` element fall back to a fixed default timestamp.

    Args:
        soup_texts: Series of parsed pages exposing ``.time["datetime"]``.

    Returns:
        DataFrame with integer columns ``year``, ``month``, ``day``,
        ``hour``, ``minute`` and ``second``, one row per page.

    Raises:
        ValueError: If a present ``datetime`` string does not match the
            expected format (raised by ``strptime``).
    """
    fallback = "Thu, 01 Jan 2014 00:00:00 +0000"
    fields = ("year", "month", "day", "hour", "minute", "second")

    def helper(text):
        try:
            datetime_str = text.time["datetime"]
        except (TypeError, KeyError):
            # TypeError: there is no <time> element, so `text.time` is None.
            # KeyError: the <time> element has no "datetime" attribute.
            # Anything else is a real error and must not be hidden.
            datetime_str = fallback

        parsed = datetime.datetime.strptime(
            datetime_str, "%a, %d %b %Y %H:%M:%S %z"
        )

        return pd.Series({name: getattr(parsed, name) for name in fields})

    return pd.DataFrame(soup_texts.apply(helper))


def get_content_length(soup_texts):
    """Return a one-column DataFrame ("content_length") of article lengths.

    The length is the number of characters returned by ``get_text()`` on the
    ``section`` element with class ``article-content``.
    """

    def measure(soup):
        section = soup.find("section", class_="article-content")
        body_text = section.get_text()
        return len(body_text)

    lengths = soup_texts.apply(measure)
    lengths = lengths.rename("content_length")
    return pd.DataFrame(lengths)


def get_channel(soup_texts):
    """Return a one-column DataFrame ("channel") with each page's channel.

    The value is the ``data-channel`` attribute of ``body.article``, stripped
    and lower-cased.
    """

    def extract(soup):
        raw_channel = soup.body.article["data-channel"]
        return raw_channel.strip().lower()

    channels = soup_texts.apply(extract)
    channels = channels.rename("channel")
    return pd.DataFrame(channels)


def vectorize_texts(df: pd.DataFrame, vec_idx: list, vectorizer):
    """Vectorize the selected columns and concatenate the results.

    Args:
        df: Frame holding the columns to encode.
        vec_idx: Labels of the columns to pass to the vectorizer.
        vectorizer: Object whose ``fit_transform`` returns a sparse matrix.
            It is re-fitted on every column in turn.

    Returns:
        The per-column sparse frames concatenated side by side.
    """
    encoded_frames = []
    for column in vec_idx:
        matrix = vectorizer.fit_transform(df.loc[:, column])
        encoded_frames.append(pd.DataFrame.sparse.from_spmatrix(matrix))

    return pd.concat(encoded_frames, axis=1)


In [6]:
class PreProcessor:
    """Turn raw HTML pages into a feature table.

    ``fit_transform`` learns a bag-of-words vocabulary for the title, topic
    and channel columns; ``transform`` reuses that vocabulary on new pages.
    The returned frame contains the remaining (non-text) columns followed by
    the token-count columns.
    """

    # Text columns that are count-vectorized and then dropped from the output.
    _TEXT_COLUMNS = ("title", "topic", "channel")

    def __init__(self) -> None:
        # Fitted ColumnTransformer; populated by fit_transform().
        self.trans = None

    def add_feature(self, original_df, additional_df):
        """Append ``additional_df`` to ``original_df`` column-wise.

        When ``original_df`` is not a DataFrame (e.g. ``None``),
        ``additional_df`` is returned unchanged.
        """
        if not isinstance(original_df, pd.DataFrame):
            return additional_df
        return pd.concat([original_df, additional_df], axis=1)

    def preprocess(self, texts: pd.Series) -> pd.DataFrame:
        """Parse every page with BeautifulSoup and extract the raw features.

        Returns a frame with the title, topic, datetime components, channel
        and content_length columns, in that order.
        """
        soup_texts = texts.apply(BeautifulSoup, args=("html.parser",))
        title = get_title(soup_texts)
        topic = get_topic(soup_texts)
        channel = get_channel(soup_texts)
        # Named `published` so the `datetime` module is not shadowed here.
        published = get_datetime(soup_texts)
        content_length = get_content_length(soup_texts)

        return pd.concat([title, topic, published, channel, content_length], axis=1)

    def _combine(self, df: pd.DataFrame, encoded) -> pd.DataFrame:
        """Replace the text columns of ``df`` with their encoded counts."""
        # Reuse df's index: a default RangeIndex would make concat misalign
        # rows (and introduce NaNs) whenever the input index is not 0..n-1.
        counts = pd.DataFrame(encoded, index=df.index)
        remaining = df.drop(columns=list(self._TEXT_COLUMNS))
        return pd.concat([remaining, counts], axis=1)

    def fit_transform(self, texts: pd.Series):
        """Fit the text vectorizers on ``texts`` and return its features."""
        df = self.preprocess(texts)

        self.trans = ColumnTransformer(
            [
                (idx, CountVectorizer(tokenizer=tokenizer, lowercase=False), idx)
                for idx in self._TEXT_COLUMNS
            ],
            remainder="drop",
            sparse_threshold=0.0,
        )

        return self._combine(df, self.trans.fit_transform(df))

    def transform(self, texts: pd.Series):
        """Return the features of ``texts`` using the fitted vectorizers.

        ``fit_transform`` must have been called first.
        """
        df = self.preprocess(texts)
        return self._combine(df, self.trans.transform(df))


## Training


讀取 training, testing data，並進行上述的 Preprocess。再將 training data 切分為 train, validation set。


In [7]:
# Columns passed to CatBoost as categorical features
cat_features = ["year", "month", "day", "hour", "minute", "second"]

# Read the raw train / test data
dataset, testset = input()

# Map the label value -1 to 0
dataset.y = dataset.y.replace(-1, 0)

# Fit the feature extraction on the training pages, then reuse it on the test pages
preprocessor = PreProcessor()
dataset.x = preprocessor.fit_transform(dataset.x)
testset.x = preprocessor.transform(testset.x)

# Hold out 20% of the training data for validation
x_train, x_val, y_train, y_val = train_test_split(
    dataset.x,
    dataset.y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)




In [8]:
def trainer(model, x_train, y_train, x_val, y_val, train_params=None):
    """Fit ``model`` and print its ROC AUC on the train and validation sets.

    Args:
        model: Estimator exposing ``fit`` and ``predict_proba``.
        x_train, y_train: Training features and labels.
        x_val, y_val: Validation features and labels (used for scoring only;
            pass them through ``train_params`` if the model needs them
            during fitting).
        train_params: Extra keyword arguments forwarded to ``model.fit``.
            ``None`` (the default) means no extra arguments.

    Returns:
        ``(model, timestamp)`` where ``timestamp`` is the local time at which
        training started, formatted as ``%Y%m%d-%H%M%S``.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    # A None default avoids sharing one mutable dict across calls.
    fit_kwargs = {} if train_params is None else train_params
    model.fit(x_train, y_train, **fit_kwargs)

    # Scores use the second predict_proba column.
    train_score = roc_auc_score(y_train, model.predict_proba(x_train)[:, 1])
    val_score = roc_auc_score(y_val, model.predict_proba(x_val)[:, 1])
    print(f"Train score: {train_score:.4f}")
    print(f"Validation score: {val_score:.4f}")

    return model, timestamp


## Model Selection

經過多次實驗後，選擇使用 CatBoost、LightGBM 作為最後 Voting classifier 的組合。原本還有嘗試 XGBoost，不過由於 Training 的時間稍長，加上表現不如另外兩者，因此最後不採納。

- **CatBoost**

Train score: 0.6567

Validate score: 0.5934

- **LightGBM**

Train score: 0.6554

Validate score: 0.5843

- **Voting**

最後將兩個模型的 Prediction 乘上權重並相加作為最終的 Prediction。

而由於 CatBoost 在分數上的表現比 lightGBM 好不少，因此權重上就以 CatBoost 為主，lightGBM 主要是期待會降低 Over-fitting，並帶來一點分數上的提升。

Weight = [0.9, 0.1]


### CatBoost


In [9]:
# Extra arguments for CatBoostClassifier.fit: evaluate on the validation split
train_params = {
    "eval_set": [(x_val, y_val)],
}

# Shared CatBoost hyper-parameters (also reused when the model is re-created
# elsewhere). "od_pval" used to be listed twice with the same value; it is
# now given once.
cb_params = {
    "random_strength": 1.2,
    "border_count": 254,
    "bootstrap_type": "MVS",
    "mvs_reg": 0.3,
    "eval_metric": "AUC",
    "od_type": "IncToDec",
    "od_pval": 0.01,
    "loss_function": "CrossEntropy",
    "l2_leaf_reg": 3,
    "depth": 10,
    "learning_rate": 0.03,
    "random_seed": RANDOM_STATE,
    "thread_count": -1,
    "cat_features": cat_features,
    "verbose": False,
}

cb_model = cb.CatBoostClassifier(
    **cb_params,
    iterations=1000,
    use_best_model=True,
    early_stopping_rounds=200,
)

cb_model, cb_timestamp = trainer(cb_model, x_train, y_train, x_val, y_val, train_params)
# `best_iteration_` is a zero-based index, so the number of boosting rounds
# needed to reproduce the best model is one more than that index.
cb_iteration = cb_model.best_iteration_ + 1


Train score: 0.7392
Validation score: 0.5980


### LightGBM


In [10]:
# LightGBM hyper-parameters (reused when the model is re-created elsewhere)
lgb_params = dict(
    objective="binary",
    learning_rate=0.01,
    max_depth=10,
    num_leaves=34,
    min_child_samples=15,
    subsample=0.8,
    lambda_l1=0.1,
    lambda_l2=0.1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=-1,
)

# Extra arguments for LGBMClassifier.fit: score AUC on the validation split
# and stop once it has not improved for 100 rounds.
train_params = dict(
    eval_set=[(x_val, y_val)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(100)],
)

lgb_model = lgb.LGBMClassifier(n_estimators=1000, **lgb_params)

lgb_model, lgb_timestamp = trainer(
    lgb_model, x_train, y_train, x_val, y_val, train_params
)
# Remember the best round so the model can be re-created with that many trees
lgb_iteration = lgb_model.best_iteration_


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[86]	valid_0's auc: 0.587338	valid_0's binary_logloss: 0.682014
Train score: 0.6341
Validation score: 0.5873


### Voting


In [11]:
# Soft-voting weights, in estimator order: CatBoost first, LightGBM second
weights = [0.9, 0.1]

# Fresh, unfitted copies of both models, sized to the iteration counts found earlier
cb_member = cb.CatBoostClassifier(iterations=cb_iteration, **cb_params)
lgb_member = lgb.LGBMClassifier(n_estimators=lgb_iteration, **lgb_params)

voting_clf = VotingClassifier(
    estimators=[("cb", cb_member), ("lgb", lgb_member)],
    voting="soft",
    weights=weights,
)

# No extra fit arguments: the ensemble is trained on the training split only
voting_clf, voting_timestamp = trainer(voting_clf, x_train, y_train, x_val, y_val)


Train score: 0.7386
Validation score: 0.5994


## Output


In [12]:
# Keep the second predict_proba column as the submitted score
test_proba = voting_clf.predict_proba(testset.x)
y_pred = test_proba[:, 1]

# Tag the output file with the time the voting model started training
output(testset.id, y_pred, info=voting_timestamp)


## Result

Public score: 0.58483 (7th)

Private score: 0.60140 (3rd)


## Conclusion

這次的 Competition 主要的難點是在對文字特徵的處理，尤其是對 html 的文字做處理。因為以前都沒有類似的經驗，因此除了想到有用的 Feature 之外，如何去 Parse 出這些 Feature 也是個挑戰。除此之外，Parse 完後的文字資料要如何處理也是個問題，實驗課提到的 Preprocessing 方式的表現不到非常好。雖然可能是後續沒調整好，但找到一個更好的如 Tokenize, Stemming 也花了不少時間。

不過看原本 Public score 的排名我們原本不期待在 Private 會有多高的提升，沒想到直接上升到第 3 名，非常出乎我們的意料。而且在 Private 最好的成績的預測並不是在 Public 最好的，這點也令我們感到意外。看來這次的 Private 跟 Public 的資料分布上是有一定程度的落差的。
