# Competition 1 Predicting News Popularity


## Load Package


In [95]:
import datetime
import os
import re
from dataclasses import dataclass, asdict
import joblib
import warnings
import mmap

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy as sp
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    HalvingRandomSearchCV,
)
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
    HashingVectorizer,
)
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer

import lightgbm as lgb
import xgboost as xgb
import catboost as cb


In [2]:
# Global constants shared by every cell of the notebook.
RANDOM_STATE = 42
INPUT_DIR = "./input/"
OUTPUT_DIR = "./output/"
MODEL_SAVE_DIR = "./model_saves/"

# Create the working directories if they do not exist yet. exist_ok=True
# replaces the separate os.path.exists() check, which leaves a window in
# which another process can create the directory first.
for _directory in (INPUT_DIR, OUTPUT_DIR, MODEL_SAVE_DIR):
    os.makedirs(_directory, exist_ok=True)

# Download the nltk stopword list and keep the English entries in STOP.
nltk.download("stopwords")
STOP = stopwords.words("english")
# WordNet data is needed by WordNetLemmatizer, which tokenizer() instantiates.
nltk.download("wordnet")

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/matcha0714/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/matcha0714/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Input & Output module


In [210]:
def input(chunksize: int = 1000, val_size: float = 0.2, stream: bool = False):
    """Load the training data and the test set from ``INPUT_DIR``.

    Parameters
    ----------
    chunksize : int
        Rows per chunk when ``stream`` is True. When ``stream`` is False the
        value is replaced by the line count of ``train.csv`` so that the
        first chunk holds the whole file.
    val_size : float
        Forwarded unchanged to ``get_stream``.
    stream : bool
        True returns the chunk generator itself; False returns only its
        first chunk.

    Returns
    -------
    tuple
        ``(train, test)`` where ``train`` is either the generator returned by
        ``get_stream`` or its first item, and ``test`` is a ``TestSet`` with
        the test texts in ``x`` and the test ids in ``id``.
    """

    @dataclass
    class TestSet:
        x: pd.Series
        id: pd.Series

    train_path = os.path.join(INPUT_DIR, "train.csv")
    test_path = os.path.join(INPUT_DIR, "test.csv")

    # Without streaming, one chunk has to be large enough for every row.
    if not stream:
        chunksize = get_file_len(train_path)

    df_test = pd.read_csv(test_path)
    test_set = TestSet(df_test["Page content"], df_test["Id"])

    train_chunks = get_stream(train_path, chunksize, val_size)
    if stream:
        return train_chunks, test_set
    return next(train_chunks), test_set


# Stream generator used for out-of-core learning.
def get_stream(train_path, chunksize, val_size=0.2):
    """Yield the training CSV chunk by chunk as ``Dataset`` records.

    Parameters
    ----------
    train_path : path of the CSV file to read.
    chunksize : number of rows pandas reads per chunk.
    val_size : accepted for interface compatibility; this function does not
        use it.

    Yields
    ------
    Dataset
        ``x`` holds the chunk's "Page content" column and ``y`` its
        "Popularity" column.
    """

    @dataclass
    class Dataset:
        x: pd.Series
        y: pd.Series

    # NOTE: no train/validation split is performed here; each chunk is
    # passed on whole and callers split it themselves.
    reader = pd.read_csv(train_path, chunksize=chunksize)
    for frame in reader:
        yield Dataset(x=frame["Page content"], y=frame["Popularity"])


def get_file_len(path):
    """Return the number of lines in the file at ``path``.

    Every line is counted, including a header row and a final line that has
    no trailing newline. An empty file yields 0.
    """
    # mmap cannot map a zero-length file, so answer that case directly.
    if os.path.getsize(path) == 0:
        return 0

    with open(path, "rb") as f:
        # The context manager releases the mapping even if reading raises.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as buf:
            # readline() returns b"" at end of file, which stops the iterator.
            return sum(1 for _ in iter(buf.readline, b""))


In [242]:
def output(id_test: pd.Series, y_pred: np.ndarray, info: str = None):
    """Write predictions to ``OUTPUT_DIR/output_<suffix>.csv``.

    Parameters
    ----------
    id_test : ids of the test rows, written to the "Id" column.
    y_pred : predicted values, written to the "Popularity" column.
    info : file-name suffix; when None the current local time formatted as
        ``%Y%m%d-%H%M%S`` is used instead.
    """
    if info is None:
        suffix = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    else:
        suffix = info

    output_path = os.path.join(OUTPUT_DIR, f"output_{suffix}.csv")

    # np.asarray(...).ravel() flattens both Series and plain arrays and
    # avoids Series.ravel(), which recent pandas versions deprecate.
    ids = np.asarray(id_test).ravel()
    output_df = pd.DataFrame({"Id": ids, "Popularity": y_pred})
    output_df.to_csv(output_path, index=False)


## Feature Engineering


In [133]:
def text_cleaner(self, text: str):
    """Strip HTML from ``text`` and return lower-cased words plus emoticons.

    The markup is removed, emoticons are pulled out of the text, every run of
    non-word characters becomes a single space, and the emoticons (without
    their "-" nose) are appended after the words, separated by spaces.
    """
    # Keep only the text content of the HTML.
    plain = BeautifulSoup(text, "html.parser").text

    # Emoticons such as :), :-P or =D are collected first and then removed,
    # so that the punctuation clean-up below does not destroy them.
    emoticon_pattern = r"(?::|;|=|X)(?:-)?(?:\)|\(|D|P)"
    found = re.findall(emoticon_pattern, plain)
    without_emoticons = re.sub(emoticon_pattern, "", plain)

    words = re.sub(r"[\W]+", " ", without_emoticons.lower())
    # Dropping "-" removes the nose, so ":-)" and ":)" become the same token.
    noseless = " ".join(found).replace("-", "")
    return words + " " + noseless


def tokenizer(self, text):
    """Split ``text`` into lemmatized tokens.

    Steps: cut a word at its apostrophe ("don't" -> "don"), delete dots,
    turn every other run of non-word characters into a space, split on
    whitespace and lemmatize each piece with WordNetLemmatizer.

    Returns a list of strings. Note that an empty or all-punctuation input
    produces a single empty-string token, because splitting "" on whitespace
    yields [""].
    """
    # Raw strings keep the regex escapes (\w, \., \s) from being read as
    # invalid string escapes, which newer Python versions warn about.
    text = re.sub(r"([\w]+)'[\w]+", lambda match_obj: match_obj.group(1), text)
    text = re.sub(r"\.", "", text)
    text = re.sub(r"[^\w]+", " ", text)
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(s) for s in re.split(r"\s+", text.strip())]


def get_title(self, soup_texts):
    """Return a one-column DataFrame ("title") built from each document's h1.

    ``soup_texts`` is a Series of parsed documents; for each one the string
    of ``body.h1`` is stripped and lower-cased. The input index is kept.
    """

    def extract(soup):
        heading = soup.body.h1.string
        return heading.strip().lower()

    titles = soup_texts.apply(extract)
    return pd.DataFrame(titles.rename("title"))


def get_topic(self, soup_texts):
    """Return a one-column DataFrame ("topic") with each document's topics.

    For every parsed document the ``<a>`` elements inside the
    ``<footer class="article-topics">`` element are read; each link text is
    stripped, lower-cased and has its whitespace runs replaced by "-" so a
    multi-word topic stays one token. The topics of a document are joined
    with single spaces. The input index is kept.
    """

    def helper(text):
        a_list = text.body.find("footer", {"class": "article-topics"}).find_all("a")
        # Raw string: "\s" in a plain string literal is an invalid escape.
        topics = [re.sub(r"\s+", "-", a.string.strip().lower()) for a in a_list]
        return " ".join(topics)

    return pd.DataFrame(soup_texts.apply(helper).rename("topic"))


def get_datetime(self, soup_texts):
    """Return a DataFrame with the publication time split into parts.

    For each parsed document the ``datetime`` attribute of its ``time``
    element is parsed with the format ``%a, %d %b %Y %H:%M:%S %z``. When the
    element or the attribute is missing, a fixed fallback timestamp is used.
    A timestamp that is present but malformed still raises ValueError from
    ``strptime``.

    Returns a DataFrame with the columns year, month, day, hour, minute and
    second, indexed like ``soup_texts``.
    """
    fallback = "Thu, 01 Jan 2014 00:00:00 +0000"
    time_format = "%a, %d %b %Y %H:%M:%S %z"

    def helper(text):
        try:
            datetime_str = text.time["datetime"]
        except (AttributeError, KeyError, TypeError):
            # No <time> element (lookup yields None -> TypeError) or no
            # "datetime" attribute (KeyError). Only these lookup failures
            # are handled, so KeyboardInterrupt and unrelated errors still
            # propagate instead of being swallowed.
            datetime_str = fallback

        datetime_obj = datetime.datetime.strptime(datetime_str, time_format)

        return pd.Series(
            {
                "year": datetime_obj.year,
                "month": datetime_obj.month,
                "day": datetime_obj.day,
                "hour": datetime_obj.hour,
                "minute": datetime_obj.minute,
                "second": datetime_obj.second,
            },
        )

    return pd.DataFrame(soup_texts.apply(helper))


def get_content_length(self, soup_texts):
    """Return a one-column DataFrame ("content_length") of article lengths.

    The length is the number of characters in the text of each document's
    ``<section class="article-content">`` element. The input index is kept.
    """

    def count_chars(soup):
        section = soup.find("section", class_="article-content")
        return len(section.get_text())

    lengths = soup_texts.apply(count_chars)
    return pd.DataFrame(lengths.rename("content_length"))


def vectorize_texts(self, df: pd.DataFrame, vec_idx: list, vectorizer):
    """Vectorize the listed columns of ``df`` and join the results.

    ``vectorizer.fit_transform`` is called once per column in ``vec_idx``
    (so the vectorizer is re-fitted for every column); each sparse result is
    wrapped in a sparse DataFrame and the frames are concatenated side by
    side. Column labels restart at 0 for every vectorized column.
    """
    frames = []
    for column in vec_idx:
        matrix = vectorizer.fit_transform(df.loc[:, column])
        frames.append(pd.DataFrame.sparse.from_spmatrix(matrix))

    return pd.concat(frames, axis=1)


### 調用函式

用於更輕鬆地調用之後新增的 Function，例如：加入新特徵或其他前處理。

還有很多要調整。


In [232]:
@dataclass
class FeatureParams:
    """Bundle of feature-extraction switches; every switch defaults to on."""

    # One flag per extractor; presumably each enables the get_* function of
    # the same name -- confirm against the code that consumes this class.
    do_get_title: bool = True
    do_get_topic: bool = True
    do_get_datetime: bool = True
    do_get_content_length: bool = True

    # Name of the text vectorizer to use.
    # NOTE(review): nothing in the visible code reads this field.
    vectorizer: str = "count"


class FeaturePreprocessor:
    """Turn raw article HTML into a numeric feature table.

    Each constructor flag switches one extractor on or off. Text features
    (title, topic) are turned into token counts by a ``CountVectorizer``
    inside a ``ColumnTransformer`` that is fitted in ``fit_transform`` and
    reused in ``transform``; the raw text columns are dropped from the
    result. The remaining columns (date parts, content length) are passed
    through as they are.
    """

    def __init__(
        self,
        get_title: bool = True,
        get_topic: bool = True,
        get_datetime: bool = True,
        get_content_length: bool = True,
    ) -> None:
        self.get_title = get_title
        self.get_topic = get_topic
        self.get_datetime = get_datetime
        self.get_content_length = get_content_length
        # Fitted ColumnTransformer; stays None until fit_transform has
        # vectorized at least one text column.
        self.trans = None

    # Module-level helpers bound as private methods of the class.
    __text_cleaner = text_cleaner
    __tokenizer = tokenizer
    __get_title = get_title
    __get_topic = get_topic
    __get_datetime = get_datetime
    __get_content_length = get_content_length

    def __add_feature(self, original_df, additional_df):
        """Append ``additional_df`` column-wise, or return it if nothing exists yet."""
        if not isinstance(original_df, pd.DataFrame):
            return additional_df
        return pd.concat([original_df, additional_df], axis=1)

    def __get_feature(self, texts):
        """Run the enabled extractors on ``texts``.

        Returns ``(df, vec_idx)``: the combined feature table and the names
        of the text columns that still have to be vectorized.

        Raises ValueError when every extractor is disabled.
        """
        df = None
        vec_idx = []

        # Parse each document once and share the result between extractors.
        soup_texts = texts.apply(lambda x: BeautifulSoup(x, "html.parser"))

        if self.get_title:
            df = self.__add_feature(df, self.__get_title(soup_texts))
            vec_idx += ["title"]

        if self.get_topic:
            df = self.__add_feature(df, self.__get_topic(soup_texts))
            vec_idx += ["topic"]

        if self.get_datetime:
            df = self.__add_feature(df, self.__get_datetime(soup_texts))

        if self.get_content_length:
            df = self.__add_feature(df, self.__get_content_length(soup_texts))

        if df is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("FeaturePreprocessor needs at least one feature enabled")

        print(f"Features: {df.columns.tolist()}")

        return df, vec_idx

    def fit_transform(self, texts_train):
        """Extract features from ``texts_train`` and fit the text vectorizers.

        Returns the feature table with the raw text columns replaced by
        their token-count columns (labelled 0..n-1).
        """
        df, vec_idx = self.__get_feature(texts_train)

        if vec_idx:
            self.trans = ColumnTransformer(
                [
                    (
                        idx,
                        CountVectorizer(tokenizer=self.__tokenizer, lowercase=False),
                        idx,
                    )
                    for idx in vec_idx
                ],
                remainder="drop",
                # 0.0 makes the transformer always return a dense array.
                sparse_threshold=0.0,
            )

            # Reuse df's index so the count columns line up row by row even
            # when the input index is not 0..n-1 (e.g. a later CSV chunk).
            additional_df = pd.DataFrame(self.trans.fit_transform(df), index=df.index)
            df = self.__add_feature(df, additional_df)

        return df.drop(columns=vec_idx)

    def transform(self, texts):
        """Extract features from ``texts`` using the already fitted vectorizers.

        Raises RuntimeError if text features are enabled but
        ``fit_transform`` has not been called yet.
        """
        df, vec_idx = self.__get_feature(texts)

        if vec_idx:
            if self.trans is None:
                raise RuntimeError("transform() called before fit_transform()")

            # Same index alignment as in fit_transform.
            additional_df = pd.DataFrame(self.trans.transform(df), index=df.index)
            df = self.__add_feature(df, additional_df)

        return df.drop(columns=vec_idx)


## Training


In [233]:
# Load the training data as one single chunk (stream=False) plus the test set.
dataset, testset = input(stream=False)
# Title extraction is switched off; the other extractors keep their defaults.
preprocessor = FeaturePreprocessor(get_title=False)

# Fit the vectorizer on the training texts, then reuse it for the test texts.
dataset.x = preprocessor.fit_transform(dataset.x)
testset.x = preprocessor.transform(testset.x)
# Replace the label value -1 with 0; other values are left unchanged.
dataset.y = dataset.y.replace(-1, 0)

# Bare expression: the notebook displays the resulting feature table.
dataset.x


Features: ['topic', 'year', 'month', 'day', 'hour', 'minute', 'second', 'content_length']
Features: ['topic', 'year', 'month', 'day', 'hour', 'minute', 'second', 'content_length']


Unnamed: 0,year,month,day,hour,minute,second,content_length,0,1,2,...,9859,9860,9861,9862,9863,9864,9865,9866,9867,9868
0,2013,6,19,15,4,30,3591,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2013,3,28,17,40,55,1843,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2014,5,7,19,15,20,6646,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2013,10,11,2,26,50,1821,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2014,4,17,3,31,43,8919,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27638,2014,4,8,16,26,31,1776,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27639,2014,7,9,1,3,24,1890,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27640,2014,7,10,12,30,13,1274,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27641,2013,4,16,20,49,16,2657,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [234]:
# CatBoost
# Hold out 20% of the rows for validation; the split is reproducible
# through RANDOM_STATE.
x_train, x_val, y_train, y_val = train_test_split(
    dataset.x, dataset.y, test_size=0.2, random_state=RANDOM_STATE
)


# Positional indices of the columns treated as categorical. With the
# preprocessing used above (title off) the first six columns of dataset.x
# are year, month, day, hour, minute, second -- re-check these indices
# whenever the enabled features change.
cat_features = [0, 1, 2, 3, 4, 5]

# Pool over the full training set; only the (commented-out) cb.cv call
# below consumes it.
pool = cb.Pool(dataset.x, dataset.y, cat_features=cat_features)

# Training parameters. An earlier two-entry dict (l2_leaf_reg=2.5,
# random_strength=0.8) was overwritten by this one before any use and has
# been removed.
cb_params = {
    "depth": 10,
    "learning_rate": 0.04,
    "l2_leaf_reg": 3,
    "loss_function": "CrossEntropy",
    "border_count": 254,
    "od_pval": 0.01,
    "od_type": "IncToDec",
    "thread_count": -1,
    "random_strength": 1.2,
    "eval_metric": "AUC",
    "num_trees": 2200,
    "random_state": RANDOM_STATE,
    "bootstrap_type": "MVS",
    "mvs_reg": 0.3,
    # "bagging_temperature": 1.2,
}

# cv_result = cb.cv(
#     pool,
#     cb_params,
#     num_boost_round=2200,
#     fold_count=4,
#     plot=True,
#     verbose=False,
#     seed=RANDOM_STATE,
#     return_models=False,
# )


In [208]:
# NOTE(review): cv_result is only created by the cb.cv(...) call that is
# commented out in the CatBoost setup cell; re-enable it before running this.
# Both figures are averages over every row of cv_result (presumably one row
# per boosting iteration -- confirm), not the values of the final iteration.
test_mean = cv_result["test-AUC-mean"].mean()
test_std = cv_result["test-AUC-std"].mean()

print(f"Test AUC: {test_mean:.4f} +/- {test_std:.4f}")


Test AUC: 0.5919 +/- 0.0047


In [238]:
# Build a classifier from cb_params and train it on the training split,
# using the held-out split as the evaluation set.
clf = cb.to_classifier(cb.CatBoost(cb_params))
clf.fit(
    x_train, y_train, cat_features=cat_features, eval_set=(x_val, y_val), verbose=True
)
# AUC on the held-out split, from the predicted probability of class 1.
val_score = roc_auc_score(y_val, clf.predict_proba(x_val)[:, 1])

# File name carries the save time and the validation AUC scaled by 10000
# (e.g. 0.5932 -> "5932").
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model_filename = f"cb_{timestamp}_{val_score*10000:.0f}.cbm"
save_path = os.path.join(MODEL_SAVE_DIR, model_filename)

print(f"Validation AUC: {val_score:.4f}")

clf.save_model(save_path)


0:	test: 0.5721173	best: 0.5721173 (0)	total: 12.3ms	remaining: 27s
1:	test: 0.5735014	best: 0.5735014 (1)	total: 61.5ms	remaining: 1m 7s
2:	test: 0.5795324	best: 0.5795324 (2)	total: 117ms	remaining: 1m 25s
3:	test: 0.5815422	best: 0.5815422 (3)	total: 173ms	remaining: 1m 35s
4:	test: 0.5818580	best: 0.5818580 (4)	total: 224ms	remaining: 1m 38s
5:	test: 0.5840419	best: 0.5840419 (5)	total: 289ms	remaining: 1m 45s
6:	test: 0.5807070	best: 0.5840419 (5)	total: 347ms	remaining: 1m 48s
7:	test: 0.5784088	best: 0.5840419 (5)	total: 400ms	remaining: 1m 49s
8:	test: 0.5791546	best: 0.5840419 (5)	total: 453ms	remaining: 1m 50s
9:	test: 0.5785429	best: 0.5840419 (5)	total: 503ms	remaining: 1m 50s
10:	test: 0.5768570	best: 0.5840419 (5)	total: 560ms	remaining: 1m 51s
11:	test: 0.5779178	best: 0.5840419 (5)	total: 617ms	remaining: 1m 52s
12:	test: 0.5779183	best: 0.5840419 (5)	total: 674ms	remaining: 1m 53s
13:	test: 0.5790022	best: 0.5840419 (5)	total: 736ms	remaining: 1m 54s
14:	test: 0.579300

In [244]:
# Reload a previously saved model and wrap it as a classifier.
# NOTE(review): the file name is hard-coded to one specific run; update it
# when a different saved model should be used.
clf = cb.CatBoost()
clf.load_model(fname="./model_saves/cb_20231018-181012_5932.cbm")
clf = cb.to_classifier(clf)

# Probability of class 1 for every test row.
y_pred = clf.predict_proba(testset.x)[:, 1]

# Written as output_cb_20231018-181012.csv inside OUTPUT_DIR.
output(testset.id, y_pred, "cb_20231018-181012")

# Sanity check: AUC of the reloaded model on the held-out split.
print(f"Validation AUC: {roc_auc_score(y_val, clf.predict_proba(x_val)[:, 1]):.4f}")


Validation AUC: 0.5932


In [None]:
def xgb_training(x_train, y_train):
    """Tune an XGBoost classifier with successive halving and save the winner.

    Parameters
    ----------
    x_train, y_train : training features and binary labels.

    Returns
    -------
    The best estimator found by the search, refitted by the search object.
    The same model is also dumped with joblib to
    ``MODEL_SAVE_DIR/xgb_<timestamp>.joblib``.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    model_filename = f"xgb_{timestamp}.joblib"
    model_save_path = os.path.join(MODEL_SAVE_DIR, model_filename)

    # Fixed settings shared by every candidate.
    params = {
        "n_estimators": 300,
        "objective": "binary:logistic",
        "colsample_bytree": 0.3,
        "early_stopping_rounds": 10,
        "random_state": RANDOM_STATE,
    }

    # Values the search samples from.
    params_distribution = {
        "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
        "max_depth": [3, 5, 7, 9, 11],
        "alpha": [0, 1, 3, 5, 7, 9, 10],
    }

    # early_stopping_rounds only works when fit() receives an evaluation
    # set; without one XGBoost refuses to train. Hold out 20% of the data
    # for that purpose and search on the rest.
    x_fit, x_eval, y_fit, y_eval = train_test_split(
        x_train, y_train, test_size=0.2, random_state=RANDOM_STATE
    )

    model = xgb.XGBClassifier(**params)

    cv = HalvingRandomSearchCV(
        model,
        params_distribution,
        factor=3,
        n_jobs=1,
        scoring="roc_auc",
        random_state=RANDOM_STATE,
    )
    # Extra keyword arguments are forwarded to XGBClassifier.fit for every
    # candidate and for the final refit.
    cv.fit(x_fit, y_fit, eval_set=[(x_eval, y_eval)], verbose=False)

    print(f"Best validation score: {cv.best_score_:.4f}")

    model = cv.best_estimator_
    joblib.dump(model, model_save_path)

    return model


In [None]:
# Feature switches for this run: everything on except the title.
feature_params = FeatureParams(do_get_title=False)

# NOTE(review): get_features is not defined in the visible part of the
# notebook; it looks like it was superseded by FeaturePreprocessor -- confirm
# and switch this cell over (fit on the training texts, transform the test
# texts) so both sets share one vocabulary.
dataset, testset = input(stream=False)
dataset.x = get_features(feature_params, dataset.x)
# Replace the label value -1 with 0.
dataset.y = dataset.y.replace(-1, 0)
testset.x = get_features(feature_params, testset.x)

# Train on plain arrays rather than pandas objects.
xgb_model = xgb_training(dataset.x.values, dataset.y.values)


### LightGBM
