# Competition 1 Predicting News Popularity


## Load Package


In [7]:
import datetime
import os
import re
from dataclasses import dataclass

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDRegressor

import lightgbm as lgb


In [9]:
# Some CONSTANTS
RANDOM_STATE = 42
INPUT_DIR = "./input/"
OUTPUT_DIR = "./output/"
MODEL_SAVE_DIR = "./model_saves/"

# Create the working directories. ``exist_ok=True`` makes each call a no-op
# when the directory is already present and avoids the check-then-create race
# of a separate ``os.path.exists`` test.
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# Download the NLTK stop-word corpus and load the English stop words
nltk.download("stopwords")
STOP = stopwords.words("english")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/huangmorris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Input & Output module


In [3]:
def input(chunksize: int = 1000, val_size: float = 0.2):
    """
    Read the input data and return a training stream plus the test set.

    NOTE: this function shadows the built-in ``input``; the name is kept
    because other cells call it.

    Args:
        chunksize: Number of CSV rows read per chunk. Defaults to 1000.
        val_size: Fraction of each chunk held out for validation.
            Defaults to 0.2.

    Returns:
        stream: Generator used for out-of-core learning. Each item is a
            ``TrainValSet`` holding the train/validation split of one chunk
            of ``train.csv``.
        testset: ``TestSet`` dataclass with the test features (``x``) and
            ids (``id``) read from ``test.csv``.
    """

    train_path = os.path.join(INPUT_DIR, "train.csv")
    test_path = os.path.join(INPUT_DIR, "test.csv")

    # Stream generator used for out-of-core learning
    def get_stream(train_path, chunksize, val_size=0.2):
        @dataclass
        class TrainValSet:
            # ``reset_index()`` below is called without ``drop=True``, so every
            # field is a DataFrame: an "index" column holding the original row
            # labels plus the data column ("Page content" or "Popularity").
            x_train: pd.DataFrame
            x_val: pd.DataFrame
            y_train: pd.DataFrame
            y_val: pd.DataFrame

        # Split every chunk into train/validation according to ``val_size``
        for chunk in pd.read_csv(train_path, chunksize=chunksize):
            train_val = train_test_split(
                chunk["Page content"],
                chunk["Popularity"],
                test_size=val_size,
                random_state=RANDOM_STATE,
            )

            # Reset the index of x_train, x_val, y_train, y_val so that each
            # part is labelled 0..n-1 (the old labels move to an "index" column)
            train_val = [part.reset_index() for part in train_val]
            yield TrainValSet(*train_val)

    df_test = pd.read_csv(test_path)
    x_test = df_test["Page content"]
    id_test = df_test["Id"]

    @dataclass
    class TestSet:
        x: pd.Series
        id: pd.Series

    return get_stream(train_path, chunksize, val_size), TestSet(x_test, id_test)


def output(id_test: pd.Series, y_pred: np.ndarray, output_dir=None):
    """
    Write the predictions to a CSV file named after the current time.

    The file is called ``output_<YYYYmmdd-HHMMSS>.csv`` and has two columns,
    ``Id`` and ``Popularity``.

    Args:
        id_test: Pandas Series holding the ids of the test data.
        y_pred: NumPy array holding the predictions.
        output_dir: Directory to write into. ``None`` (the default) means
            the module-level ``OUTPUT_DIR``.

    Returns:
        None
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    # Make sure the target directory exists before writing into it
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    output_filename = f"output_{timestamp}.csv"
    output_path = os.path.join(output_dir, output_filename)

    # ``Series.ravel`` is deprecated in recent pandas releases, so flatten
    # the ids through NumPy instead.
    ids = np.asarray(id_test).ravel()

    output_df = pd.DataFrame({"Id": ids, "Popularity": y_pred})
    output_df.to_csv(output_path, index=False)


## Feature Engineering


### Data cleaning

目前基本上是教學 Notebook 中的內容。


In [4]:
def text_cleaner(text: str):
    """
    Clean a raw document: strip HTML tags, lowercase the text and move any
    emoticons to the end of the string.

    Args:
        text: str, the raw text to clean.

    Returns:
        str, the cleaned text followed by a space and the emoticons found.
    """
    # Pattern matching emoticons such as :), :-P, :-D
    emoticon_pattern = r"(?::|;|=|X)(?:-)?(?:\)|\(|D|P)"

    # Drop the HTML tags, keeping only the text content
    plain = BeautifulSoup(text, "html.parser").text

    # Collect the emoticons first, then cut them out of the running text
    found = re.findall(emoticon_pattern, plain)
    without_emoticons = re.sub(emoticon_pattern, "", plain)

    # Lowercase and collapse every run of non-word characters into one space
    normalized = re.sub(r"[\W]+", " ", without_emoticons.lower())

    # Emoticons are appended space-separated, with the "-" nose removed
    emoticon_suffix = " ".join(found).replace("-", "")
    return normalized + " " + emoticon_suffix


def tokenizer_stem_nostop(text):
    """
    Tokenize the text and reduce every kept token to its Porter stem.

    Args:
        text: str, the text to tokenize and stem.

    Returns:
        list, the stems of the tokens that are not stop words and that start
        with an ASCII letter.
    """
    stemmer = PorterStemmer()
    stems = []
    for token in word_tokenize(text, preserve_line=True):
        # Stop words are dropped
        if token in STOP:
            continue
        # So is anything that does not begin with a letter
        if not re.match("[a-zA-Z]+", token):
            continue
        stems.append(stemmer.stem(token))
    return stems


### 調用函式

用於更輕鬆的調用之後新增的比如：加新特徵或其他前處理的 Function。

還有很多要調整。


In [19]:
@dataclass
class FeatureParams:
    """Settings passed to ``get_features``."""

    # Number of hashed feature columns (2**10 by default)
    hash_feature_num: int = 1024


def get_features(feature_params: FeatureParams, texts: pd.Series):
    """
    Run feature engineering on the texts and return the result as a DataFrame.

    At present the only features are the hashed text features, stored in
    columns named ``v0``, ``v1``, ...

    Args:
        feature_params: FeatureParams, the feature settings.
        texts: pd.Series, the text data.

    Returns:
        pd.DataFrame
    """
    n_columns = feature_params.hash_feature_num

    # Basic HashingVectorizer transform
    vectorizer = HashingVectorizer(
        n_features=n_columns,
        preprocessor=text_cleaner,
        tokenizer=tokenizer_stem_nostop,
    )
    hashed = vectorizer.fit_transform(texts)

    column_names = [f"v{i}" for i in range(n_columns)]
    df = pd.DataFrame(hashed.toarray(), columns=column_names)

    # TODO: add further feature engineering

    return df


## Out-of-Core learning


In [20]:
# Open the training stream (the test set is loaded up front) and take the
# first chunk from it.
stream, testset = input()
dataset = next(stream)

feature_params = FeatureParams(hash_feature_num=1 << 10)

# Only the rows labelled 0..10 are featurized (``.loc`` slicing is inclusive)
features_train = get_features(
    feature_params,
    dataset.x_train.loc[:10, "Page content"],
)
features_val = get_features(
    feature_params,
    dataset.x_val.loc[:10, "Page content"],
)




In [10]:
# Path of the file the trained LightGBM model is saved to
save_path = os.path.join(MODEL_SAVE_DIR, "model.txt")


In [8]:
# Train a LightGBM classifier on the hashed features and save the booster.
model = lgb.sklearn.LGBMClassifier()
model.fit(
    # ``features_train`` is already a dense DataFrame (columns v0, v1, ...);
    # it has no "vectorized_features" column and needs no ``.toarray()``.
    features_train,
    # Labels of the same rows (labels 0..10) that were featurized.
    dataset.y_train.loc[:10, "Popularity"],
)

model.booster_.save_model(save_path)


lightgbm.sklearn.LGBMClassifier