# Competition 1 Predicting News Popularity


## Load Package


In [4]:
import datetime
import os
import re
from dataclasses import dataclass

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


In [5]:
# Notebook-wide constants
RANDOM_STATE = 42  # seed passed to train_test_split in input() so the split is reproducible
INPUT_DIR = "./input/"  # directory input() reads train.csv and test.csv from
OUTPUT_DIR = "./output/"  # directory output() writes the prediction CSV into

# Fetch the NLTK stopwords corpus, then load the English list.
# STOP is used by tokenizer_stem_nostop to filter tokens.
nltk.download("stopwords")
STOP = stopwords.words("english")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/huangmorris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Input & Output module


In [32]:
def input():
    """
    Load the input CSV files from `INPUT_DIR` and return them as a Dataset.

    `train.csv` is split 80/20 into training and validation parts (seeded with
    `RANDOM_STATE`); `test.csv` supplies the test features and ids.

    Note: this function's name shadows the builtin `input` in this module.

    Returns:
        Dataset: object holding the train/val/test pieces.
    """

    @dataclass
    class Dataset:
        x_train: pd.DataFrame
        x_val: pd.DataFrame
        y_train: pd.DataFrame
        y_val: pd.DataFrame
        x_test: pd.DataFrame
        id_test: pd.Series

    def as_frame(series):
        # reset_index() without drop moves the original row labels into an
        # "index" column next to the data column.
        return pd.DataFrame(series.reset_index())

    train_df = pd.read_csv(os.path.join(INPUT_DIR, "train.csv"))
    test_df = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"))

    # Hold out 20% of the training rows for validation.
    x_train, x_val, y_train, y_val = train_test_split(
        train_df["Page content"],
        train_df["Popularity"],
        test_size=0.2,
        random_state=RANDOM_STATE,
    )

    x_test = pd.DataFrame(test_df["Page content"])
    id_test = test_df["Id"]

    return Dataset(
        x_train=as_frame(x_train),
        x_val=as_frame(x_val),
        y_train=as_frame(y_train),
        y_val=as_frame(y_val),
        x_test=x_test,
        id_test=id_test,
    )


def output(id_test: pd.Series, y_pred: np.ndarray):
    """
    Write the predictions to a timestamped CSV file inside `OUTPUT_DIR`.

    The file is named ``output_<YYYYmmdd-HHMMSS>.csv`` from the current local
    time and has two columns, ``Id`` and ``Popularity``. `OUTPUT_DIR` is
    created if it does not exist yet.

    Args:
        id_test: Pandas Series holding the ids of the test data.
        y_pred: NumPy array holding the predictions (0d/1d).

    Returns:
        None
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    output_filename = f"output_{timestamp}.csv"
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    # to_csv does not create missing directories, so make sure the target
    # folder exists instead of failing on a fresh checkout.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Series.ravel() is deprecated in recent pandas; going through NumPy gives
    # the same flat id array and also keeps ndarray inputs working.
    ids = np.asarray(id_test).ravel()

    output_df = pd.DataFrame({"Id": ids, "Popularity": y_pred})
    output_df.to_csv(output_path, index=False)


## Feature Engineering


### Data cleaning

目前基本上是教學 Notebook 中的內容。


In [7]:
def text_cleaner(text: str):
    """
    Clean raw text: strip HTML tags, lowercase it, collapse non-word
    characters into spaces, and move emoticons to the end of the string.

    Args:
        text: str, the raw text to clean.

    Returns:
        str, the cleaned text followed by a space and the emoticons found
        (space-separated, with any "-" nose removed).
    """

    # Matches emoticons such as :), :-P, :-D
    emoticon_pattern = r"(?::|;|=|X)(?:-)?(?:\)|\(|D|P)"

    # Drop the HTML markup and keep only the text content.
    plain = BeautifulSoup(text, "html.parser").text

    # Pull the emoticons out first so the non-word cleanup below cannot
    # destroy them.
    found = re.findall(emoticon_pattern, plain)
    without_emoticons = re.sub(emoticon_pattern, "", plain)

    # Lowercase, then turn every run of non-word characters into one space.
    words = re.sub(r"[\W]+", " ", without_emoticons.lower())

    # Re-attach the emoticons at the end, without their "-" noses.
    noseless = " ".join(found).replace("-", "")
    return words + " " + noseless


def tokenizer_stem_nostop(text):
    """
    Tokenize the text, drop stopwords, and stem the remaining tokens.

    Args:
        text: str, the text to tokenize and stem.

    Returns:
        list, the Porter stems of the kept tokens, in token order.
    """

    porter = PorterStemmer()
    # STOP is a list, so `w not in STOP` scans it for every token. Build a set
    # once per call so each membership test is a constant-time lookup.
    stop_words = set(STOP)
    return [
        porter.stem(w)
        for w in word_tokenize(text, preserve_line=True)
        # re.match anchors at the start of the token, so this keeps tokens
        # that begin with an ASCII letter.
        if w not in stop_words and re.match("[a-zA-Z]+", w)
    ]


### BoW (Bag-of-Words)


In [33]:
# Load the train/validation/test data via this notebook's input() helper
# (the function defined above, not the builtin input).
dataset = input()


In [None]:
# Bag-of-words vectorizer over unigrams: each document is first passed through
# text_cleaner, then split into stems by tokenizer_stem_nostop.
count = CountVectorizer(
    ngram_range=(1, 1), preprocessor=text_cleaner, tokenizer=tokenizer_stem_nostop
)
# Learn the vocabulary from the training documents and build their count matrix.
doc_bag = count.fit_transform(dataset.x_train["Page content"].values)
# Convert to a dense array so the notebook displays it.
# NOTE(review): this materializes the full documents-by-vocabulary array — confirm it fits in memory.
doc_bag.toarray()
