# DataLab Cup 1 
[Kaggle](https://www.kaggle.com/competitions/2023-datalab-cup1-predicting-news-popularity/data)

## Define Imports

In [41]:
import warnings
# NOTE(review): called without a category or module filter, so this hides every
# warning (deprecations included) until the warning filters are changed again.
warnings.filterwarnings('ignore')

In [42]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
import re
from sklearn.preprocessing import MinMaxScaler
from bs4 import BeautifulSoup


## Create Output Folder

In [43]:
# Make sure ./output exists before later cells write files into it;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs("./output", exist_ok=True)

## Load and Inspect the Data

In [44]:
# Read the training set and show its first five rows (head() defaults to 5) as plain text.
df = pd.read_csv('./dataset/train.csv')
print(df.head())

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [45]:
# (number of rows, number of columns) of the training frame
df.shape

(27643, 3)

In [46]:
# Class balance: how many rows carry each distinct Popularity label.
unique, counts = np.unique(df['Popularity'].to_numpy(), return_counts=True)
{label: n_rows for label, n_rows in zip(unique, counts)}

{-1: 14011, 1: 13632}

In [47]:
def preprocessor(text):
    """Strip HTML from `text` and reduce it to lowercase word tokens plus emoticons.

    Steps:
      1. Parse `text` with BeautifulSoup ('html.parser') and keep only its text.
      2. Collect emoticons such as :), :-P or ;D and remove them from the text,
         so they are not destroyed by the punctuation clean-up.
      3. Lowercase what is left and collapse each run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the '-' nose removed), joined by
         spaces, after the cleaned text.

    Parameters
    ----------
    text : str
        Raw HTML markup.

    Returns
    -------
    str
        The cleaned text, one space, then the space-joined emoticons (an empty
        string when none were found).
    """
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Raw string literals: the backslashes belong to the regex engine. In a normal
    # string they are invalid escape sequences, which Python warns about.
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)

    # Emoticons keep their original case; only the remaining text is lowercased.
    words = re.sub(r'[\W]+', ' ', text.lower())
    return words + ' ' + ' '.join(emoticons).replace('-', '')

# Sanity check: clean the first article, then show it before and after cleaning.
processed_txt = preprocessor(df['Page content'].iloc[0])
print('Before:', df['Page content'].iloc[0])
print('After:', processed_txt)

Before: <html><head><div class="article-info"> <span class="byline basic">Clara Moskowitz</span> for <a href="/publishers/space-com/">Space.com</a> <time datetime="Wed, 19 Jun 2013 15:04:30 +0000">2013-06-19 15:04:30 UTC</time> </div></head><body><h1 class="title">NASA's Grand Challenge: Stop Asteroids From Destroying Earth</h1><figure class="article-image"><img class="microcontent" data-fragment="lead-image" data-image="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg" data-micro="1" data-url="http://mashable.com/2013/06/19/nasa-grand-challenge-asteroid/" src="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg"/></figure><article data-channel="world"><section class="article-content"> <p>There may be killer asteroids headed for Earth, and NASA has decided to do something about it. The space agency announced a new "Grand Challenge" on June 18 to find all dangerous space rocks and figure o

Write the prettified HTML of a few articles to `./output` to check what the data looks like.

In [48]:
# Dump the prettified HTML of row 0 for manual inspection.
# Explicit UTF-8: without it open() uses the platform's default encoding, which
# may be unable to encode non-ASCII characters in the article text.
with open("./output/out.txt", 'w', encoding='utf-8') as f:
    print(BeautifulSoup(df.iloc[0]['Page content'], 'html.parser').prettify(), file=f)

In [49]:
# Dump the prettified HTML of row 1 for manual inspection.
# Explicit UTF-8: without it open() uses the platform's default encoding, which
# may be unable to encode non-ASCII characters in the article text.
with open("./output/out2.txt", 'w', encoding='utf-8') as f:
    print(BeautifulSoup(df.iloc[1]['Page content'], 'html.parser').prettify(), file=f)

In [50]:
# Dump the prettified HTML of row 300 for manual inspection.
# Explicit UTF-8: without it open() uses the platform's default encoding, which
# may be unable to encode non-ASCII characters in the article text.
with open("./output/out3.txt", 'w', encoding='utf-8') as f:
    print(BeautifulSoup(df.iloc[300]['Page content'], 'html.parser').prettify(), file=f)

# Preprocess Data

## Split X and y data

In [51]:
# Pull the raw HTML (X) and the labels (y) out of the frame as single-column 2-D arrays.
X = df['Page content'].to_numpy().reshape(-1, 1)
y = df['Popularity'].to_numpy().reshape(-1, 1)
(X.shape, y.shape)

((27643, 1), (27643, 1))

## Rescale y labels from {-1, 1} to {0, 1}
Useful for a logistic regression model, which expects the labels to be in {0, 1}.

In [52]:
# Min-max rescale y column-wise from its observed range onto [y_lo, y_hi].
# The bounds are deliberately not named `min` / `max`: rebinding those names at
# notebook level would replace the builtins for every later cell.
y_lo, y_hi = 0, 1
y_std = (y - y.min(axis=0)) / (y.max(axis=0) - y.min(axis=0))
y = y_scaled = y_std * (y_hi - y_lo) + y_lo
y

array([[0.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [1.]])

## Feature Extraction from X 