# DataLab Cup 1
[Kaggle](https://www.kaggle.com/competitions/2023-datalab-cup1-predicting-news-popularity/data)

In [1]:
# Mount Google Drive at /content/drive (this import only exists on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import re

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from pylab import *
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelBinarizer
from sklearn.linear_model import Ridge, Lasso, LinearRegression, LogisticRegression, Perceptron, SGDClassifier
from sklearn.feature_selection import RFE, SelectKBest
from sklearn.metrics import mean_squared_error as MSE, r2_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline

import warnings
# Silence every Python warning from this point on in the session.
warnings.filterwarnings("ignore")

from bs4 import BeautifulSoup as BSoup

# Download the NLTK stop-word corpus and keep the English list in `stopWords`.
nltk.download('stopwords')
stopWords = stopwords.words('english')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Folder on the mounted Google Drive that holds the competition CSV files.
# Kept in one place so the location only has to be changed once.
DATA_DIR = '/content/drive/MyDrive/deep learn/comp1'

train_R = pd.read_csv(f'{DATA_DIR}/train.csv')
test_R = pd.read_csv(f'{DATA_DIR}/test.csv')

In [4]:
# Number of leading rows kept for training; every later row is held out
# for validation.
TRAIN_SIZE = 8000

# This cell rebinds train_R to its own first TRAIN_SIZE rows. Without the
# guard, running the cell a second time would slice the already truncated
# frame and silently replace valid_R with an empty frame. The split is
# therefore only performed on the first run, or after the data was reloaded.
if 'valid_R' not in globals() or len(train_R) > TRAIN_SIZE:
    valid_R = train_R[TRAIN_SIZE:]
    train_R = train_R[:TRAIN_SIZE]

In [5]:
# Quick sanity check of the loaded data: shapes first, then the leading rows.
print(train_R.shape, test_R.shape, sep='\n')
print(train_R.head(), test_R.head(), sep='\n')
print(train_R.iloc[:5])

(8000, 3)
(11847, 2)
   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...
      Id                                       Page content
0  27643  <html><head><div class="article-info"><span cl...
1  27644  <html><head><div class="article-info"><span cl...
2  27645  <html><head><div class="article-info"><span cl...
3  27646  <html><head><div class="article-info"><span cl...
4  27647  <html><head><div class="article-info"><span cl...
   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl..

In [6]:
# Show the raw HTML of the first three training pages.
print(train_R['Page content'].head(3))

0    <html><head><div class="article-info"> <span c...
1    <html><head><div class="article-info"><span cl...
2    <html><head><div class="article-info"><span cl...
Name: Page content, dtype: object


In [None]:
# Feature ideas per page (V = done):
# - HTML tags: V
# - emoticons: V
# - link / source site, e.g. Space.com
# - date, e.g. 2013-06-19
# - time, e.g. 15:04:30 UTC
# - topics, e.g. "Topics: Asteroid, Asteroids, challenge, Earth, Space, U.S., World"
# - lengths

def preprocess(text):
    """Turn a raw HTML page into a lower-cased, space-separated string.

    Parameters
    ----------
    text : str
        Raw HTML of one page.

    Returns
    -------
    str
        The page text with markup removed, lower-cased, and every run of
        non-word characters collapsed to a single space, followed by one
        space and the emoticons found in the text (space-separated, with
        any '-' removed).
    """
    # Keep only the text nodes of the document.
    text = BSoup(text, 'html.parser').get_text()

    # Raw string literal: the pattern contains backslash escapes (`\)`, `\(`)
    # that are invalid escape sequences in a normal string literal.
    # NOTE(review): the pattern is case-sensitive and not anchored to word
    # boundaries, so letter pairs such as "XP" or "XD" inside upper-case words
    # are also collected as emoticons and cut out of the text.
    reg = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emo = re.findall(reg, text)
    text = re.sub(reg, '', text)

    # Emoticons are appended after the cleaned text so they survive the
    # non-word-character stripping applied to the rest of the page.
    text = re.sub(r'[\W]+', " ", text.lower()) + " " + ' '.join(emo).replace('-', '')
    return text

def tokenizer(text):
    """Split ``text`` on runs of whitespace.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        The whitespace-separated tokens. Empty or whitespace-only input
        yields an empty list rather than a list holding one empty string.
    """
    # str.split() with no separator splits on whitespace runs and ignores
    # leading/trailing whitespace, so no regex or explicit strip is needed.
    return text.split()

def tokenizer_stem_noStopWords(text):
    """Split ``text`` on whitespace, drop stop words, and Porter-stem the rest.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Porter stems of the tokens that are not in ``stopWords`` and that
        begin with an ASCII letter.
    """
    porter = PorterStemmer()
    # stopWords is a list; a set gives constant-time membership tests
    # instead of a linear scan for every token.
    stop_set = set(stopWords)
    # re.match anchors at the start only, so this keeps tokens whose first
    # character is an ASCII letter.
    # NOTE(review): tokens that do not start with a letter, such as ":)",
    # are dropped by this filter.
    starts_with_letter = re.compile(r'[a-zA-Z]+').match
    return [porter.stem(w) for w in text.split()
            if w not in stop_set and starts_with_letter(w)]

In [None]:
# Tokens produced for the first training page after preprocessing.
print(tokenizer(preprocess(train_R['Page content'].iloc[0])))

['clara', 'moskowitz', 'for', 'space', 'com', '2013', '06', '19', '15', '04', '30', 'utc', 'nasa', 's', 'grand', 'challenge', 'stop', 'asteroids', 'from', 'destroying', 'earth', 'there', 'may', 'be', 'killer', 'asteroids', 'headed', 'for', 'earth', 'and', 'nasa', 'has', 'decided', 'to', 'do', 'something', 'about', 'it', 'the', 'space', 'agency', 'announced', 'a', 'new', 'grand', 'challenge', 'on', 'june', '18', 'to', 'find', 'all', 'dangerous', 'space', 'rocks', 'and', 'figure', 'out', 'how', 'to', 'stop', 'them', 'from', 'destroying', 'our', 'planet', 'the', 'new', 'mission', 'builds', 'on', 'projects', 'already', 'underway', 'at', 'nasa', 'including', 'a', 'plan', 'to', 'capture', 'an', 'asteroid', 'pull', 'it', 'in', 'toward', 'the', 'moon', 'and', 'send', 'astronauts', 'to', 'visit', 'it', 'as', 'part', 'of', 'the', 'grand', 'challenge', 'the', 'agency', 'issued', 'a', 'request', 'for', 'information', 'today', 'aiming', 'to', 'solicit', 'ideas', 'from', 'industry', 'academia', 'and

In [None]:
# train0 = train_R.to_numpy()
# test0 = test_R.to_numpy()

In [None]:
# print(train0[:5])

In [None]:
# First 50 rows of each split, for quick experiments.
Z_train, Z_valid, Z_test = (
    frame.iloc[:50] for frame in (train_R, valid_R, test_R)
)

In [None]:
# Baseline classifier: TF-IDF over stemmed, stop-word-free unigrams feeding
# a logistic regression.
pipe1 = Pipeline([
    ('vect', TfidfVectorizer(ngram_range=(1, 1),
                             preprocessor=preprocess,
                             tokenizer=tokenizer_stem_noStopWords)),
    ('clf', LogisticRegression(solver='liblinear')),
])

# scores = cross_val_score(estimator=pipe1, X=Z_train['Page content'], y=Z_train['Popularity'],
#                          cv=10, scoring='roc_auc')

pipe1.fit(train_R['Page content'], train_R["Popularity"])

# Predict once and compute accuracy from those predictions. Calling
# pipe1.score() as well would push the whole validation set through the
# HTML-parsing and stemming vectoriser a second time for the same number.
valid_R_pred = pipe1.predict(valid_R['Page content'])
scores = float((valid_R_pred == valid_R["Popularity"].to_numpy()).mean())
print(valid_R_pred[:10], '---', valid_R.iloc[:10, 1])


print(scores)
# print(scores.mean())
# print(scores.std())

[-1  1  1 -1  1 -1 -1 -1 -1  1] --- 8000   -1
8001   -1
8002   -1
8003    1
8004   -1
8005    1
8006   -1
8007   -1
8008   -1
8009   -1
Name: Popularity, dtype: int64
0.5248689100442906


### Result

Predict labels for the test data and save them as a CSV file.

In [None]:
# Class labels predicted by the fitted pipeline for every test page.
# NOTE(review): these are hard labels; if the competition is scored on
# ROC AUC, predicted probabilities may be the better submission - confirm.
test_R_pred = pipe1.predict(test_R.loc[:, 'Page content'])


In [None]:
# First ten predictions next to the first ten test rows.
print(test_R_pred[:10], '---\n', test_R.head(10))

In [None]:
# Submission table with columns in the order Id, Popularity.
# Both columns are passed as plain arrays so they are paired by position:
# assigning the `Id` Series to an existing frame aligns on the index and
# would produce NaN ids if test_R ever carried a non-default index.
df = pd.DataFrame({
    "Id": test_R["Id"].to_numpy(),
    "Popularity": test_R_pred,
})
df.head()

Unnamed: 0,Id,Popularity
0,27643,1
1,27644,1
2,27645,-1
3,27646,-1
4,27647,-1


In [None]:
# Write the submission table to the working directory, without the row index.
df.to_csv(path_or_buf='./test_pred.csv', index=False)