In [16]:
import pandas as pd
import newspaper
from newspaper import Article
from datetime import datetime
import time
import requests
from urllib.parse import urljoin, urlparse
import logging
import os
import glob


In [25]:
# Read the initial table of article URLs from the Excel workbook.
input_path = "df_init.xlsx"
df = pd.read_excel(input_path)

# Show (rows, columns) as the cell output.
df.shape


(16, 3)

In [21]:
# Fetch the NLTK 'punkt' tokenizer data; the recorded output shows NLTK
# reporting the package as already up to date.
# NOTE(review): presumably needed by newspaper's NLP helpers, which the visible
# cells never call — confirm before removing.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\panla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:

## spacy test
# Smoke test: load the spaCy English model and check its sentence splitting
# on a two-sentence sample.

import spacy

nlp = spacy.load("en_core_web_sm")
text = "Hello world. How are you today?"
doc = nlp(text)

# Keep only the raw text of each detected sentence span.
sentences = list(map(lambda span: span.text, doc.sents))

print(sentences)


['Hello world.', 'How are you today?']


In [None]:
from collections import Counter

# Load the spaCy "en_core_web_sm" pipeline into the module-level `nlp` object
# that `extract_keywords_spacy` calls.
# NOTE(review): `spacy` is imported only in the earlier "spacy test" cell, which
# already loads this same model — this cell depends on that cell having run.
nlp = spacy.load("en_core_web_sm")

# Function for extracting simple keywords via spaCy (most frequent tokens without stopwords)
def extract_keywords_spacy(text, top_n=10):
    """Return the most frequent alphabetic, non-stopword tokens of ``text``.

    The text is lower-cased before it is passed to the module-level ``nlp``
    pipeline, so the returned keywords are lower-case.

    Args:
        text: Raw article text. Anything that is not a non-blank string
            (``None``, NaN, ``""``, whitespace only) yields an empty list
            instead of raising.
        top_n: Maximum number of keywords to return.

    Returns:
        A list of at most ``top_n`` keyword strings, most frequent first;
        tokens with equal counts keep their first-seen order.
    """
    # Guard first: a missing value has no ``.lower()`` and blank text has no
    # keywords, so skip the spaCy pipeline entirely in both cases.
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text.lower())
    words = [token.text for token in doc if token.is_alpha and not token.is_stop]
    return [word for word, _ in Counter(words).most_common(top_n)]


## Create columns
# Pre-create every output column (filled with None) so the scraping loop can
# assign into them row by row; the order below is the resulting column order.
for new_col in (
    'title',
    'publish_date',
    'authors',
    'scraped_at',
    'keywords',
    'text',
    'text_length',
):
    df[new_col] = None

# Scraping loop
# For every row: download and parse the article at row['url'], then write the
# extracted fields into that row of `df`. Failures are reported and skipped.

# User-Agent header sent with every request (loop-invariant, so defined once).
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'

for index, row in df.iterrows():
    url = row['url']

    try:
        # Build the Article inside the try block: the constructor itself can
        # raise on a missing or malformed URL, and one bad row must not abort
        # the whole run.
        article = Article(
            url=url,
            language='en',
            browser_user_agent=user_agent
        )
        article.download()
        article.parse()

        df.loc[index, 'title'] = article.title
        # publish_date may be absent; store it as a formatted string or None.
        df.loc[index, 'publish_date'] = article.publish_date.strftime('%Y-%m-%d %H:%M:%S') if article.publish_date else None
        df.loc[index, 'authors'] = ', '.join(article.authors) if article.authors else None
        df.loc[index, 'scraped_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        df.loc[index, 'text'] = article.text
        df.loc[index, 'text_length'] = len(article.text)

        # Extract keywords with spaCy (only when some text was recovered)
        if article.text:
            keywords = extract_keywords_spacy(article.text, top_n=10)
            df.loc[index, 'keywords'] = ', '.join(keywords)
        else:
            df.loc[index, 'keywords'] = None

    except Exception as e:
        # Deliberate best-effort: report the failing row and move on to the next.
        print(f"⚠️ Unsuccessful scraping at index {index} : {e}")

# Save
# Write the scraped frame (without the index) to an Excel workbook under
# ./output, creating the directory on demand.
output_path = 'output/articles_scraped.xlsx'
os.makedirs('output', exist_ok=True)
df.to_excel(excel_writer=output_path, index=False)
print(f"✅ Results saved in {output_path}")


In [29]:
# Remove the "date_published" and "headline" columns from the frame.
# errors="ignore" keeps this cell idempotent: re-running it once the columns
# are already gone would otherwise raise KeyError.
df = df.drop(columns=["date_published", "headline"], errors="ignore")

In [31]:
# Preview the first rows of the scraped frame as the cell output.
df.head()

Unnamed: 0,url,title,publish_date,authors,scraped_at,keywords,text,text_length
0,https://retailwire.com/discussion/early-summer...,Is Early Summer Becoming a Bigger Retail Sales...,2025-07-03 13:36:50,"Craig Sundstrom, Mohamed Amer, Alex Walderman,...",2025-07-05 12:33:35,"summer, july, prices, dick, announced, retaile...","July 3, 2025\n\nRetailers ranging from Dollar ...",2826
1,https://retailwire.com/discussion/trader-joes-...,Should Trader Joe’s Open Stores Next To Each O...,2025-07-01 16:30:00,"Craig Sundstrom, Pamela Kaplan, Kai Clarke, Mo...",2025-07-05 12:33:36,"location, trader, joe, store, new, opened, str...","July 1, 2025\n\nTrader Joe’s has opened a stor...",2344
2,https://retailwire.com/discussion/how-retailer...,How Can Retailers Best Attract a Growing Cohor...,2025-07-01 16:00:00,"Craig Sundstrom, Mohamed Amer, Frank Margolis,...",2025-07-05 12:33:36,"value, brands, consumers, deloitte, quality, i...","July 1, 2025\n\nWhen it comes to the concept o...",3624
3,https://www.retaildive.com/news/amazon-stands-...,Customers are ditching Shein and Temu. Can Ama...,2025-07-01 09:00:00,Laurel Deppen,2025-07-05 12:33:38,"temu, shein, consumer, said, amazon, spending,...",Listen to the article 6 min This audio is auto...,5662
4,https://www.retaildive.com/news/retail-trends-...,8 retail trends to watch in 2025,2025-01-07 14:00:00,Retail Dive Staff,2025-07-05 12:33:40,"retail, retailers, new, year, continue, said, ...",Listen to the article 15 min This audio is aut...,13522
