In [3]:
import csv
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime

## Scraper

In [16]:
# Keywords of interest for flagging articles.
# contains_keywords() lower-cases both the keyword and the article text before
# matching, so the capitalisation used here (e.g. "Asian") makes no difference.
# NOTE(review): none of the visible cells passes this list to
# contains_keywords(); confirm whether keyword filtering happens elsewhere.
keywords = [
    "Asian", "hispanic", "latinx", "diversity", "equity", "inclusion", "equality", 
    "poc", "lgbtqia", "queer", "middle eastern", "black", "gay", "transgender", 
    "bisexual", "lesbian", "pansexual", "asexual", "homosexual", "nonbinary", 
    "intersex", "aromantic", "cisgender", "coming out", "genderfluid", "privilege", 
    "pronouns", "undocumented", "neurodiversity", "neurodivergent", "disability", 
    "microaggression", "identity", "discrimination", "culture", "misgendering"
]

# Function to fetch the HTML content of a URL
def fetch_html(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get() can block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        The decoded response body, or None when the request fails (network
        error, timeout, or an HTTP error status). Failures are reported with
        print() instead of being raised so callers can skip the page.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx statuses into an exception
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch URL {url}: {e}")
        return None

# Function to extract article URLs from a page
def extract_article_urls(page_html):
    """Collect the article links listed on one category page.

    Args:
        page_html: HTML source of a category listing page.

    Returns:
        A list of unique article URLs in the order they appear on the page.
        Each URL is the href of the first <a> inside a
        <div class="category-page-post-text">. Divs without a link, or whose
        first link has no href, are skipped instead of raising KeyError.
    """
    soup = BeautifulSoup(page_html, 'html.parser')
    urls = []
    for post in soup.find_all('div', class_='category-page-post-text'):
        link = post.find('a')  # look the anchor up once instead of twice
        href = link.get('href') if link else None
        if href:
            urls.append(href)
    # dict.fromkeys() drops duplicates while keeping first-seen order, so the
    # result is identical on every run (set() ordering is not).
    return list(dict.fromkeys(urls))

def clean_content(content):
    """Trim the article footer and repair mis-decoded punctuation.

    Everything from the first "Print Comments are closed." onwards is dropped.
    The garbled sequences for opening quotes, apostrophes and closing quotes
    are then swapped for plain ASCII quotes, and "Â" characters are removed.
    """
    footer_start = content.find("Print Comments are closed.")
    if footer_start != -1:
        content = content[:footer_start]

    # Order matters: "â€" is a prefix of the two longer sequences, so those
    # have to be handled before it.
    substitutions = (
        ("â€œ", "\""),
        ("â€™", "\'"),
        ("â€", "\""),
        ("Â", ""),
    )
    for garbled, plain in substitutions:
        content = content.replace(garbled, plain)
    return content

# Function to extract article content
def extract_article_content(article_html):
    """Pull the title, byline fields and body text out of an article page.

    Returns:
        A (title, date, time, authors, content) tuple. title is the stripped
        text of the page's <title> tag; date, time and authors come from
        parse_date_info() applied to the "single-post-byline" div; content is
        the text of every <p> joined with spaces and passed through
        clean_content(). A missing title or byline element is replaced by the
        'No title found' / 'No date found' placeholder.
    """
    soup = BeautifulSoup(article_html, 'html.parser')

    title_tag = soup.find('title')
    if title_tag:
        title = title_tag.text.strip()
    else:
        title = 'No title found'

    byline_tag = soup.find('div', class_='single-post-byline')
    if byline_tag:
        byline = byline_tag.text.strip()
    else:
        byline = 'No date found'
    date, time, authors = parse_date_info(byline)

    paragraphs = (paragraph.text for paragraph in soup.find_all('p'))
    content = clean_content(' '.join(paragraphs))
    return title, date, time, authors, content

# Function to parse date information
def parse_date_info(date_info):
    """Split a byline into its date, time and author list.

    Args:
        date_info: Byline text of the form
            "<Month> <day>, <year> at <h:mm> <am|pm> by <authors>".

    Returns:
        A (date, time, authors) tuple: date and time are the matched
        substrings and authors is a list of names. When the byline does not
        match, the placeholders 'No date found', 'No time found' and
        ['No author found'] are returned instead.
    """
    date_time_author_pattern = re.compile(r"([A-Za-z]+\s\d{1,2},\s\d{4})\s+at\s+(\d{1,2}:\d{2}\s[apm]{2})\s+by\s+(.+)")
    match = date_time_author_pattern.search(date_info)
    if not match:
        return 'No date found', 'No time found', ['No author found']

    date, time, authors_str = match.groups()
    # Splitting "A, B, and C" on both "," and "and" leaves an empty piece
    # between the comma and the "and"; drop empty pieces so no blank author
    # ends up in the list.
    pieces = (piece.strip() for piece in re.split(r',\s*|\band\b', authors_str))
    authors = [piece for piece in pieces if piece]
    if not authors:
        authors = ['No author found']
    return date, time, authors

# Function to check if the article content contains any of the keywords
def contains_keywords(content, keywords):
    """Report whether any keyword occurs in content as a whole word or phrase.

    Matching is case-insensitive: the text and every keyword are lower-cased,
    and each keyword must sit between word boundaries, so "black" does not
    match "blackboard".
    """
    haystack = content.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', haystack) is not None
        for term in keywords
    )

# Function to get all article URLs by paginating
def get_all_article_urls(base_url, max_pages=45):
    """Walk a category's paginated listing and gather every article URL.

    Pages are requested as "<base_url>page/<n>/" for n = 1..max_pages.
    Pagination stops early at the first page that cannot be fetched or that
    lists no articles.

    Args:
        base_url: Category URL; "page/<n>/" is appended to it directly, so it
            should end with "/".
        max_pages: Upper bound on the number of listing pages to request.

    Returns:
        A list of unique article URLs in first-seen order.
    """
    all_article_urls = []
    for page in range(1, max_pages + 1):
        page_url = f"{base_url}page/{page}/"
        if page % 10 == 0:
            print(f"Fetching {page_url}...")  # progress note every tenth page
        page_html = fetch_html(page_url)
        if not page_html:
            break  # fetch failed or returned an empty body: stop paginating
        article_urls = extract_article_urls(page_html)
        if not article_urls:
            break  # No more articles found, exit the loop
        all_article_urls.extend(article_urls)
    # Deduplicate with dict.fromkeys() instead of set(): set ordering differs
    # between runs, which made the order of the scraped rows irreproducible.
    return list(dict.fromkeys(all_article_urls))

# Function to filter articles by date
def filter_by_date(date_str, start_date):
    """Return True when date_str is on or after start_date.

    date_str must look like "January 5, 2020" (format '%B %d, %Y'); text that
    does not parse in that format is treated as out of range and gives False.
    """
    try:
        return datetime.strptime(date_str, '%B %d, %Y') >= start_date
    except ValueError:
        return False

# Scrape every article of one dailynexus.com category
def scrape_section(entry, start_date=datetime(2018, 1, 1)):
    """Scrape the articles of a category that were published since start_date.

    Args:
        entry: Category slug inserted into
            "https://dailynexus.com/category/<entry>/", e.g. 'la-vista'.
        start_date: Earliest byline date to keep. Defaults to January 1, 2018,
            the cut-off that was previously hard-coded.

    Returns:
        A list of dicts with the keys 'Section', 'Title', 'Date', 'Time',
        'Authors', 'URL' and 'Content', one per fetched article whose byline
        date parses and is on or after start_date.
    """
    # Number of trailing characters cut from every <title> text — presumably
    # the site-name suffix; TODO confirm against a live page.
    title_suffix_length = 18

    base_url = f"https://dailynexus.com/category/{entry}/"
    article_urls = get_all_article_urls(base_url)
    print(f"Found {len(article_urls)} articles in section {entry}.")

    articles = []

    # enumerate() supplies the 1-based progress counter printed per article
    for count, article_url in enumerate(article_urls, start=1):
        article_html = fetch_html(article_url)
        if article_html:
            title, date, time, authors, content = extract_article_content(article_html)
            if filter_by_date(date, start_date):
                # Keep the 'No title found' placeholder intact; slicing it
                # would reduce it to an empty string.
                if title != 'No title found':
                    title = title[:-title_suffix_length]
                articles.append({
                    'Section': entry,
                    'Title': title,
                    'Date': date,
                    'Time': time,
                    'Authors': authors,
                    'URL': article_url,
                    'Content': content
                })
        else:
            print(f"Failed to retrieve article: {article_url}")
        print(count)

    return articles

# Column names for the CSV output; they match the keys of the dicts built by
# scrape_section().
header = ['Section', 'Title', 'Date', 'Time', 'Authors', 'URL', 'Content']

# Scrape the 'la-vista' section and keep all collected articles in memory
articles = scrape_section('la-vista')

# Nothing has been written to disk yet; the number reported is the count of
# articles collected by scrape_section().
print(f"Scraping completed. Total articles written: {len(articles)}")


Found 116 articles in section la-vista.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
Scraping completed. Total articles written: 116


In [17]:
# Save the scraped articles to la-vista.csv, one row per article, without the
# DataFrame index column.
pd.DataFrame(articles).to_csv('la-vista.csv', index=False)

## Add Missed Articles (Run Only If Articles Were Missed)

In [None]:
# Keywords of interest for flagging articles.
# contains_keywords() lower-cases both the keyword and the article text before
# matching, so matching is case-insensitive.
# NOTE(review): none of the visible cells passes this list to
# contains_keywords(); confirm whether keyword filtering happens elsewhere.
keywords = [
    "asian", "hispanic", "latinx", "diversity", "equity", "inclusion", "equality", 
    "poc", "lgbtqia", "queer", "middle eastern", "black", "gay", "transgender", 
    "bisexual", "lesbian", "pansexual", "asexual", "homosexual", "nonbinary", 
    "intersex", "aromantic", "cisgender", "coming out", "genderfluid", "privilege", 
    "pronouns", "undocumented", "neurodiversity", "neurodivergent", "disability", 
    "microaggression", "identity", "discrimination", "culture", "misgendering"
]

# Function to fetch the HTML content of a URL
def fetch_html(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get() can block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        The decoded response body, or None when the request fails (network
        error, timeout, or an HTTP error status). Failures are reported with
        print() instead of being raised so callers can skip the page.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx statuses into an exception
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch URL {url}: {e}")
        return None

# Function to extract article URLs from a page
def extract_article_urls(page_html):
    """Collect the article links listed on one category page.

    Args:
        page_html: HTML source of a category listing page.

    Returns:
        A list of unique article URLs in the order they appear on the page.
        Each URL is the href of the first <a> inside a
        <div class="category-page-post-text">. Divs without a link, or whose
        first link has no href, are skipped instead of raising KeyError.
    """
    soup = BeautifulSoup(page_html, 'html.parser')
    urls = []
    for post in soup.find_all('div', class_='category-page-post-text'):
        link = post.find('a')  # look the anchor up once instead of twice
        href = link.get('href') if link else None
        if href:
            urls.append(href)
    # dict.fromkeys() drops duplicates while keeping first-seen order, so the
    # result is identical on every run (set() ordering is not).
    return list(dict.fromkeys(urls))

def clean_content(content):
    """Trim the article footer and repair mis-decoded punctuation.

    Args:
        content: Article text, possibly followed by page footer text.

    Returns:
        The text up to (not including) the first "Print Comments are closed."
        marker, with the garbled sequences for opening quotes, apostrophes and
        closing quotes replaced by ASCII quotes and "Â" characters removed.
    """
    marker = "Print Comments are closed."
    if (idx := content.find(marker)) != -1:
        content = content[:idx]

    # "â€" is a prefix of both "â€œ" and "â€™", so the longer sequences must be
    # replaced first; replacing "â€" earlier turned every "â€™" into '"™' and
    # apostrophes were never restored.
    content = content.replace("â€œ", "\"").replace("â€™", "\'").replace("â€", "\"")
    # Also strip "Â", as the clean_content() in the scraper cell does.
    content = content.replace("Â", "")

    return content

# Function to extract article content
def extract_article_content(article_html):
    """Pull the title, byline fields and body text out of an article page.

    Returns:
        A (title, date, time, authors, content) tuple. title is the stripped
        text of the page's <title> tag; date, time and authors come from
        parse_date_info() applied to the "single-post-byline" div; content is
        the text of every <p> joined with spaces and passed through
        clean_content(). A missing title or byline element is replaced by the
        'No title found' / 'No date found' placeholder.
    """
    soup = BeautifulSoup(article_html, 'html.parser')

    title_tag = soup.find('title')
    if title_tag:
        title = title_tag.text.strip()
    else:
        title = 'No title found'

    byline_tag = soup.find('div', class_='single-post-byline')
    if byline_tag:
        byline = byline_tag.text.strip()
    else:
        byline = 'No date found'
    date, time, authors = parse_date_info(byline)

    paragraphs = (paragraph.text for paragraph in soup.find_all('p'))
    content = clean_content(' '.join(paragraphs))
    return title, date, time, authors, content

# Function to parse date information
def parse_date_info(date_info):
    """Split a byline into its date, time and author list.

    Args:
        date_info: Byline text of the form
            "<Month> <day>, <year> at <h:mm> <am|pm> by <authors>".

    Returns:
        A (date, time, authors) tuple: date and time are the matched
        substrings and authors is a list of names. When the byline does not
        match, the placeholders 'No date found', 'No time found' and
        ['No author found'] are returned instead.
    """
    date_time_author_pattern = re.compile(r"([A-Za-z]+\s\d{1,2},\s\d{4})\s+at\s+(\d{1,2}:\d{2}\s[apm]{2})\s+by\s+(.+)")
    match = date_time_author_pattern.search(date_info)
    if not match:
        return 'No date found', 'No time found', ['No author found']

    date, time, authors_str = match.groups()
    # Splitting "A, B, and C" on both "," and "and" leaves an empty piece
    # between the comma and the "and"; drop empty pieces so no blank author
    # ends up in the list.
    pieces = (piece.strip() for piece in re.split(r',\s*|\band\b', authors_str))
    authors = [piece for piece in pieces if piece]
    if not authors:
        authors = ['No author found']
    return date, time, authors

# Function to check if the article content contains any of the keywords
def contains_keywords(content, keywords):
    """Report whether any keyword occurs in content as a whole word or phrase.

    Matching is case-insensitive: the text and every keyword are lower-cased,
    and each keyword must sit between word boundaries, so "black" does not
    match "blackboard".
    """
    haystack = content.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', haystack) is not None
        for term in keywords
    )

# Function to get all article URLs by paginating
def get_all_article_urls(base_url, max_pages=161):
    """Walk a category's paginated listing and gather every article URL.

    Pages are requested as "<base_url>page/<n>/" for n = 1..max_pages.
    Pagination stops early at the first page that cannot be fetched or that
    lists no articles.

    Args:
        base_url: Category URL; "page/<n>/" is appended to it directly, so it
            should end with "/".
        max_pages: Upper bound on the number of listing pages to request.

    Returns:
        A list of unique article URLs in first-seen order.
    """
    all_article_urls = []
    for page in range(1, max_pages + 1):
        page_url = f"{base_url}page/{page}/"
        if page % 10 == 0:
            print(f"Fetching {page_url}...")  # progress note every tenth page
        page_html = fetch_html(page_url)
        if not page_html:
            break  # fetch failed or returned an empty body: stop paginating
        article_urls = extract_article_urls(page_html)
        if not article_urls:
            break  # No more articles found, exit the loop
        all_article_urls.extend(article_urls)
    # Deduplicate with dict.fromkeys() instead of set(): set ordering differs
    # between runs, which made the order of the scraped rows irreproducible.
    return list(dict.fromkeys(all_article_urls))

# Function to filter articles by date
def filter_by_date(date_str, start_date):
    """Return True when date_str is on or after start_date.

    date_str must look like "January 5, 2020" (format '%B %d, %Y'); text that
    does not parse in that format is treated as out of range and gives False.
    """
    try:
        return datetime.strptime(date_str, '%B %d, %Y') >= start_date
    except ValueError:
        return False

# Scrape an explicit list of article URLs that the section crawl missed
def scrape_section(entry, article_urls=None, start_date=datetime(2018, 1, 1)):
    """Fetch specific articles and return them in the section-scrape format.

    Args:
        entry: Section label stored in each record's 'Section' field; it is
            not used to build any URL here.
        article_urls: URLs of the articles to fetch. Defaults to the single
            article that was previously hard-coded.
        start_date: Earliest byline date to keep. Defaults to January 1, 2018,
            the cut-off that was previously hard-coded.

    Returns:
        A list of dicts with the keys 'Section', 'Title', 'Date', 'Time',
        'Authors', 'URL' and 'Content', one per fetched article whose byline
        date parses and is on or after start_date.
    """
    if article_urls is None:
        article_urls = ['https://dailynexus.com/2021-03-11/report-it-rained/']

    # Number of trailing characters cut from every <title> text — presumably
    # the site-name suffix; TODO confirm against a live page.
    title_suffix_length = 18

    articles = []

    # enumerate() supplies the 1-based progress counter printed per article
    for count, article_url in enumerate(article_urls, start=1):
        article_html = fetch_html(article_url)
        if article_html:
            title, date, time, authors, content = extract_article_content(article_html)
            if filter_by_date(date, start_date):
                # Keep the 'No title found' placeholder intact; slicing it
                # would reduce it to an empty string.
                if title != 'No title found':
                    title = title[:-title_suffix_length]
                articles.append({
                    'Section': entry,
                    'Title': title,
                    'Date': date,
                    'Time': time,
                    'Authors': authors,
                    'URL': article_url,
                    'Content': content
                })
        else:
            print(f"Failed to retrieve article: {article_url}")
        print(count)

    return articles

# Column names for the CSV output; they match the keys of the dicts built by
# scrape_section() and are used as the DictWriter fieldnames when appending.
header = ['Section', 'Title', 'Date', 'Time', 'Authors', 'URL', 'Content']

# Fetch the missed article(s) and label them with the 'daily-stench' section
articles = scrape_section('daily-stench')

# Nothing has been written to disk yet; the number reported is the count of
# articles collected by scrape_section().
print(f"Scraping completed. Total articles written: {len(articles)}")


1
Scraping completed. Total articles written: 1


In [None]:
# Append every entry in articles to daily-stench.csv
with open('daily-stench.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=header)
    # In append mode the stream starts at the end of the file, so position 0
    # means the file is new or empty and still needs its header row.
    if f.tell() == 0:
        writer.writeheader()
    writer.writerows(articles)
