<a href="https://colab.research.google.com/github/jonathanchau/Media-Analysis/blob/main/News_Article_Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import random # Placeholder for 'Political Slant' and 'Significant Phrases'
import re # regular expressions, used in extracting author and date

In [None]:
# Function to scrape articles and pair topics from the CSV
def scrape_articles(csv_file):
    """
    Reads a CSV file with URLs and topics, scrapes article data, and pairs each URL with its topic.

    Scraping is best-effort: a URL that cannot be fetched or parsed is
    reported on stdout and skipped, so the returned list may be shorter
    than the CSV.

    Args:
        csv_file (str): Path to the CSV file with 'URL' and 'Topic' columns.
            An optional 'Media Outlet' column is copied through unchanged;
            when the column is absent every article gets "Unknown".

    Returns:
        list: A list of dictionaries containing scraped article data.

    Raises:
        ValueError: If the CSV lacks the 'URL' or 'Topic' column.
    """
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Ensure the required columns exist
    if any(column not in df.columns for column in ('URL', 'Topic')):
        raise ValueError("CSV file must contain 'URL' and 'Topic' columns.")

    # 'Media Outlet' is not part of the documented contract, so a CSV without
    # it must not abort the whole run with a KeyError on the first row.
    has_media_outlet = 'Media Outlet' in df.columns

    articles_data = []

    for i, row in df.iterrows():
        url = row['URL']
        topic = row['Topic']
        media_outlet = row['Media Outlet'] if has_media_outlet else "Unknown"

        # Broad catch is deliberate: one bad URL must not stop the batch.
        try:
            # Bounded wait so a single unresponsive server cannot hang the run.
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                print(f"Failed to fetch URL: {url}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # <title> may be missing, empty, or contain nested markup (in
            # which case .string is None), so read its text defensively.
            title_tag = soup.title
            title = title_tag.get_text(strip=True) if title_tag is not None else ""
            if not title:
                title = "No Title"

            word_count = len(soup.get_text().split())  # Approximate word count

            # NOTE: both helpers fetch the URL again themselves.
            author = extract_author(url)["Author"]
            release_date = extract_publish_date(url)

            # Placeholders until the real slant / phrase algorithms exist.
            political_slant = round(random.uniform(-1.0, 1.0), 2)
            significant_phrases = random.randint(0, 10)

            # Main content: every non-empty <p> on the page, joined by spaces.
            # Adjust the selector to the structure of the target webpages.
            paragraph_texts = (p.get_text() for p in soup.find_all('p'))
            article_text = " ".join(text for text in paragraph_texts if text)

            articles_data.append({
                "Article ID": i + 1,  # 1-based, derived from the CSV row index
                "Media Outlet": media_outlet,  # From CSV
                "Title": title,
                "URL": url,  # From CSV
                "Word Count": word_count,
                "Author": author,
                "Release Date": release_date,
                "Topic": topic,  # From CSV
                "Political Slant": political_slant,
                "Number of Significant Phrases": significant_phrases,
                "Article Text": article_text,
            })
        except Exception as e:
            print(f"Error processing URL {url}: {e}")

    return articles_data

In [None]:
# Function to add rows to the existing DataFrame
def add_articles_to_dataframe(existing_df, articles):
    """
    Appends article records to a DataFrame and returns the combined result.

    Args:
        existing_df (pd.DataFrame): The DataFrame to extend (not modified).
        articles (list/dict): One article dictionary, or a list of them.

    Returns:
        pd.DataFrame: A new DataFrame holding the existing rows followed by
        the new ones, re-indexed from 0.
    """
    # A lone dictionary is treated as a one-row batch.
    rows = [articles] if isinstance(articles, dict) else articles
    incoming = pd.DataFrame(rows)
    return pd.concat([existing_df, incoming], ignore_index=True)


In [None]:
def extract_author(url):
    """
    Extracts the author from an article page.
    Works for a broad range of HTML structures using general heuristics.

    The heuristics are tried in order and the first non-empty result wins:
      1. <meta name="author"> / <meta property="article:author">
      2. the first div/span/p whose class contains 'author' or 'byline'
      3. a "By <Capitalized Name>" pattern in the page text

    Args:
        url (str): The URL of the article to scrape.

    Returns:
        dict: A dictionary with the single key 'Author'. The value is
        "Unknown" when the page cannot be fetched or no author is found.
    """
    try:
        # Fetch the HTML content; bounded wait so a stalled server cannot hang the caller.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch URL: {url}")
            # Same shape as every other return path (only 'Author').
            return {"Author": "Unknown"}

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # --- AUTHOR EXTRACTION ---
        author = None

        # Look for meta tags with 'author' keyword
        author_meta = soup.find('meta', attrs={'name': 'author'}) or soup.find('meta', attrs={'property': 'article:author'})
        if author_meta is not None and author_meta.get('content'):
            # A whitespace-only value counts as "not found" and falls through.
            author = author_meta['content'].strip()

        # Look for common div/span/p tags with class names containing 'author' or 'byline'
        if not author:
            author_tags = soup.find_all(['div', 'span', 'p'], class_=re.compile(r'author|byline', re.I))
            for tag in author_tags:
                # Join child nodes with a space; without a separator adjacent
                # elements fuse into strings like "ByLandon MionFox News".
                text = tag.get_text(" ", strip=True)
                # Drop a leading "By" / "By:" label so only the name remains.
                # \b keeps names such as "Byron" intact.
                text = re.sub(r'^by\b[\s:]*', '', text, flags=re.I)
                if text:
                    author = text
                    break

        # Use heuristic: Search for 'By <Author>' in page text
        if not author:
            match = re.search(r'By\s+([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)', soup.get_text())
            if match:
                author = match.group(1)

        # Fallback for unknown author
        return {"Author": author if author else "Unknown"}

    except Exception as e:
        # Best-effort: network or parsing failures degrade to "Unknown".
        print(f"Error extracting data from {url}: {e}")
        return {"Author": "Unknown"}

In [None]:
'''
# Example usage of extract author
if __name__ == "__main__":
    # Example URLs – Replace with actual article URLs for testing
    urls = [
        "https://www.cnn.com/us/live-news/madison-wisconsin-school-shooting-12-17-24/index.html"
    ]

    for url in urls:
        result = extract_author(url)
        print(f"URL: {url}\nAuthor: {result['Author']}")
'''

'\n# Example usage of extract author\nif __name__ == "__main__":\n    # Example URLs – Replace with actual article URLs for testing\n    urls = [\n        "https://www.cnn.com/us/live-news/madison-wisconsin-school-shooting-12-17-24/index.html"\n    ]\n\n    for url in urls:\n        result = extract_author(url)\n        print(f"URL: {url}\nAuthor: {result[\'Author\']}")\n'

In [None]:
from bs4 import BeautifulSoup
import requests
import re
from datetime import datetime

def _iter_publish_date_candidates(soup):
    """Yield raw date strings found in the parsed page, most reliable source first."""
    # 1. <meta> tags with common date-related attributes. Publication dates
    #    are listed before 'og:updated_time' so that a later edit timestamp
    #    is only used when no publication timestamp is present.
    meta_date_tags = [
        {'property': 'article:published_time'},
        {'name': 'date'},
        {'name': 'publishdate'},
        {'name': 'pubdate'},
        {'itemprop': 'datePublished'},
        {'property': 'og:published_time'},
        {'property': 'og:updated_time'},
    ]
    for meta_attrs in meta_date_tags:
        meta_date = soup.find('meta', attrs=meta_attrs)
        if meta_date and meta_date.get('content'):
            yield meta_date['content']

    # 2. <time> tags that carry a datetime attribute
    for time_tag in soup.find_all('time', attrs={'datetime': True}):
        yield time_tag.get('datetime')

    # 3. Generic tags (<span>, <p>, <div>) whose class mentions 'date' or 'time'
    for tag in soup.find_all(['span', 'p', 'div'], attrs={'class': re.compile(r'date|time', re.I)}):
        yield tag.get_text(strip=True)

    # 4. Heuristic: first YYYY-MM-DD-like pattern (separators -, / or .) in the page text
    date_pattern = re.compile(r'\b(20[0-9]{2})[-/.](0[1-9]|1[0-2])[-/.](0[1-9]|[12][0-9]|3[01])\b')
    match = date_pattern.search(soup.get_text())
    if match:
        yield match.group()


def extract_publish_date(url):
    """
    Extracts the publish date of an article from a URL's HTML content.

    Candidate strings are gathered from meta tags, <time> tags, date-like
    class names and finally the page text; the first one that parse_date
    accepts is returned.

    Args:
        url (str): The URL of the article.

    Returns:
        str: The publish date in 'YYYY-MM-DD' format, or "Unknown" if the
        page cannot be fetched or no candidate can be parsed.
    """
    try:
        # Fetch the HTML content
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch URL: {url} (Status Code: {response.status_code})")
            return "Unknown"

        soup = BeautifulSoup(response.text, 'html.parser')

        # The generator is lazy, so later (more expensive, less reliable)
        # searches only run when every earlier candidate failed to parse.
        for candidate in _iter_publish_date_candidates(soup):
            try:
                return parse_date(candidate)
            except ValueError:
                continue  # Try the next candidate

        # Fallback: Return "Unknown" if no date is found
        return "Unknown"

    except Exception as e:
        # Best-effort: any network or parsing failure degrades to "Unknown".
        print(f"Error extracting publish date from {url}: {e}")
        return "Unknown"

def parse_date(date_str):
    """
    Attempts to parse a date string into 'YYYY-MM-DD' format.

    Accepts plain dates in several numeric and month-name layouts, and
    ISO 8601 timestamps (e.g. '2024-12-17T15:30:00Z'), which is the form
    normally found in 'article:published_time' meta tags and
    <time datetime="..."> attributes. For timestamps the date part is
    returned as written; no timezone conversion is applied.

    Args:
        date_str (str): The raw date string. Surrounding whitespace is ignored.

    Returns:
        str: The parsed date in 'YYYY-MM-DD' format.

    Raises:
        ValueError: If the date cannot be parsed (including empty or
            non-string input).
    """
    if not isinstance(date_str, str) or not date_str.strip():
        raise ValueError(f"Unable to parse date: {date_str}")
    text = date_str.strip()

    # Plain-date layouts, tried in this order (ambiguous inputs resolve to
    # the first layout that fits).
    date_formats = (
        "%Y-%m-%d", "%d-%m-%Y", "%Y/%m/%d", "%d/%m/%Y",
        "%Y.%m.%d", "%d.%m.%Y", "%B %d, %Y", "%d %B %Y",
        "%b %d, %Y", "%d %b %Y",
    )
    for date_format in date_formats:
        try:
            return datetime.strptime(text, date_format).date().isoformat()
        except ValueError:
            continue

    # ISO 8601 timestamp. Older Python versions reject a trailing 'Z', so it
    # is rewritten as an explicit UTC offset first.
    iso_text = text[:-1] + "+00:00" if text[-1] in "Zz" else text
    try:
        return datetime.fromisoformat(iso_text).date().isoformat()
    except ValueError:
        pass

    # Last resort for timestamp variants fromisoformat does not accept:
    # take a leading YYYY-MM-DD that is followed by a time separator.
    leading_date = re.match(r"(\d{4}-\d{2}-\d{2})[T\s]", text)
    if leading_date:
        try:
            return datetime.strptime(leading_date.group(1), "%Y-%m-%d").date().isoformat()
        except ValueError:
            pass

    raise ValueError(f"Unable to parse date: {date_str}")

In [None]:
# Column schema for the MEDIA table, in output order.
# The comment above each name gives the intended type and meaning.
media_columns = [
    # INTEGER – unique identifier for the article
    "Article ID",
    # STRING – publisher of the article
    "Media Outlet",
    # STRING – title of the article
    "Title",
    # STRING – URL of the article
    "URL",
    # INTEGER – number of words in the article
    "Word Count",
    # STRING – name of the author of the article
    "Author",
    # DATE – date the article was published
    "Release Date",
    # STRING – topic category of the article
    "Topic",
    # FLOAT – political bias ranging from -1.0 to 1.0
    "Political Slant",
    # INTEGER – number of significant phrases
    "Number of Significant Phrases",
    # STRING – text of the article body
    "Article Text",
]

In [None]:
# Scrape File
if __name__ == "__main__":
    # Entry cell: scrape every URL listed in the input CSV, build the MEDIA
    # DataFrame and write it to disk. media_df is left at module level and
    # is read by the cells that follow.

    # Path to the CSV file (replace with the correct path).
    # scrape_articles requires 'URL' and 'Topic' columns in this file.
    csv_file = "/content/article_data_uncleaned_12_13_2024.csv"

    # Call the scraping function; returns one dictionary per scraped article
    scraped_data = scrape_articles(csv_file)

    # Convert the list of dictionaries into a DataFrame, with the columns
    # selected and ordered by media_columns
    media_df = pd.DataFrame(scraped_data, columns=media_columns)

    # Print the DataFrame
    print("\nScraped Media DataFrame:")
    print(media_df)

    # Save to a new CSV file in the current working directory
    # (index=False: the row index is not written as a column)
    media_df.to_csv("scraped_articles_with_topics.csv", index=False)
    print("\nScraped data saved to 'scraped_articles_with_topics.csv'")


Scraped Media DataFrame:
    Article ID Media Outlet  \
0            1          ABC   
1            2          ABC   
2            3          ABC   
3            4          ABC   
4            5          ABC   
..         ...          ...   
95          96          FOX   
96          97          FOX   
97          98          FOX   
98          99          FOX   
99         100          FOX   

                                                Title  \
0   John Mateer commits to Oklahoma after Washingt...   
1   Benched Will Levis: Still believe I can be Tit...   
2   Fashion influencer Matilda Djerf apologizes fo...   
3   Jay-Z's lawyers challenge rapper's inclusion i...   
4   Families express helplessness after cyclone ri...   
..                                                ...   
95  OpenAI whistleblower found dead in San Francis...   
96  Breast cancer patient has ‘miraculous’ recover...   
97  Cookware, coffee, billfolds and other American...   
98  Arkansas tops the list of m

In [None]:
# Display the scraped DataFrame as this cell's output
media_df

Unnamed: 0,Article ID,Media Outlet,Title,URL,Word Count,Author,Release Date,Topic,Political Slant,Number of Significant Phrases,Article Text
0,1,ABC,John Mateer commits to Oklahoma after Washingt...,https://abcnews.go.com/Sports/john-mateer-comm...,574,ABC News,Unknown,Sports,0.25,5,Washington State transfer quarterback John Mat...
1,2,ABC,Benched Will Levis: Still believe I can be Tit...,https://abcnews.go.com/Sports/benched-levis-ti...,573,ABC News,Unknown,Sports,-0.19,2,"NASHVILLE, Tenn. -- Titans quarterback Will Le..."
2,3,ABC,Fashion influencer Matilda Djerf apologizes fo...,https://abcnews.go.com/Entertainment/wireStory...,558,ABC News,Unknown,Celebrity,-0.21,1,A popular social media influencer has issued a...
3,4,ABC,Jay-Z's lawyers challenge rapper's inclusion i...,https://abcnews.go.com/Entertainment/wireStory...,690,ABC News,Unknown,Celebrity,0.17,10,Jay-Z’s lawyers are asking a judge to speedily...
4,5,ABC,Families express helplessness after cyclone ri...,https://abcnews.go.com/International/wireStory...,1026,ABC News,Unknown,Weather,0.58,10,Four days after the strongest cyclone in nearl...
...,...,...,...,...,...,...,...,...,...,...,...
95,96,FOX,OpenAI whistleblower found dead in San Francis...,https://www.foxnews.com/us/openai-whistleblowe...,1152,ByLandon MionFox News,Unknown,Technology,-0.80,1,"\n This material may not be published, br..."
96,97,FOX,Breast cancer patient has ‘miraculous’ recover...,https://www.foxnews.com/health/breast-cancer-p...,2079,ByMelissa RudyFox News,Unknown,Health,-0.67,3,"\n This material may not be published, br..."
97,98,FOX,"Cookware, coffee, billfolds and other American...",https://www.foxnews.com/lifestyle/american-mad...,2146,"ByAshlyn Messier,Brittany KaskoFox News",Unknown,Lifestyle,-0.70,0,"\n This material may not be published, br..."
98,99,FOX,Arkansas tops the list of most popular places ...,https://www.foxnews.com/media/arkansas-tops-li...,1417,ByNikolas LanumFox News,Unknown,Lifestyle,-0.32,1,"\n This material may not be published, br..."


In [None]:
# Article text of the row with index label 71 (not the last row: the DataFrame printed above runs from 0 to 99)
media_df['Article Text'][71]

In [None]:
# Select the rows at positions 15 through 19 (the end of an iloc slice is exclusive) and print them
rows_range = media_df.iloc[15:20]
print(rows_range)