In [None]:
import glob
import os
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Entry point: the Wired sitemap page that holds all the archive links
url = "https://www.wired.com/sitemap/"

# Archive links collected by later cells (starts out empty)
urlPages = list()

In [None]:
# Download the sitemap page and parse it with BeautifulSoup.
# TLS certificate verification stays at its default (enabled), the timeout
# keeps the cell from hanging on a stalled connection, and an HTTP error
# status fails fast instead of an error page being parsed as if it were data.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")

# Preview only the start of the document; printing the whole page floods the output.
print(soup.prettify()[:2000])

In [None]:
# Find all links in the sitemap archive section
urlList = soup.find("div", class_="sitemap__section-archive")
if urlList is None:
    # Without this guard the next statement fails with an opaque
    # "'NoneType' object has no attribute 'find_all'".
    raise RuntimeError(
        'Could not find <div class="sitemap__section-archive"> in the sitemap page; '
        "the page layout may have changed or the download returned unexpected content."
    )

links = urlList.find_all('a')



In [None]:
# Extract the links between 1990 and 2001

# Empty the list first so that re-running this cell does not append duplicates.
urlPages.clear()

for link in links:
    href = link.get('href')
    if not href:
        continue

    # The year is the four characters that follow "year=" in the href.
    year_index = href.find("year=")
    if year_index == -1:
        continue
    year_str = href[year_index + 5: year_index + 9]
    if not year_str.isdigit():
        continue

    if 1990 <= int(year_str) <= 2001:
        # urljoin handles site-relative and already-absolute hrefs alike;
        # plain string concatenation would mangle an absolute href.
        full_link = urljoin("https://www.wired.com", href)
        urlPages.append(full_link)
        print(f"Added link: {full_link}")


In [None]:
# Sanity check: number of archive links kept by the 1990-2001 filter
print(len(urlPages))

In [None]:
# Define function to extract article links from each issue URL


def extract_article_links(issue_url):
    """Return the absolute URLs of all articles linked from one archive page.

    Args:
        issue_url: URL of an archive (issue) page on wired.com.

    Returns:
        list[str]: One absolute URL per ``<a href>`` found inside the
        ``sitemap__section-archive`` div. The list is empty when the page
        cannot be downloaded, the section is missing, or it contains no
        links, so callers can always ``extend`` a list with the result.
    """
    try:
        page = requests.get(issue_url, timeout=30)
        page.raise_for_status()
    except requests.exceptions.RequestException as error:
        print(f"Request failed for URL: {issue_url} ({error})")
        return []

    soup = BeautifulSoup(page.content, "html.parser")

    # Find the articles section
    articles = soup.find("div", class_="sitemap__section-archive")
    if articles is None:
        print(f"No articles section found for URL: {issue_url}")
        return []

    # Find all article links within the section
    article_links = articles.find_all('a')
    if not article_links:
        print(f"No articles found for URL: {issue_url}")
        return []

    # Resolve every href against the site root; urljoin leaves hrefs that
    # are already absolute untouched instead of prefixing them a second time.
    full_article_links = []
    for link in article_links:
        href = link.get('href')
        if href:
            full_article_links.append(urljoin("https://www.wired.com", href))

    return full_article_links



In [None]:
# Build a dictionary keyed by year; each value holds the issue URLs for that
# year and the article links found on those issue pages.

chosen_links_articles = {}

# Cycle through all issue URLs
for chosen_link in urlPages:
    # The year is whatever sits between "year=" and the next "&" in the URL
    year = chosen_link.split("year=")[-1].split("&")[0]
    article_links = extract_article_links(chosen_link)

    year_entry = chosen_links_articles.setdefault(year, {'issue_links': [], 'articles': []})
    year_entry['issue_links'].append(chosen_link)  # Add the issue link to the list for the year

    # Guard against a non-list result (for example an error-message string):
    # extending a list with a string would add one bogus "link" per character.
    if isinstance(article_links, (list, tuple)):
        year_entry['articles'].extend(article_links)  # Add article links to the list for the year


for year, data in chosen_links_articles.items():
    print(f"Year: {year}")
    print("Issue URLs:")
    for issue_link in data['issue_links']:
        print(issue_link)
    print("Article URLs:")
    for article_link in data['articles']:
        print(article_link)
    print()


In [None]:
def fetch_url(url, retries=5, backoff_factor=1):
    """GET ``url``, retrying with exponential backoff when a request fails.

    Args:
        url: Address to download.
        retries: Maximum number of attempts.
        backoff_factor: Base delay in seconds; the pause after failed attempt
            ``k`` (counting from 0) is ``backoff_factor * 2 ** k``.

    Returns:
        The ``requests`` response of the first attempt that completes with a
        non-error HTTP status, or ``None`` when every attempt fails.
    """
    for attempt in range(retries):
        try:
            # The timeout stops a stalled connection from blocking the scrape.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} for URL {url} failed: {e}")
            # Back off only when another attempt follows; sleeping after the
            # last failure would merely delay returning None.
            if attempt < retries - 1:
                time.sleep(backoff_factor * (2 ** attempt))
    return None

In [None]:
# Define function to extract all the relevant information: Title, Year, Summary Description, Tag category, Article text content
# from article URLs and return a dictionary

def _find_with_fallback(soup, name, exact, relaxed):
    """Return the first ``name`` tag matching ``exact``, else the first one matching ``relaxed``."""
    element = soup.find(name, exact)
    if element is None:
        element = soup.find(name, relaxed)
    return element


def _stripped_text(element):
    """Return the whitespace-stripped text of ``element``, or None when the element is missing."""
    return element.text.strip() if element is not None else None


def extract_article_info(article_url):
    """Download one article page and pull out its main fields.

    Args:
        article_url: URL of the article page.

    Returns:
        dict | None: ``None`` when the page could not be fetched; otherwise a
        dict with the keys ``title``, ``year``, ``summary``, ``tag`` and
        ``text_content``. Any field whose element is not found is ``None``.
        ``year`` is the first four characters of the publish-date element's
        ``datetime`` attribute.
    """
    # Request the page and parse it with BeautifulSoup
    page = fetch_url(article_url)
    if page is None:
        return None

    soup = BeautifulSoup(page.content, "html.parser")

    # Each lookup tries the original, fully-qualified selector first and only
    # then a relaxed one. NOTE(review): the hash-like class suffixes
    # (e.g. "iUEiRd") look build-generated and presumably change between site
    # deployments - confirm; the fallback keeps extraction working if they do.

    # Find title
    title_element = _find_with_fallback(
        soup, 'h1',
        {'data-testid': 'ContentHeaderHed', 'class': 'BaseWrap-sc-gjQpdd BaseText-ewhhUZ ContentHeaderHed-NCyCC iUEiRd htVlUB kctZMs'},
        {'data-testid': 'ContentHeaderHed'},
    )
    title = _stripped_text(title_element)

    # Find year
    time_element = _find_with_fallback(
        soup, 'time',
        {'data-testid': 'ContentHeaderPublishDate', 'class': 'BaseWrap-sc-gjQpdd BaseText-ewhhUZ ContentHeaderTitleBlockPublishDate-hYmSqb iUEiRd jpVMoQ cXawal'},
        {'data-testid': 'ContentHeaderPublishDate'},
    )
    datetime_value = time_element.get('datetime') if time_element is not None else None
    year = datetime_value[:4] if datetime_value else None

    # Find summary
    summary_element = _find_with_fallback(
        soup, 'div',
        {'class': 'ContentHeaderDek-bIqFFZ fOichq'},
        {'class': lambda css_class: css_class is not None and css_class.startswith('ContentHeaderDek-')},
    )
    summary = _stripped_text(summary_element)

    # Find tag category
    tag_element = _find_with_fallback(
        soup, 'span',
        {'class': 'RubricName-fVtemz cLxcNi rubric__name'},
        {'class': 'rubric__name'},
    )
    tag = _stripped_text(tag_element)

    # Find text content
    body_element = soup.find('div', class_='body__inner-container')
    text_content = _stripped_text(body_element)

    return {
        'title': title,
        'year': year,
        'summary': summary,
        'tag': tag,
        'text_content': text_content
    }




In [None]:
import time

# Define function to extract the article information from all the article URLs

def extract_articles_info(article_links, delay=1):
    """Scrape every URL in ``article_links`` and collect the results.

    Args:
        article_links: Iterable of article URLs.
        delay: Seconds to pause before each request so the server is not
            hit too hard. Defaults to 1; pass 0 to disable the pause.

    Returns:
        list: The truthy results of ``extract_article_info`` in input order;
        articles for which it returned ``None`` are left out.
    """
    articles_info = []
    for article_link in article_links:
        if delay > 0:
            time.sleep(delay)  # Delay to avoid hitting the server too hard
        article_info = extract_article_info(article_link)
        if article_info:  # Skip articles that could not be scraped
            articles_info.append(article_info)
    return articles_info




In [None]:
import os

# Define function to scrape and save article information into csv files in batches of 100


def scrape_and_save_articles(chosen_links_articles, batch_size=100):
    """Scrape the articles of every year and write them to per-batch CSV files.

    Files are named ``articles_<year>_batch_<n>.csv`` with ``n`` starting at 1
    and are written to the current working directory. Batches whose file
    already exists (counting up from batch 1 without a gap) are skipped, so an
    interrupted run can be resumed by calling the function again.

    Args:
        chosen_links_articles: Mapping of year -> dict whose ``'articles'``
            entry is the list of article URLs for that year.
        batch_size: Number of article URLs scraped per CSV file.
    """
    # Header used when a batch yields no article at all. A frame without
    # columns would be written as a CSV without a header row, which
    # pd.read_csv cannot parse when the batches are merged later.
    empty_batch_columns = ['title', 'year', 'summary', 'tag', 'text_content']

    for year, data in chosen_links_articles.items():
        articles_links = data['articles']
        total_articles = len(articles_links)

        # Check for existing files to determine the next batch to process
        start_batch = 0
        while os.path.exists(f'articles_{year}_batch_{start_batch + 1}.csv'):
            start_batch += 1

        for i in range(start_batch * batch_size, total_articles, batch_size):
            batch_number = i // batch_size + 1
            batch_articles_links = articles_links[i:i + batch_size]  # Get the next batch of articles
            batch_articles_info = extract_articles_info(batch_articles_links)  # Extract article information

            # Convert the list of dictionaries to a DataFrame
            if batch_articles_info:
                batch_articles_df = pd.DataFrame(batch_articles_info)
            else:
                batch_articles_df = pd.DataFrame(columns=empty_batch_columns)

            # Save to a CSV file for the current batch
            batch_csv_filename = f'articles_{year}_batch_{batch_number}.csv'
            batch_articles_df.to_csv(batch_csv_filename, index=False, encoding='utf-8')

            print(f"Articles for year {year}, batch {batch_number} saved to {batch_csv_filename}")



In [None]:
#scrape_and_save_articles(chosen_links_articles)

In [None]:
import glob

# Define function to merge the CSV batches for each year into a single file

def merge_batch_csvs_to_yearly_csvs(chosen_links_articles):
    """Merge each year's ``articles_<year>_batch_<n>.csv`` files into ``articles_<year>.csv``.

    Batch files are concatenated in ascending batch-number order. Years with
    no readable batch file are reported and skipped rather than raising.

    Args:
        chosen_links_articles: Mapping whose keys are the years to merge.
    """
    def batch_sort_key(path):
        # glob returns files in arbitrary order and a plain string sort would
        # put batch_10 before batch_2, so order by the numeric suffix instead.
        suffix = path.rsplit('_batch_', 1)[-1][:-len('.csv')]
        if suffix.isdecimal():
            return (0, int(suffix), path)
        return (1, 0, path)

    for year in chosen_links_articles.keys():
        # Get all batch files for the specified year
        all_files = sorted(glob.glob(f'articles_{year}_batch_*.csv'), key=batch_sort_key)
        all_df = []

        for file in all_files:
            try:
                all_df.append(pd.read_csv(file))
            except pd.errors.EmptyDataError:
                # A file without even a header row carries no data to merge.
                print(f"Skipping empty batch file {file}")

        if not all_df:
            # pd.concat raises on an empty list, so skip this year explicitly.
            print(f"No batch files found for year {year}; nothing to merge.")
            continue

        merged_df = pd.concat(all_df, ignore_index=True)
        yearly_csv_filename = f'articles_{year}.csv'
        merged_df.to_csv(yearly_csv_filename, index=False, encoding='utf-8')

        print(f"All batches for year {year} merged into {yearly_csv_filename}")

In [None]:
#merge_batch_csvs_to_yearly_csvs(chosen_links_articles)

In [None]:
# Combine every per-year CSV that exists into one file holding the full dataset

def merge_all_yearly_csvs(chosen_links_articles):
    """Concatenate the existing ``articles_<year>.csv`` files into ``all_articles_merged.csv``.

    Args:
        chosen_links_articles: Mapping whose keys are the years to look for.

    Returns:
        None. Prints a notice and writes nothing when no yearly file exists.
    """
    yearly_paths = []
    for year in chosen_links_articles.keys():
        candidate = f'articles_{year}.csv'
        if os.path.exists(candidate):
            yearly_paths.append(candidate)

    if len(yearly_paths) == 0:
        print("No yearly files found to merge.")
        return

    # Read each yearly file, then stack them with a fresh 0..n-1 index
    frames = [pd.read_csv(path) for path in yearly_paths]
    combined = pd.concat(frames, ignore_index=True)

    output_name = 'all_articles_merged.csv'
    combined.to_csv(output_name, index=False, encoding='utf-8')

    print(f"All yearly CSV files merged into {output_name}")

In [None]:
#merge_all_yearly_csvs(chosen_links_articles)