### Step 1: Import Necessary Libraries

In [18]:
import pandas as pd
from nytimes_scraper.nyt_api import NytApi
from nytimes_scraper.comments import fetch_comments_by_article, comments_to_df


## Step 2: Access your API Key

In [20]:
api = NytApi('owjL3q1wD8x7tvdC8UGXBBLKsvaA6eoV')

## Step 3: Read URLs From CSV File

In [22]:
def read_urls_from_csv(file_path):
    """
    Reads a CSV file and extracts article URLs and titles.
    
    Parameters:
    file_path (str): Path to the CSV file containing article URLs.
    
    Returns:
    pd.DataFrame: A DataFrame containing article URLs and titles.
    """
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()
    print("Columns in the CSV file:", df.columns)
    articles_df = df[['web_url', 'title']].copy()
    return articles_df

## Step 4: Process Comments and Replies 

In [24]:
def process_comment_thread(comment, article_url, article_title, all_comments):
    """
    Recursively process a comment and its replies, adding article metadata to each.
    
    Parameters:
    comment (dict): The comment to process
    article_url (str): The URL of the article
    article_title (str): The title of the article
    all_comments (list): List to store all processed comments
    """
    # Add article metadata to the current comment
    comment['web_url'] = article_url
    comment['title'] = article_title
    
    # Store the processed comment
    all_comments.append(comment)
    
    # Process replies if they exist
    if 'replies' in comment and comment['replies']:
        for reply in comment['replies']:
            process_comment_thread(reply, article_url, article_title, all_comments)

## Step 5: Fetch Comments from the URLs

In [26]:
def fetch_comments_for_urls(api, articles_df):
    """
    Fetch comments for articles and preserve article metadata for all comments and replies.
    
    Parameters:
    api (NytApi): An instance of the NytApi class.
    articles_df (pd.DataFrame): DataFrame containing article URLs and titles.
    
    Returns:
    List[dict]: A list of comment dictionaries with article metadata.
    """
    all_comments = []
    
    for _, article in articles_df.iterrows():
        url = article['web_url']
        title = article['title']
        
        try:
            print(f"Fetching comments for: {title}")
            comments = fetch_comments_by_article(api, url)
            
            # Process each top-level comment and its replies
            for comment in comments:
                process_comment_thread(comment, url, title, all_comments)
                
        except Exception as e:
            print(f"Error fetching comments for {url}: {e}")
    
    return all_comments

## Step 6: Save Comments to a CSV File

In [28]:
def save_comments_to_csv(comments, output_file):
    """
    Save comments with article metadata to a CSV file.
    
    Parameters:
    comments (List[dict]): A list of comment dictionaries with article metadata.
    output_file (str): Path to the output CSV file.
    """
    if comments:
        # Convert the comments to a DataFrame
        comment_df = comments_to_df(comments)
        
        # Ensure article metadata columns are first
        metadata_cols = ['web_url', 'title']
        other_cols = [col for col in comment_df.columns if col not in metadata_cols]
        comment_df = comment_df[metadata_cols + other_cols]
        
        # Save the DataFrame to a CSV file
        comment_df.to_csv(output_file, index=False)
        print(f"Comments saved to {output_file}")
        
        # Print summary statistics
        print(f"\nSummary:")
        print(f"Total number of comments: {len(comment_df)}")
        print(f"Number of articles with comments: {comment_df['article_url'].nunique()}")
        print(f"Number of top-level comments: {len(comment_df[comment_df['parentID'].isna()])}")
        print(f"Number of replies: {len(comment_df[comment_df['parentID'].notna()])}")
        
        # Verify metadata consistency
        missing_metadata = comment_df[comment_df['web_url'].isna() | comment_df['title'].isna()]
        if not missing_metadata.empty:
            print(f"\nWarning: {len(missing_metadata)} comments are missing article metadata")
    else:
        print("No comments found!")

## Step 7: Main Function to Orchestrate the Process

In [30]:
def main(input_csv, output_csv, api_key):
    """
    Main function to orchestrate the comment collection process.
    
    Parameters:
    input_csv (str): Path to the input CSV file containing article URLs.
    output_csv (str): Path to the output CSV file to save comments.
    api_key (str): NYTimes API key.
    """
    api = NytApi(api_key)
    articles_df = read_urls_from_csv(input_csv)
    comments = fetch_comments_for_urls(api, articles_df)
    save_comments_to_csv(comments, output_csv)

## Step 8: Running the Script

In [32]:
# Example usage
if __name__ == "__main__":
    input_csv = "dronearticles.csv"
    output_csv = "drone_comments.csv"
    api_key = "owjL3q1wD8x7tvdC8UGXBBLKsvaA6eoV" # Replace with your actual API key
    
    main(input_csv, output_csv, api_key)

Columns in the CSV file: Index(['Unnamed: 0', 'abstract', 'web_url', 'snippet', 'lead_paragraph',
       'source', 'multimedia', 'headline', 'keywords', 'pub_date',
       'document_type', 'news_desk', 'section_name', 'byline',
       'type_of_material', '_id', 'word_count', 'uri', 'print_section',
       'print_page', 'subsection_name', 'title'],
      dtype='object')
Fetching comments for: Hochul Launches Investigation Into Drone Sightings in New York
Fetching comments for: Are Those Drones Over New Jersey? Despite Sightings, U.S. Is Skeptical.
Fetching comments for: Late Night Continues to Drone On About Sightings in the Sky
Fetching comments for: Weeks of Drone Sightings Leave New Jersey on Edge
Fetching comments for: No Evidence Drones in Northeast Are a Threat, Officials Say
Fetching comments for: U.S. Officials Say Sightings Are Mix of Planes, Stars and Legal Drones
Fetching comments for: How Drone Fever Spread Across New Jersey and Beyond
Fetching comments for: White House Says

  df[col] = pd.to_datetime(df[col], unit='s')
  df[col] = pd.to_datetime(df[col], unit='s')
  df[col] = pd.to_datetime(df[col], unit='s')


KeyError: 'article_url'