### Data Collection

In [4]:
import requests
import csv
import os

# The New York Times API key is read from the NYT_API_KEY environment variable
# so that the credential is never stored in (or shared with) the notebook.
# Any key that was previously committed in this cell should be treated as
# leaked and rotated in the NYT developer portal.
API_KEY = os.environ.get('NYT_API_KEY', '')
if not API_KEY:
    # Without a key every request is rejected, which would otherwise surface
    # only as an empty result set.
    print("NYT_API_KEY is not set; requests to the NYT API will be rejected.")

# Endpoint of the NYT Article Search API (v2).
BASE_URL = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'

def fetch_articles(query, start_year, end_year, max_pages=100, pause=12.0,
                   timeout=30, max_retries=3):
    """Download the Article Search results for ``query`` within a year range.

    Pages through ``BASE_URL`` with the module-level ``API_KEY`` until a page
    comes back without documents, ``max_pages`` pages have been requested, or
    the API answers with an HTTP error.

    Args:
        query: Search term sent as the ``q`` parameter.
        start_year: First year of the range; the search begins on January 1.
        end_year: Last year of the range; the search ends on December 31.
        max_pages: Maximum number of pages requested. The default of 100
            follows the pagination limit described in the NYT Article Search
            documentation (confirm against the current docs).
        pause: Seconds to wait between consecutive requests, and the base of
            the back-off after an HTTP 429. The default of 12 follows the
            NYT developer FAQ recommendation for its per-minute rate limit
            (confirm against the current FAQ).
        timeout: Seconds before a single HTTP request is abandoned.
        max_retries: How many times one page is retried after an HTTP 429.

    Returns:
        list: The raw ``docs`` dictionaries of every fetched page, in the
        order received. The list is partial if the loop stopped on an error.
    """
    import time  # local import: the function works without editing the import cell

    articles = []

    for page in range(max_pages):
        if page:
            # Space out consecutive requests to stay under the rate limit.
            time.sleep(pause)

        params = {
            'api-key': API_KEY,
            'q': query,
            'begin_date': f'{start_year}0101',
            'end_date': f'{end_year}1231',
            'page': page
        }

        # A 429 reply means "rate limited", not "no more results": wait and
        # retry the same page instead of silently truncating the result set.
        attempt = 0
        while True:
            response = requests.get(BASE_URL, params=params, timeout=timeout)
            if response.status_code != 429 or attempt >= max_retries:
                break
            attempt += 1
            time.sleep(pause * attempt)  # linear back-off

        if not response.ok:
            # Keep what was collected so far, but make the early stop visible.
            print(f"Stopping '{query}' at page {page}: HTTP {response.status_code}")
            break

        docs = (response.json().get('response') or {}).get('docs')
        if not docs:
            # Missing or empty document list: the result set is exhausted.
            break

        articles.extend(docs)

    return articles

def save_to_csv(query, articles, filename='newsdata.csv'):
    """Append one CSV row per article to ``filename``.

    The header row is written first when the file does not exist yet or is
    empty. Fields missing from an article are replaced by placeholder text.

    Args:
        query: Search term that produced ``articles``; stored in the
            ``Query`` column of every row.
        articles: Iterable of article dictionaries as returned by the API.
        filename: Path of the CSV file to append to.
    """
    # A file that exists but is empty still needs its header row.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        if needs_header:
            writer.writerow(['Query', 'Headline', 'Publication Date', 'Snippet', 'URL'])

        for article in articles:
            # `headline` may be present but null; guard before reading `main`.
            headline = (article.get('headline') or {}).get('main', 'No headline')
            pub_date = article.get('pub_date', 'No publication date')
            snippet = article.get('snippet', 'No snippet')
            web_url = article.get('web_url', 'No URL')

            writer.writerow([query, headline, pub_date, snippet, web_url])


def main(queries=None, start_year=2013, end_year=2024):
    """Fetch articles for each search term and append them to the CSV file.

    Args:
        queries: Search terms to collect. Defaults to the notebook's list of
            AI-related terms.
        start_year: First year of the search range.
        end_year: Last year of the search range.
    """
    if queries is None:
        queries = [
            "A.I.",
            "Artificial Intelligence",
            "Augmented reality",
            "Automation",
            "Chatbot",
            "Data Science",
            "Deepfake",
            "GPT",
            "M.L.",
            "Machine Learning",
            "Natural Language Processing",
            "NLP",
            "Virtual Reality"
        ]

    print(f"Fetching articles from {start_year} to {end_year}...")
    for query in queries:
        articles = fetch_articles(query, start_year, end_year)
        # Field extraction happens inside save_to_csv; nothing else is needed here.
        save_to_csv(query, articles)

# Run the collection. Rows are appended to newsdata.csv, so running this cell
# again adds the same articles a second time.
main()

Fetching articles from 2013 to 2024...


### Preliminary Data Cleaning

In [7]:
import pandas as pd

# Load the collected rows, parse the publication dates, drop exact duplicate
# rows and write the result, sorted by date, to data.csv.
df = pd.read_csv('newsdata.csv')

# Placeholder or malformed dates become NaT instead of raising.
df['Publication Date'] = pd.to_datetime(df['Publication Date'], errors='coerce')

# Only rows identical in every column are dropped (for example from running the
# collection cell twice). An article matched by several queries keeps one row
# per query because its Query value differs.
df = df.drop_duplicates().sort_values(by='Publication Date')
df.to_csv('data.csv', index=False)

print("CSV file sorted and duplicates removed. Saved as 'data.csv'")


CSV file sorted and duplicates removed. Saved as 'data.csv'
