In [1]:
from pynytimes import NYTAPI
import os
import datetime
import time
import json
import requests
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Download the 'vader_lexicon' NLTK resource first; the VADER analyzer built on the
# next line is expected to need it (nltk skips the download when already up to date).
nltk.download('vader_lexicon')
# Module-level analyzer instance shared by sentiment_analysis().
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joeyared/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
def sentiment_analysis(text):
    """Label a piece of text as Positive, Negative or Neutral.

    The label is derived from the ``compound`` value in the dictionary
    returned by the module-level analyzer ``sid.polarity_scores``.

    Args:
        text: The text to score. Any falsy value (``None``, ``''``) is
            treated as having no sentiment and is not passed to the analyzer.

    Returns:
        A ``(label, scores)`` tuple. ``label`` is ``'Positive'`` when the
        compound score is >= 0.05, ``'Negative'`` when it is <= -0.05 and
        ``'Neutral'`` otherwise. ``scores`` is the analyzer's dictionary, or
        ``{'compound': 0.0}`` for falsy input.
    """
    # Nothing to analyse: report a neutral placeholder without calling the analyzer.
    if not text:
        return 'Neutral', {'compound': 0.0}

    polarity = sid.polarity_scores(text)
    compound = polarity['compound']

    # Guard clauses for the two polar outcomes; everything in between is neutral.
    if compound >= 0.05:
        return 'Positive', polarity
    if compound <= -0.05:
        return 'Negative', polarity
    return 'Neutral', polarity

In [4]:
def search_nyt_articles(nyt, query, start_date, end_date, max_results=25):
    """Run an article search through ``nyt`` and return whatever it yields.

    Args:
        nyt: Client object exposing ``article_search(query=, results=,
            dates=, options=)``.
        query: Search phrase forwarded as ``query``.
        start_date: Lower bound of the search window (sent as ``dates["begin"]``).
        end_date: Upper bound of the search window (sent as ``dates["end"]``).
        max_results: Value forwarded as ``results``. Defaults to 25.

    Returns:
        The value returned by ``nyt.article_search``. If the call raises any
        ``Exception``, the error is printed and an empty list is returned.
    """
    date_window = {"begin": start_date, "end": end_date}
    search_options = {"sort": "newest"}  # most recent articles first

    try:
        found = nyt.article_search(
            query=query,
            results=max_results,
            dates=date_window,
            options=search_options,
        )
    except Exception as err:
        # Best-effort: report the failure and let the caller carry on with no results.
        print(f"Error fetching articles: {err}")
        return []
    return found

In [5]:
def get_article_data(articles):
    """Flatten article dictionaries into a DataFrame with sentiment columns.

    For every article the lead paragraph is scored with
    ``sentiment_analysis`` and one row is produced with the columns
    ``headline``, ``lead_paragraph``, ``abstract``, ``keywords`` (the
    keyword ``value`` fields joined with ``', '``), ``pub_date``,
    ``sentiment`` and ``sentiment_score`` (the compound score).

    Args:
        articles: Iterable of dict-like articles; missing keys fall back to
            empty values.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully processed
        article. An article that raises while being processed is reported
        with ``print`` and left out.
    """
    rows = []
    for item in articles:
        try:
            # Sentiment is computed on the lead paragraph, not the headline.
            lead = item.get('lead_paragraph', '')
            label, polarity = sentiment_analysis(lead)

            headline_text = item.get('headline', {}).get('main', '')
            summary = item.get('abstract', '')
            keyword_values = [entry.get('value', '') for entry in item.get('keywords', [])]
            keyword_text = ', '.join(keyword_values)
            published = item.get('pub_date', '')

            rows.append({
                'headline': headline_text,
                'lead_paragraph': lead,
                'abstract': summary,
                'keywords': keyword_text,
                'pub_date': published,
                'sentiment': label,
                'sentiment_score': polarity['compound'],
            })
        except Exception as err:
            # Skip malformed articles rather than aborting the whole batch.
            print(f"Error processing article: {err}")

    return pd.DataFrame(rows)

In [6]:
def main(api_key=None, output_path="nyt_articles_combined.csv"):
    """Search the NYT for each keyword, score sentiment, and write one combined CSV.

    For every keyword the articles published between 2023-01-01 and
    2024-11-01 are fetched with ``search_nyt_articles`` and converted to a
    DataFrame by ``get_article_data``. The per-keyword frames are then
    concatenated and saved. Articles matching several keywords appear once
    per keyword; no de-duplication is performed.

    Args:
        api_key: NYT API key. When omitted, the ``NYT_API_KEY`` environment
            variable is used, so the secret never lives in the notebook.
        output_path: Destination of the combined CSV file.

    Returns:
        The combined ``pandas.DataFrame`` (empty when no keyword produced
        any article, in which case no file is written).

    Raises:
        RuntimeError: If no API key is supplied and ``NYT_API_KEY`` is unset.
    """
    api_key = api_key or os.environ.get("NYT_API_KEY")
    if not api_key:
        raise RuntimeError(
            "NYT API key not found: set the NYT_API_KEY environment variable "
            "or pass api_key=... to main()."
        )

    nyt = NYTAPI(api_key, parse_dates=True)
    keywords = [
        'gaza',
        'palestine',
        'palestinian refugees',
        'israel palestine conflict',
        'israel',
        'hamas',
        'gaza protests',
    ]
    start_date = datetime.datetime(2023, 1, 1)
    end_date = datetime.datetime(2024, 11, 1)

    # One DataFrame per keyword. They are concatenated at the end; extending a
    # list with a DataFrame would only add its column labels, not its rows.
    frames = []
    for search_word in keywords:
        print(f"\nSearching for: {search_word}")
        articles = search_nyt_articles(nyt, search_word, start_date, end_date)
        if not articles:
            print(f"No articles found for '{search_word}'")
            continue

        article_data = get_article_data(articles)
        print(f"Found {len(article_data)} articles for '{search_word}'")
        if not article_data.empty:
            frames.append(article_data)

    if not frames:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(output_path, index=False)
    return combined

In [7]:
# Entry point: run the whole search / sentiment / export pipeline when this
# code is executed as the main module.
if __name__ == "__main__":
    main()


Searching for: gaza
Found 30 articles for 'gaza'

Searching for: palestine
Found 30 articles for 'palestine'

Searching for: palestinian refugees
Found 30 articles for 'palestinian refugees'

Searching for: israel palestine conflict
Found 30 articles for 'israel palestine conflict'

Searching for: israel
Found 30 articles for 'israel'

Searching for: hamas
Found 30 articles for 'hamas'

Searching for: gaza protests
Found 30 articles for 'gaza protests'


In [8]:
# url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?q=israel&api-key=<NYT_API_KEY>'
#     r = requests.get(url)
#     json_data = r.json()
#     json_string = json.dumps(json_data)

# results = json_data.get("results")
# results_string = json.dumps(results)
# df = pd.read_json(results_string)
# print(json_string)
# articles = nyt.article_search(
#     query = "Israeli–Palestinian Conflict",
#     results = 5,
#     # Search for articles published between 30 January 2018 and 28 August 2024
#     dates = {
#         "begin": datetime.datetime(2018, 1, 30),
#         "end": datetime.datetime(2024, 8, 28)
#     },
#     options = {
#         "sort": "oldest", # Sort by oldest options
#         # Return articles from the following four sources
#         "sources": [
#             "New York Times",
#             "AP",
#             "Reuters",
#             "International Herald Tribune"
#         ],
#         # Only get information from the Politics desk
#         "news_desk": [
#             "Politics"
#         ],
#         # Only return ....
#         "type_of_material": [
#             "News Analysis"
#         ],
#         # The article text should contain the word..
#         "body": [
#             "death"
#         ],
#         # Headline should contain...
#         "headline": [
#             "conflict",
#             "war",
#             "toll"
#         ]
#     }
# )

# articles = nyt.article_search(query="Israeli–Palestinian Conflict", results=1)
# print(articles[1])