In [None]:
import requests
from datetime import datetime
import time


api_key = "nSJlJTCMVqCArRKWgg3W4tK9LenCmsMd"
base_url = "https://api.nytimes.com/svc/archive/v1/{year}/{month}.json"

start_year = 2013
start_month = 1
end_year = datetime.now().year
end_month = datetime.now().month
search_keywords = ['Apple', 'iPhone', 'MacBook', 'iPad', 'Apple Watch', 'iOS', 'MacOS']

all_articles = []

for year in range(start_year, end_year + 1):
    for month in range(1, 13):
        if year == start_year and month < start_month:
            continue
        if year == end_year and month > end_month:
            break

        print(f"Fetching articles for {year}-{month}...") 
        url = base_url.format(year=year, month=month)
        params = {'api-key': api_key}
        response = requests.get(url, params=params)
        
        if response.status_code == 200:
            data = response.json()
            articles = data['response']['docs']
            
            for article in articles:
                for keyword in search_keywords:
                    if keyword.lower() in article['headline']['main'].lower():
                        all_articles.append(article)
                        break  
            
            print(f"Found {len(articles)} articles in {year}-{month}")
        elif response.status_code == 429:
            print(f"Rate limit hit: Error 429 for {year}-{month}. Waiting for 60 seconds...")
            time.sleep(60) 
            continue
        else:
            print(f"Error fetching articles for {year}-{month}: {response.status_code}")
            break

        time.sleep(6)

print(f"Total articles related to Apple products: {len(all_articles)}")

In [None]:
import pandas as pd

pd.DataFrame(all_articles).to_csv('Apple_News_Articles_2013_1_2024_10.csv')

In [None]:
# !pip install pandas

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
news_data = pd.read_csv('Apple_News_Articles_2013_1_2024_10.csv')

In [3]:
news_data.shape

(2712, 21)

In [4]:
news_data.columns

Index(['Unnamed: 0', 'abstract', 'web_url', 'snippet', 'lead_paragraph',
       'print_section', 'print_page', 'source', 'multimedia', 'headline',
       'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name',
       'byline', 'type_of_material', '_id', 'word_count', 'uri',
       'subsection_name'],
      dtype='object')

In [5]:
news_data.shape

(2712, 21)

In [6]:
filtered_columns_data = news_data[['abstract','snippet','lead_paragraph','headline','pub_date']]
import ast
filtered_columns_data = pd.concat([filtered_columns_data.drop('headline',axis = 1),pd.json_normalize([ast.literal_eval(item) for item in list(filtered_columns_data['headline'])])[['main','kicker','print_headline']]],axis = 1)
print('Checking for null values')
display({i:filtered_columns_data[i].isna().sum() for i in filtered_columns_data})
print('Removing NULL values')
display([filtered_columns_data[i].fillna('',inplace=True) for i in filtered_columns_data])
print('Checking for null values')
display({i:filtered_columns_data[i].isna().sum() for i in filtered_columns_data})

Checking for null values


{'abstract': 3,
 'snippet': 20,
 'lead_paragraph': 14,
 'pub_date': 0,
 'main': 0,
 'kicker': 1569,
 'print_headline': 0}

Removing NULL values


[None, None, None, None, None, None, None]

Checking for null values


{'abstract': 0,
 'snippet': 0,
 'lead_paragraph': 0,
 'pub_date': 0,
 'main': 0,
 'kicker': 0,
 'print_headline': 0}

In [7]:
text = []
for j in range(filtered_columns_data.shape[0]):
    str1=''
    for i in [0,1,2,4,5,6]:
        str1 += filtered_columns_data.iloc[j,i]
    text.append(str1)
# filtered_columns_data.iloc[:,5:]
filtered_columns_data['text_in_article'] = text
display(filtered_columns_data.columns)
display(filtered_columns_data.shape)


Index(['abstract', 'snippet', 'lead_paragraph', 'pub_date', 'main', 'kicker',
       'print_headline', 'text_in_article'],
      dtype='object')

(2712, 8)

In [None]:
# !pip install transformers
# !pip install tf-keras
# !pip install tqdm





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\janan\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip





In [None]:
import pandas as pd
from transformers import pipeline
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Setup tqdm to show progress in apply functions
tqdm.pandas()

# Download necessary NLTK data
# nltk.download('vader_lexicon')

# Initialize Sentiment Analyzer and Summarizer
sid = SentimentIntensityAnalyzer()
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Placeholder for your filtered data
df = filtered_columns_data

# Define functions for summarization and sentiment analysis
def get_dynamic_lengths(text):
    input_length = len(text.split())  
    dynamic_max_length = min(100, int(input_length / 2))
    dynamic_min_length = min(dynamic_max_length - 10, 25) 
    return dynamic_max_length, dynamic_min_length

def summarize_text(text):
    if len(text) > 50:
        dynamic_max_length, dynamic_min_length = get_dynamic_lengths(text)
        summary = summarizer(text, max_length=dynamic_max_length, min_length=dynamic_min_length, do_sample=False)
        return summary[0]['summary_text']
    return text

def get_sentiment(text):
    return sid.polarity_scores(text)

# Step 1: Summarize Text with Progress Bar and Parallel Processing
with ThreadPoolExecutor() as executor:
    df['summary'] = list(tqdm(executor.map(summarize_text, df['text_in_article']), total=len(df), desc="Summarizing"))

# Step 2: Perform Sentiment Analysis with Progress Bar
df['sentiment'] = df['summary'].progress_apply(get_sentiment)
df_sentiment = df['sentiment'].apply(pd.Series)
df = pd.concat([df, df_sentiment], axis=1)

# sentiment_by_date = df.groupby('pub_date').mean()
# print(sentiment_by_date)


In [15]:
# import pandas as pd
# from transformers import pipeline
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# import nltk

# # nltk.download('vader_lexicon')
# sid = SentimentIntensityAnalyzer()
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn") # ,framework='pt'

# df = filtered_columns_data

# def get_dynamic_lengths(text):
#     input_length = len(text.split())  
#     dynamic_max_length = min(100, int(input_length / 2))
#     dynamic_min_length = min(dynamic_max_length - 10, 25) 
#     return dynamic_max_length, dynamic_min_length

# def summarize_text(text):
#     if len(text) > 50:
#         dynamic_max_length, dynamic_min_length = get_dynamic_lengths(text)

#         summary = summarizer(text, max_length=dynamic_max_length, min_length=dynamic_min_length, do_sample=False)
#         return summary[0]['summary_text']
#     return text  

# def get_sentiment(text):
#     return sid.polarity_scores(text)

# df['summary'] = df['text_in_article'].apply(summarize_text)
# df['sentiment'] = df['summary'].apply(get_sentiment)
# df_sentiment = df['sentiment'].apply(pd.Series)

# df = pd.concat([df, df_sentiment], axis=1)

# sentiment_by_date = df.groupby('pub_date').mean()
# print(sentiment_by_date)


In [14]:
df.to_csv('articles_sentiment.csv')

In [13]:
df.head()

Unnamed: 0,abstract,snippet,lead_paragraph,pub_date,main,kicker,print_headline,text_in_article,summary,sentiment,neg,neu,pos,compound
0,"For all of its comments and reviews, does Trip...","For all of its comments and reviews, does Trip...","In October, on assignment to find the cheapest...",2013-01-01T16:37:28+0000,Using TripAdvisor? Some Advice,Frugal Traveler,Using TripAdvisor? Some Advice,"For all of its comments and reviews, does Trip...",Using TripAdvisor? Some Advice. The Frugal Tra...,"{'neg': 0.0, 'neu': 0.779, 'pos': 0.221, 'comp...",0.0,0.779,0.221,0.8374
1,"Tips for using TripAdvisor, a bigger tax bite ...","Tips for using TripAdvisor, a bigger tax bite ...",,2013-01-02T13:48:39+0000,Wednesday Reading: Tips for Using TripAdvisor,Bucks,,"Tips for using TripAdvisor, a bigger tax bite ...","Tips for using TripAdvisor, a bigger tax bite ...","{'neg': 0.0, 'neu': 0.85, 'pos': 0.15, 'compou...",0.0,0.85,0.15,0.5719
2,"On the morning of New Year’s Day, many iPhone ...","On the morning of New Year’s Day, many iPhone ...",2:01 p.m. | Updated Adding response from Apple.,2013-01-02T18:55:25+0000,"The iPhone Goofs Up on Telling Time, Again",Bits,,"On the morning of New Year’s Day, many iPhone ...","On the morning of New Year’s Day, many iPhone ...","{'neg': 0.052, 'neu': 0.783, 'pos': 0.165, 'co...",0.052,0.783,0.165,0.5358
3,"Cygnett, an Australian maker of gadget accesso...","Cygnett, an Australian maker of gadget accesso...","Cygnett, an Australian maker of gadget accesso...",2013-01-03T12:19:33+0000,Protect Your iPhone With a Vision From Australia,Gadgetwise,Protect Your Phone With a Vision From Australia,"Cygnett, an Australian maker of gadget accesso...","Cygnett, an Australian maker of gadget accesso...","{'neg': 0.0, 'neu': 0.806, 'pos': 0.194, 'comp...",0.0,0.806,0.194,0.7184
4,MediaTek of Taiwan not only provides manufactu...,MediaTek of Taiwan not only provides manufactu...,"TAIPEI — In the China smartphone market, Apple...",2013-01-07T03:04:20+0000,Providing a Template to Challenge Apple,,MediaTek Chips Change China’s Smartphone Market,MediaTek of Taiwan not only provides manufactu...,MediaTek of Taiwan provides manufacturers with...,"{'neg': 0.053, 'neu': 0.947, 'pos': 0.0, 'comp...",0.053,0.947,0.0,-0.128
