# API Data Extractor
Last code chunk is what worked. I'll clean up this notebook this week and refine the data extraction process.

In [None]:
import requests
import json
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import time
import certifi
import random
from datetime import datetime, timedelta



# Pull articles with sentiment scores

In [56]:
import certifi
import json
from urllib.request import urlopen
import time

def get_jsonparsed_data(url):
    """
    Fetch JSON data from the given URL.

    Parameters:
    url (str): The URL to fetch data from.

    Returns:
    dict: The parsed JSON data.
    """
    response = urlopen(url, cafile=certifi.where())
    data = response.read().decode("utf-8")
    return json.loads(data)

def fetch_stock_news_sentiments(api_key, page=0):
    """
    Fetch stock news sentiments from Financial Modeling Prep API.

    Parameters:
    api_key (str): Your API key for Financial Modeling Prep.
    page (int): Page number for pagination. Default is 0.

    Returns:
    list: The parsed JSON data containing news sentiments.
    """
    url = f"https://financialmodelingprep.com/api/v4/stock-news-sentiments-rss-feed?page={page}&apikey={api_key}"
    return get_jsonparsed_data(url)

def save_news_sentiments_to_file(all_news_sentiments, filename):
    """
    Save all news sentiments to a single JSON file.

    Parameters:
    all_news_sentiments (list): The list of all news sentiments to save.
    filename (str): The name of the file to save the news sentiments.
    """
    with open(filename, 'w') as file:
        json.dump(all_news_sentiments, file, indent=4)
    print(f"All news sentiments saved to {filename}")

def fetch_and_save_all_news(api_key, max_pages=5):
    """
    Fetch and save news articles into a single JSON file.

    Parameters:
    api_key (str): Your API key for Financial Modeling Prep.
    max_pages (int): Maximum number of pages to fetch. Default is 5.
    """
    all_news_sentiments = []
    for page in range(max_pages):
        print(f"Fetching page {page}...")
        news_sentiments = fetch_stock_news_sentiments(api_key, page)
        if not news_sentiments:
            break  # Exit the loop if no more news is returned
        all_news_sentiments.extend(news_sentiments)
        time.sleep(.1)  # Sleep to avoid hitting API rate limits

    save_news_sentiments_to_file(all_news_sentiments, 'all_news_sentiments.json')



In [57]:

# Example usage
api_key = '28a3eef526c43ab5888ab02222aada18'

# Specify the number of pages you want to fetch
max_pages = 100

fetch_and_save_all_news(api_key, max_pages)

Fetching page 0...


  response = urlopen(url, cafile=certifi.where())


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Fetching page 21...
Fetching page 22...
Fetching page 23...
Fetching page 24...
Fetching page 25...
Fetching page 26...
Fetching page 27...
Fetching page 28...
Fetching page 29...
Fetching page 30...
Fetching page 31...
Fetching page 32...
Fetching page 33...
Fetching page 34...
Fetching page 35...
Fetching page 36...
Fetching page 37...
Fetching page 38...
Fetching page 39...
Fetching page 40...
Fetching page 41...
Fetching page 42...
Fetching page 43...
Fetching page 44...
Fetching page 45...
Fetching page 46...
Fetching page 47...
Fetching page 48...
Fetching page 49...
Fetching page 50...
Fetching 

## Convert to pandas dataframe

In [58]:
import pandas as pd
import json

def json_to_dataframe(json_file):
    """
    Convert JSON file to a pandas DataFrame.

    Parameters:
    json_file (str): The path to the JSON file.

    Returns:
    DataFrame: A pandas DataFrame containing the JSON data.
    """
    with open(json_file, 'r') as file:
        data = json.load(file)
    
    # Create a DataFrame from the list of articles
    df = pd.DataFrame(data)
    return df

# Example usage
json_file = 'all_news_sentiments.json'
df = json_to_dataframe(json_file)
print(df.head())  # Print the first few rows of the DataFrame

  symbol             publishedDate  \
0     BA  2024-07-01T12:16:08.000Z   
1   APOG  2024-07-01T12:15:29.000Z   
2    QSR  2024-07-01T12:15:00.000Z   
3    TDC  2024-07-01T12:15:00.000Z   
4   VKTX  2024-07-01T12:15:00.000Z   

                                               title  \
0  Boeing Agrees To Reacquire Spirit AeroSytems f...   
1                 New Strong Buy Stocks for July 1st   
2  Tims China Announces Significant Financing fro...   
3  TDC LAWSUIT NOTIFICATION: BFA Law Notifies Ter...   
4  Viking Therapeutics Stock Is Up by 154% Since ...   

                                               image               site  \
0  https://cdn.snapi.dev/images/v1/g/j/ba16-24797...   investopedia.com   
1  https://cdn.snapi.dev/images/v1/1/d/building-m...          zacks.com   
2  https://cdn.snapi.dev/images/v1/m/g/press19-25...  globenewswire.com   
3  https://cdn.snapi.dev/images/v1/y/6/press11-25...  globenewswire.com   
4  https://cdn.snapi.dev/images/v1/j/i/biotech9-2...       

### Write to CSV and Parquet

In [59]:
df.to_csv('all_news_sentiments.csv', index=False)

In [60]:
df.to_parquet('all_news_sentiments.parquet', index=False)

# Pull Articles with sentiment scores, adjust date range

In [55]:
import certifi
import json
from urllib.request import urlopen
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

def get_jsonparsed_data(url):
    """
    Fetch JSON data from the given URL.

    Parameters:
    url (str): The URL to fetch data from.

    Returns:
    dict: The parsed JSON data.
    """
    response = urlopen(url, cafile=certifi.where())
    data = response.read().decode("utf-8")
    return json.loads(data)

def fetch_stock_news_sentiments(api_key, ticker, date_from, date_to, limit=50):
    """
    Fetch stock news sentiments from Financial Modeling Prep API.

    Parameters:
    api_key (str): Your API key for Financial Modeling Prep.
    ticker (str): The stock ticker symbol (e.g., 'AAPL' for Apple).
    date_from (str): Start date for fetching articles (YYYY-MM-DD).
    date_to (str): End date for fetching articles (YYYY-MM-DD).
    limit (int): Number of articles to fetch per request. Default is 50.

    Returns:
    list: The parsed JSON data containing news sentiments.
    """
    url = f"https://financialmodelingprep.com/api/v4/stock-news-sentiments-rss-feed?symbol={ticker}&from={date_from}&to={date_to}&apikey={api_key}&limit={limit}"
    return get_jsonparsed_data(url)

def save_news_sentiments_to_file(all_news_sentiments, filename):
    """
    Save all news sentiments to a single JSON file.

    Parameters:
    all_news_sentiments (list): The list of all news sentiments to save.
    filename (str): The name of the file to save the news sentiments.
    """
    with open(filename, 'w') as file:
        json.dump(all_news_sentiments, file, indent=4)
    print(f"All news sentiments saved to {filename}")

def fetch_and_save_all_news(api_key, tickers, start_date, end_date, interval_days=7, limit=10):
    """
    Fetch and save news articles for a list of tickers into a single JSON file over a specified timeframe.

    Parameters:
    api_key (str): Your API key for Financial Modeling Prep.
    tickers (list): A list of ticker symbols.
    start_date (str): Start date for fetching articles (YYYY-MM-DD).
    end_date (str): End date for fetching articles (YYYY-MM-DD).
    interval_days (int): Number of days in each interval for fetching articles. Default is 7.
    limit (int): Number of articles to fetch per request. Default is 50.
    """
    all_news_sentiments = []
    current_date = datetime.strptime(start_date, '%Y-%m-%d')
    end_date = datetime.strptime(end_date, '%Y-%m-%d')

    while current_date < end_date:
        next_date = current_date + timedelta(days=interval_days)
        for ticker in tickers:
            print(f"Fetching news for {ticker} from {current_date.strftime('%Y-%m-%d')} to {next_date.strftime('%Y-%m-%d')}...")
            news_sentiments = fetch_stock_news_sentiments(api_key, ticker, current_date.strftime('%Y-%m-%d'), next_date.strftime('%Y-%m-%d'), limit)
            if news_sentiments:
                all_news_sentiments.extend(news_sentiments)
            time.sleep(1)  # Sleep to avoid hitting API rate limits
        current_date = next_date

    save_news_sentiments_to_file(all_news_sentiments, f'{savename}.json')

def fetch_sp500_tickers():
    """
    Fetch the list of S&P 500 ticker symbols from Wikipedia.

    Returns:
    list: A list of S&P 500 ticker symbols.
    """
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch data from Wikipedia: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': 'constituents'})
    tickers = []

    for row in table.find_all('tr')[1:]:
        ticker = row.find_all('td')[0].text.strip()
        tickers.append(ticker)

    return tickers



In [None]:
# Example usage
api_key = '28a3eef526c43ab5888ab02222aada18'
sp500_tickers = fetch_sp500_tickers()
sp500_tickers = sp500_tickers[:1]  # Fetch news for the first n tickers
savename = 'test'

# Specify the date range
start_date = '2023-01-01'
end_date = '2023-02-28'

if sp500_tickers:
    fetch_and_save_all_news(api_key, ['AAPL'], start_date, end_date)

## Convert to pandas dataframe

In [None]:
import pandas as pd
import json

def json_to_dataframe(json_file):
    """
    Convert JSON file to a pandas DataFrame.

    Parameters:
    json_file (str): The path to the JSON file.

    Returns:
    DataFrame: A pandas DataFrame containing the JSON data.
    """
    with open(json_file, 'r') as file:
        data = json.load(file)
    
    # Create a DataFrame from the list of articles
    df = pd.DataFrame(data)
    return df

# Example usage
json_file = 'test.json'
df = json_to_dataframe(json_file)
print(df.head())  # Print the first few rows of the DataFrame

### Write to CSV and Parquet

In [None]:
df.to_csv(f'{savename}.csv', index=False)

In [None]:
df.to_parquet('all_news_sentiments.parquet', index=False)

# General Stock news Extractor
For extracting general stock news. 

In [None]:

def fetch_stock_news(api_key, ticker, date_from, date_to, limit=50):
    """
    Fetch stock news from Financial Modeling Prep API.

    Parameters:
    api_key (str): Your API key for Financial Modeling Prep.
    ticker (str): The stock ticker symbol (e.g., 'AAPL' for Apple).
    date_from (str): Start date for fetching articles (YYYY-MM-DD).
    date_to (str): End date for fetching articles (YYYY-MM-DD).
    limit (int): Number of articles to fetch per request. Default is 50.

    Returns:
    list: The parsed JSON data containing news articles.
    """
    url = f"https://financialmodelingprep.com/api/v3/stock_news?tickers={ticker}&from={date_from}&to={date_to}&limit={limit}&apikey={api_key}"
    return get_jsonparsed_data(url)

def save_news_to_file(all_news, filename):
    """
    Save all news articles to a single JSON file.

    Parameters:
    all_news (list): The list of all news articles to save.
    filename (str): The name of the file to save the news articles.
    """
    with open(filename, 'w') as file:
        json.dump(all_news, file, indent=4)
    print(f"All news articles saved to {filename}")

def fetch_and_save_all_news(api_key, tickers, start_date, end_date, interval_days=7, max_articles_per_interval=50):
    """
    Fetch and save news articles for a list of tickers into a single JSON file over a specified timeframe.

    Parameters:
    api_key (str): Your API key for Financial Modeling Prep.
    tickers (list): A list of ticker symbols.
    start_date (str): Start date for fetching articles (YYYY-MM-DD).
    end_date (str): End date for fetching articles (YYYY-MM-DD).
    interval_days (int): Number of days in each interval for fetching articles. Default is 7.
    max_articles_per_interval (int): Maximum number of articles to fetch per interval. Default is 50.
    """
    all_news = []
    current_date = datetime.strptime(start_date, '%Y-%m-%d')
    end_date = datetime.strptime(end_date, '%Y-%m-%d')

    while current_date < end_date:
        next_date = current_date + timedelta(days=interval_days)
        for ticker in tickers:
            print(f"Fetching news for {ticker} from {current_date.strftime('%Y-%m-%d')} to {next_date.strftime('%Y-%m-%d')}...")
            random_start_date = current_date + timedelta(days=random.randint(0, interval_days - 1))
            random_end_date = random_start_date + timedelta(days=random.randint(1, interval_days - (random_start_date - current_date).days))
            limit = random.randint(1, max_articles_per_interval)
            news_articles = fetch_stock_news(api_key, ticker, random_start_date.strftime('%Y-%m-%d'), random_end_date.strftime('%Y-%m-%d'), limit)
            if news_articles:
                # Filter articles to include only those for the specified ticker
                filtered_news_articles = [article for article in news_articles if ticker in article.get('symbol', '')]
                all_news.extend(filtered_news_articles)
            time.sleep(1)  # Sleep to avoid hitting API rate limits
        current_date = next_date

    save_news_to_file(all_news, 'all_stock_news.json')




In [None]:
# Example usage
api_key = '28a3eef526c43ab5888ab02222aada18'
sp500_tickers = fetch_sp500_tickers()

# Limit the tickers for testing
limited_tickers = sp500_tickers #[:1]

# Specify the date range
start_date = '2023-10-01'
end_date = '2023-12-31'

if limited_tickers:
    fetch_and_save_all_news(api_key, limited_tickers, start_date, end_date)

## Convert to pandas dataframe

In [None]:
import pandas as pd
import json

def json_to_dataframe(json_file):
    """
    Convert JSON file to a pandas DataFrame.

    Parameters:
    json_file (str): The path to the JSON file.

    Returns:
    DataFrame: A pandas DataFrame containing the JSON data.
    """
    with open(json_file, 'r') as file:
        data = json.load(file)
    
    # Create a DataFrame from the list of articles
    df = pd.DataFrame(data)
    return df

# Example usage
json_file = 'all_stock_news.json'
df = json_to_dataframe(json_file)
print(df.head())  # Print the first few rows of the DataFrame

### Write to CSV and Parquet

In [None]:
df.to_csv('all_news.csv', index=False)

In [None]:
df.to_parquet('all_news.parquet', index=False)

# Pull Stock Price data

In [None]:

def fetch_historical_stock_price(api_key, ticker, date_from, date_to):
    """
    Fetch historical stock price data from Financial Modeling Prep API.

    Parameters:
    api_key (str): Your API key for Financial Modeling Prep.
    ticker (str): The stock ticker symbol (e.g., 'AAPL' for Apple).
    date_from (str): Start date for fetching data (YYYY-MM-DD).
    date_to (str): End date for fetching data (YYYY-MM-DD).

    Returns:
    list: The parsed JSON data containing historical stock prices.
    """
    url = f"https://financialmodelingprep.com/api/v3/historical-price-full/{ticker}?from={date_from}&to={date_to}&apikey={api_key}"
    return get_jsonparsed_data(url)

def save_historical_prices_to_file(all_prices, filename):
    """
    Save all historical stock prices to a single JSON file.

    Parameters:
    all_prices (list): The list of all historical stock prices to save.
    filename (str): The name of the file to save the historical stock prices.
    """
    with open(filename, 'w') as file:
        json.dump(all_prices, file, indent=4)
    print(f"All historical stock prices saved to {filename}")

def fetch_and_save_all_historical_prices(api_key, tickers, start_date, end_date):
    """
    Fetch and save historical stock prices for a list of tickers into a single JSON file over a specified timeframe.

    Parameters:
    api_key (str): Your API key for Financial Modeling Prep.
    tickers (list): A list of ticker symbols.
    start_date (str): Start date for fetching data (YYYY-MM-DD).
    end_date (str): End date for fetching data (YYYY-MM-DD).
    """
    all_historical_prices = []
    for ticker in tickers:
        print(f"Fetching historical prices for {ticker} from {start_date} to {end_date}...")
        historical_prices = fetch_historical_stock_price(api_key, ticker, start_date, end_date)
        if historical_prices:
            all_historical_prices.append({ticker: historical_prices})
        time.sleep(1)  # Sleep to avoid hitting API rate limits

    save_historical_prices_to_file(all_historical_prices, 'all_historical_prices.json')

def fetch_sp500_tickers():
    """
    Fetch the list of S&P 500 ticker symbols from Wikipedia.

    Returns:
    list: A list of S&P 500 ticker symbols.
    """
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch data from Wikipedia: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': 'constituents'})
    tickers = []

    for row in table.find_all('tr')[1:]:
        ticker = row.find_all('td')[0].text.strip()
        tickers.append(ticker)

    return tickers



In [None]:
# Example usage
api_key = '28a3eef526c43ab5888ab02222aada18'
sp500_tickers = fetch_sp500_tickers()

# Limit to the first ticker (MMM)
limited_tickers = sp500_tickers #[:1]

# Specify the date range
start_date = '2023-01-01'
end_date = '2023-12-31'

if limited_tickers:
    fetch_and_save_all_historical_prices(api_key, limited_tickers, start_date, end_date)

## Convert prices json data to pandas dataframe, csv, parquet

In [None]:
import json
import pandas as pd

def json_to_dataframe(json_file):
    """
    Convert JSON file to a pandas DataFrame in tidy data format.

    Parameters:
    json_file (str): The path to the JSON file.

    Returns:
    DataFrame: A pandas DataFrame in tidy data format.
    """
    with open(json_file, 'r') as file:
        data = json.load(file)
    
    records = []
    for ticker_data in data:
        for ticker, content in ticker_data.items():
            historical_data = content["historical"]
            for price in historical_data:
                price['ticker'] = ticker
                records.append(price)
    
    df = pd.DataFrame(records)
    return df

def save_dataframe_to_csv(df, csv_file):
    """
    Save DataFrame to a CSV file.

    Parameters:
    df (DataFrame): The pandas DataFrame to save.
    csv_file (str): The name of the CSV file.
    """
    df.to_csv(csv_file, index=False)
    print(f"Data saved to {csv_file}")

def save_dataframe_to_parquet(df, parquet_file):
    """
    Save DataFrame to a Parquet file.

    Parameters:
    df (DataFrame): The pandas DataFrame to save.
    parquet_file (str): The name of the Parquet file.
    """
    df.to_parquet(parquet_file, index=False)
    print(f"Data saved to {parquet_file}")

# Example usage
json_file = 'all_historical_prices.json'
csv_file = 'all_historical_prices.csv'
parquet_file = 'all_historical_prices.parquet'

# Convert JSON to DataFrame
df = json_to_dataframe(json_file)

# Check if all columns are retained
print("Columns in DataFrame:", df.columns)

# Save DataFrame to CSV
save_dataframe_to_csv(df, csv_file)

# Save DataFrame to Parquet
save_dataframe_to_parquet(df, parquet_file)