In [1]:
!pip install quandl



In [2]:
# Imports & Settings

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Standard library imports
from pathlib import Path
import requests
from io import BytesIO
from zipfile import ZipFile, BadZipFile

# Third-party library imports
import numpy as np
import pandas as pd
import pandas_datareader.data as web
from sklearn.datasets import fetch_openml

# Configure pandas to display full DataFrames
pd.set_option('display.expand_frame_repr', False)

# Set random seed for reproducibility
np.random.seed(42)

## Set Data Store Path

This notebook uses a central location to store all downloaded and processed data. The `DATA_STORE` variable defines the path to an HDF5 file that will contain multiple datasets.

**Note:** If you prefer to store the data in a different location, modify the `DATA_STORE` path below and update all references to this path in other notebooks.

In [3]:
# Define the path for the HDF5 file that will store all datasets
DATA_STORE = Path('assets.h5')

# Create the directory if it doesn't exist
DATA_STORE.parent.mkdir(parents=True, exist_ok=True)

print(f"Data will be stored in: {DATA_STORE.absolute()}")

Data will be stored in: /content/assets.h5


## Quandl Wiki Prices Dataset

### Background
- Quandl, a financial data provider, was [acquired by NASDAQ](https://www.nasdaq.com/about/press-center/nasdaq-acquires-quandl-advance-use-alternative-data) in late 2018.
- In 2021, NASDAQ [integrated Quandl's data platform](https://data.nasdaq.com/).

### Dataset Description
- Contains stock prices, dividends, and splits for 3000 US publicly-traded companies.
- Available at: [NASDAQ Data Link](https://data.nasdaq.com/databases/WIKIP/documentation)
- Historical data useful for demonstrating machine learning applications in finance.

### Important Notes
1. This dataset is no longer actively updated (last update: April 11, 2018).
2. Use this data for learning and initial testing only.
3. For production or current analysis, use up-to-date, professional-grade data sources.

### Data Acquisition Steps
1. Create a free [NASDAQ account](https://data.nasdaq.com/sign-up)
2. [Download](https://data.nasdaq.com/tables/WIKIP/WIKI-PRICES/export) the entire WIKI/PRICES dataset
3. Extract the .zip file
4. Move the extracted file to this notebook's directory and rename it to `wiki_prices.csv`
5. Run the code below to process and store the data in HDF5 format

In [4]:
import pandas as pd
from pathlib import Path
from google.colab import drive
import zipfile
import shutil
import os

# Mount Google Drive
drive.mount('/content/drive')

def get_wiki_prices_file():
    """
    Attempt to get the WIKI Prices file from Google Drive, unzip it, and rename it.
    If unsuccessful, prompt the user to download manually.

    Returns:
    Path: Path to the CSV file
    """
    drive_file_path = Path('/content/drive/MyDrive/ML4T/WIKI_PRICES_212b326a081eacca455e13140d7bb9db.zip')

    if drive_file_path.exists():
        print("Found WIKI Prices zip file in Google Drive. Extracting...")
        with zipfile.ZipFile(drive_file_path, 'r') as zip_ref:
            zip_ref.extractall('.')

        # Find the extracted CSV file
        csv_file = next(Path('.').glob('*.csv'), None)
        if csv_file:
            # Rename the file
            new_file_name = 'wiki_prices.csv'
            csv_file.rename(new_file_name)
            print(f"File extracted and renamed to {new_file_name}")
            return Path(new_file_name)
        else:
            print("Couldn't find the CSV file in the extracted contents.")

    print("WIKI Prices file not found in Google Drive or extraction failed.")
    print("Please download the file manually using the following steps:")
    print("1. Go to https://data.nasdaq.com/tables/WIKIP/WIKI-PRICES/export")
    print("2. Download the entire WIKI/PRICES data")
    print("3. Extract the .zip file")
    print("4. Move the extracted CSV file to this notebook's directory and rename it to 'wiki_prices.csv'")

    return None

def process_quandl_wiki_prices(file_path='wiki_prices.csv'):
    """
    Process the Quandl WIKI Prices dataset from a CSV file and store it in HDF5 format.

    Args:
    file_path (str): Path to the CSV file containing the WIKI Prices data.

    Returns:
    pd.DataFrame: Processed DataFrame of WIKI Prices data.
    """
    csv_path = Path(file_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"The file {file_path} does not exist. Please ensure you've downloaded the CSV file and placed it in the correct directory.")

    print(f"Processing {file_path}...")

    # Read CSV file, parse dates, and set multi-index
    df = pd.read_csv(csv_path,
                     parse_dates=['date'],
                     index_col=['date', 'ticker'],
                     infer_datetime_format=True)

    # Sort the index for efficient data access
    df = df.sort_index()

    # Print DataFrame info for verification
    print("\nWIKI Prices Dataset Info:")
    print(df.info())

    # Print null value counts separately
    print("\nNull value counts:")
    print(df.isnull().sum())

    # Store the data in HDF5 format
    with pd.HDFStore(DATA_STORE) as store:
        store.put('quandl/wiki/prices', df)
        print(f"\nData stored in {DATA_STORE} under 'quandl/wiki/prices'")

    return df

# Main execution
try:
    csv_file_path = get_wiki_prices_file()
    if csv_file_path:
        wiki_prices_df = process_quandl_wiki_prices(csv_file_path)
        print("\nQuandl WIKI Prices data processed successfully.")
    else:
        print("\nPlease run this cell again after manually downloading and placing the CSV file.")
except Exception as e:
    print(f"An error occurred: {str(e)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found WIKI Prices zip file in Google Drive. Extracting...
File extracted and renamed to wiki_prices.csv
Processing wiki_prices.csv...
An error occurred: Missing column provided to 'parse_dates': 'date'


In [5]:
# Wiki Prices Metadata

def process_wiki_stocks_metadata(file_path='wiki_stocks.csv'):
    """
    Process the Wiki Stocks metadata and store it in HDF5 format.

    Args:
    file_path (str): Path to the CSV file containing the Wiki Stocks metadata.

    Returns:
    pd.DataFrame: Processed DataFrame of Wiki Stocks metadata.
    """
    csv_path = Path(file_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"The file {file_path} does not exist. Please ensure you've downloaded the CSV file and placed it in the correct directory.")

    print(f"Processing {file_path}...")

    # Read CSV file
    df = pd.read_csv(csv_path)

    # Print DataFrame info for verification
    print("\nWiki Stocks Metadata Info:")
    print(df.info())

    # Print sample data
    print("\nSample data:")
    print(df.head())

    # Store the data in HDF5 format
    with pd.HDFStore(DATA_STORE) as store:
        store.put('quandl/wiki/stocks', df.set_index('code'))
        print(f"\nData stored in {DATA_STORE} under 'quandl/wiki/stocks'")

    return df


In [6]:
# S&P 500 Prices

import pandas_datareader.data as web
from datetime import datetime

def fetch_sp500_prices(start_date='2010-01-01', end_date=None):
    """
    Fetch S&P 500 prices from FRED and store them in HDF5 format.

    Args:
    start_date (str): Start date for data fetch (default: '2010-01-01')
    end_date (str): End date for data fetch (default: current date)

    Returns:
    pd.DataFrame: DataFrame of S&P 500 prices
    """
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')

    print(f"Fetching S&P 500 prices from {start_date} to {end_date}...")

    try:
        df = web.DataReader(name='SP500', data_source='fred', start=start_date, end=end_date)
        df = df.rename(columns={'SP500': 'close'})

        print("\nS&P 500 Prices Info:")
        print(df.info())

        print("\nSample data:")
        print(df.head())

        with pd.HDFStore(DATA_STORE) as store:
            store.put('sp500/fred', df)
            print(f"\nData stored in {DATA_STORE} under 'sp500/fred'")

        return df

    except Exception as e:
        print(f"An error occurred while fetching S&P 500 prices: {str(e)}")
        return None


In [7]:
from google.colab import drive
import shutil

# Mount Google Drive if not already mounted
try:
    drive.mount('/content/drive')
except:
    print("Google Drive is already mounted.")

def check_and_move_wiki_stocks():
    """
    Check for wiki_stocks.csv in Google Drive and move it to the current directory if found.
    """
    source_path = Path('/content/drive/MyDrive/ML4T/wiki_stocks.csv')
    destination_path = Path('wiki_stocks.csv')

    if source_path.exists():
        print(f"Found wiki_stocks.csv in Google Drive. Moving to {destination_path}")
        shutil.copy(source_path, destination_path)
        print("File moved successfully.")
        return True
    else:
        print("wiki_stocks.csv not found in Google Drive.")
        return False

# Check for wiki_stocks.csv and move it if found
file_moved = check_and_move_wiki_stocks()

if file_moved:
    # Process the Wiki Stocks metadata
    try:
        wiki_stocks_df = process_wiki_stocks_metadata()
        print("\nWiki Stocks metadata processed successfully.")
    except Exception as e:
        print(f"An error occurred while processing the Wiki Stocks metadata: {str(e)}")
else:
    print("\nPlease ensure you have the 'wiki_stocks.csv' file in your Google Drive 'ML4T' folder.")
    print("If the file is not available, you may need to obtain it from another source.")

# Proceed with S&P 500 Prices fetching
sp500_prices = fetch_sp500_prices()

if sp500_prices is not None:
    print("\nS&P 500 prices fetched and stored successfully.")
else:
    print("\nFailed to fetch S&P 500 prices. Please check your internet connection and try again.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found wiki_stocks.csv in Google Drive. Moving to wiki_stocks.csv
File moved successfully.
Processing wiki_stocks.csv...

Wiki Stocks Metadata Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3199 entries, 0 to 3198
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   code    3199 non-null   object
 1   name    3199 non-null   object
dtypes: object(2)
memory usage: 50.1+ KB
None

Sample data:
   code                          name
0     A     Agilent Technologies Inc.
1    AA                    Alcoa Inc.
2   AAL  American Airlines Group Inc.
3  AAMC   Altisource Asset Management
4   AAN                  Aaron's Inc.

Data stored in assets.h5 under 'quandl/wiki/stocks'

Wiki Stocks metadata processed successfully.
Fetching S&P 500 prices from 2010-01-01 to 2024-07-25...

S&P 500 Prices Info:
<class 

In [8]:
import pandas as pd
from pathlib import Path
import shutil

def check_and_move_sp500_stooq_file():
    """
    Check for ^spx_d_daily.csv in Google Drive and move it to the current directory if found.
    """
    source_path = Path('/content/drive/MyDrive/ML4T/^spx_d_daily.csv')
    destination_path = Path('^spx_d.csv')

    if source_path.exists():
        print(f"Found ^spx_d_daily.csv in Google Drive. Moving to {destination_path}")
        shutil.copy(source_path, destination_path)
        print("File moved successfully.")
        return True
    else:
        print("^spx_d_daily.csv not found in Google Drive.")
        return False

def process_sp500_stooq(file_path='^spx_d.csv', start_year='1950', end_year='2019'):
    """
    Process S&P 500 data from Stooq.com and store it in HDF5 format.

    Args:
    file_path (str): Path to the CSV file containing S&P 500 data from Stooq.
    start_year (str): Start year for data processing.
    end_year (str): End year for data processing.

    Returns:
    pd.DataFrame: Processed DataFrame of S&P 500 data.
    """
    csv_path = Path(file_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"The file {file_path} does not exist. Please download the S&P 500 data from Stooq.com.")

    print(f"Processing {file_path}...")

    # Read CSV file, parse dates, set index, and filter date range
    df = (pd.read_csv(csv_path, index_col=0, parse_dates=True)
          .loc[f'{start_year}':f'{end_year}']
          .rename(columns=str.lower))

    # Print DataFrame info for verification
    print("\nS&P 500 (Stooq) Dataset Info:")
    print(df.info())

    # Print sample data
    print("\nSample data:")
    print(df.head())

    # Store the data in HDF5 format
    with pd.HDFStore(DATA_STORE) as store:
        store.put('sp500/stooq', df)
        print(f"\nData stored in {DATA_STORE} under 'sp500/stooq'")

    return df

# First, check and move the file if it's in Google Drive
file_moved = check_and_move_sp500_stooq_file()

# Process the S&P 500 data from Stooq
if file_moved or Path('^spx_d.csv').exists():
    try:
        sp500_stooq_df = process_sp500_stooq()
        print("\nS&P 500 data from Stooq processed successfully.")
    except Exception as e:
        print(f"An error occurred while processing the S&P 500 data: {str(e)}")
else:
    print("\nS&P 500 data file (^spx_d.csv) not found.")
    print("Please download the S&P 500 data from Stooq.com and place it in the notebook directory or your Google Drive ML4T folder as '^spx_d_daily.csv'.")
    print("You can download the data from: https://stooq.com/q/d/?s=%5Espx")

Found ^spx_d_daily.csv in Google Drive. Moving to ^spx_d.csv
File moved successfully.
Processing ^spx_d.csv...

S&P 500 (Stooq) Dataset Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17700 entries, 1950-01-03 to 2019-12-31
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    17700 non-null  float64
 1   high    17700 non-null  float64
 2   low     17700 non-null  float64
 3   close   17700 non-null  float64
 4   volume  17700 non-null  float64
dtypes: float64(5)
memory usage: 829.7 KB
None

Sample data:
             open   high    low  close     volume
Date                                             
1950-01-03  16.66  16.66  16.66  16.66   700000.0
1950-01-04  16.85  16.85  16.85  16.85  1050000.0
1950-01-05  16.93  16.93  16.93  16.93  1416667.0
1950-01-06  16.98  16.98  16.98  16.98  1116667.0
1950-01-07  17.09  17.09  17.09  17.09  1116667.0

Data stored in assets.h5 under 'sp500/stooq'

S&P 500 data fro

In [9]:
import pandas as pd
import requests
from io import StringIO

def fetch_sp500_constituents():
    """
    Fetch current S&P 500 constituents from Wikipedia and store in HDF5 format.

    Returns:
    pd.DataFrame: DataFrame of S&P 500 constituents
    """
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

    print("Fetching S&P 500 constituents from Wikipedia...")

    try:
        # Read HTML tables from Wikipedia
        tables = pd.read_html(url, header=0)
        df = tables[0]

        # Print the columns to see what we're dealing with
        print("Columns in the fetched data:")
        print(df.columns)

        # Dynamically rename columns based on the number of columns
        if len(df.columns) == 9:
            df.columns = ['ticker', 'name', 'sec_filings', 'gics_sector', 'gics_sub_industry',
                          'headquarters', 'date_added', 'cik', 'founded']
        elif len(df.columns) == 8:
            df.columns = ['ticker', 'name', 'sec_filings', 'gics_sector', 'gics_sub_industry',
                          'headquarters', 'date_added', 'cik']
        else:
            raise ValueError(f"Unexpected number of columns: {len(df.columns)}")

        # Drop 'sec_filings' column and set 'ticker' as index
        df = df.drop('sec_filings', axis=1).set_index('ticker')

        print("\nS&P 500 Constituents Info:")
        print(df.info())

        print("\nSample data:")
        print(df.head())

        # Store the data in HDF5 format
        with pd.HDFStore(DATA_STORE) as store:
            store.put('sp500/stocks', df)
            print(f"\nData stored in {DATA_STORE} under 'sp500/stocks'")

        return df

    except Exception as e:
        print(f"An error occurred while fetching S&P 500 constituents: {str(e)}")
        print("Columns in the fetched data:")
        print(df.columns if 'df' in locals() else "DataFrame not created")
        return None

# Fetch S&P 500 constituents
sp500_constituents = fetch_sp500_constituents()

if sp500_constituents is not None:
    print("\nS&P 500 constituents fetched and stored successfully.")
else:
    print("\nFailed to fetch S&P 500 constituents. Please check the error message above for details.")

Fetching S&P 500 constituents from Wikipedia...
Columns in the fetched data:
Index(['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry',
       'Headquarters Location', 'Date added', 'CIK', 'Founded'],
      dtype='object')

S&P 500 Constituents Info:
<class 'pandas.core.frame.DataFrame'>
Index: 503 entries, MMM to ZTS
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   name               503 non-null    object
 1   gics_sector        503 non-null    object
 2   gics_sub_industry  503 non-null    object
 3   headquarters       503 non-null    object
 4   date_added         503 non-null    int64 
 5   cik                503 non-null    object
dtypes: int64(1), object(5)
memory usage: 27.5+ KB
None

Sample data:
               name                     gics_sector        gics_sub_industry headquarters  date_added          cik
ticker                                                                           

In [10]:
import pandas as pd
from pathlib import Path
import shutil

def move_and_rename_exchange_files():
    """
    Move and rename exchange files from Google Drive to the current directory.
    """
    exchanges = ['AMEX', 'NYSE', 'NASDAQ']
    renamed_files = []

    for exchange in exchanges:
        source_path = Path(f'/content/drive/MyDrive/ML4T/nasdaq_screener_{exchange}.csv')
        destination_path = Path(f'{exchange.lower()}_stocks.csv')

        if source_path.exists():
            print(f"Found {source_path.name} in Google Drive. Moving to {destination_path}")
            shutil.copy(source_path, destination_path)
            renamed_files.append(destination_path)
            print(f"File moved and renamed successfully to {destination_path}")
        else:
            print(f"{source_path.name} not found in Google Drive.")

    return renamed_files

In [12]:
import pandas as pd
import numpy as np
from pathlib import Path

def process_us_equities_metadata(file_paths):
    """
    Process US equities metadata from multiple CSV files, including market cap conversion.

    Args:
    file_paths (list): List of paths to the CSV files containing US equities metadata.

    Returns:
    pd.DataFrame: Processed DataFrame of US equities metadata.
    """
    dfs = []

    for file_path in file_paths:
        print(f"Processing {file_path}...")
        df = pd.read_csv(file_path)
        print(f"Columns in {file_path}:")
        print(df.columns)
        dfs.append(df)

    # Combine data from all exchanges
    df = pd.concat(dfs, ignore_index=True)

    # Clean up the data
    df = df.rename(columns=str.lower)

    # Check if 'ticker' column exists, if not, try to find a suitable column
    if 'ticker' not in df.columns:
        possible_ticker_columns = ['symbol', 'stock symbol', 'ticker symbol']
        for col in possible_ticker_columns:
            if col in df.columns:
                df = df.rename(columns={col: 'ticker'})
                break
        else:
            raise KeyError(f"Could not find a suitable column for ticker. Available columns are: {df.columns}")

    df = df.set_index('ticker')
    df = df[~df.index.duplicated(keep='first')]

    print("\nUS Equities Metadata Info:")
    print(df.info())

    print("\nSample data before market cap conversion:")
    print(df.head())

    # Convert market cap to numeric
    def convert_market_cap(value):
        if pd.isna(value):
            return np.nan
        if isinstance(value, (int, float)):
            return value
        value = str(value).replace('$', '')
        if value.endswith('T'):
            return float(value[:-1]) * 1e12
        elif value.endswith('B'):
            return float(value[:-1]) * 1e9
        elif value.endswith('M'):
            return float(value[:-1]) * 1e6
        else:
            try:
                return float(value)
            except ValueError:
                return np.nan

    # Check if 'market cap' column exists, if not, try to find a suitable column
    market_cap_column = 'market cap'
    if market_cap_column not in df.columns:
        possible_market_cap_columns = ['marketcap', 'market capitalization', 'cap']
        for col in possible_market_cap_columns:
            if col in df.columns:
                market_cap_column = col
                break
        else:
            print(f"Warning: Could not find a suitable column for market cap. Available columns are: {df.columns}")
            market_cap_column = None

    if market_cap_column:
        df[market_cap_column] = df[market_cap_column].apply(convert_market_cap)

        print("\nSample data after market cap conversion:")
        print(df.head())

        print("\nMarket Cap Statistics:")
        print(df[market_cap_column].describe())
    else:
        print("Market cap conversion skipped due to missing column.")

    # Store the data in HDF5 format
    with pd.HDFStore(DATA_STORE) as store:
        store.put('us_equities/stocks', df)
        print(f"\nData stored in {DATA_STORE} under 'us_equities/stocks'")

    return df

# Use this function after moving and renaming the files
renamed_files = move_and_rename_exchange_files()

if renamed_files:
    # Process US equities metadata
    us_equities_metadata = process_us_equities_metadata(renamed_files)
    print("\nUS equities metadata processed and stored successfully.")
else:
    print("\nNo exchange files found in Google Drive. Please ensure you've uploaded the files with the correct names.")

Found nasdaq_screener_AMEX.csv in Google Drive. Moving to amex_stocks.csv
File moved and renamed successfully to amex_stocks.csv
Found nasdaq_screener_NYSE.csv in Google Drive. Moving to nyse_stocks.csv
File moved and renamed successfully to nyse_stocks.csv
Found nasdaq_screener_NASDAQ.csv in Google Drive. Moving to nasdaq_stocks.csv
File moved and renamed successfully to nasdaq_stocks.csv
Processing amex_stocks.csv...
Columns in amex_stocks.csv:
Index(['Symbol', 'Name', 'Last Sale', 'Net Change', '% Change', 'Market Cap',
       'Country', 'IPO Year', 'Volume', 'Sector', 'Industry'],
      dtype='object')
Processing nyse_stocks.csv...
Columns in nyse_stocks.csv:
Index(['Symbol', 'Name', 'Last Sale', 'Net Change', '% Change', 'Market Cap',
       'Country', 'IPO Year', 'Volume', 'Sector', 'Industry'],
      dtype='object')
Processing nasdaq_stocks.csv...
Columns in nasdaq_stocks.csv:
Index(['Symbol', 'Name', 'Last Sale', 'Net Change', '% Change', 'Market Cap',
       'Country', 'IPO Ye

In [13]:
from sklearn.datasets import fetch_openml
import numpy as np
from pathlib import Path

def fetch_and_save_mnist():
    """
    Fetch MNIST data from OpenML and save it locally.
    """
    print("Fetching MNIST data from OpenML...")
    mnist = fetch_openml('mnist_784', version=1, as_frame=False)

    print("MNIST data fetched. Saving locally...")

    mnist_path = Path('mnist')
    mnist_path.mkdir(exist_ok=True)

    np.save(mnist_path / 'data.npy', mnist.data.astype(np.uint8))
    np.save(mnist_path / 'target.npy', mnist.target.astype(np.uint8))

    with open(mnist_path / 'description.txt', 'w') as f:
        f.write(mnist.DESCR)

    print("MNIST data saved locally in 'mnist' directory.")
    print(f"Data shape: {mnist.data.shape}")
    print(f"Target shape: {mnist.target.shape}")

# Fetch and save MNIST data
fetch_and_save_mnist()

# Verify saved files
mnist_path = Path('mnist')
if mnist_path.exists():
    print("\nVerifying saved MNIST files:")
    for file in ['data.npy', 'target.npy', 'description.txt']:
        if (mnist_path / file).exists():
            print(f"  {file} exists")
        else:
            print(f"  {file} is missing")
else:
    print("\nMNIST directory not found. Data may not have been saved correctly.")

Fetching MNIST data from OpenML...
MNIST data fetched. Saving locally...
MNIST data saved locally in 'mnist' directory.
Data shape: (70000, 784)
Target shape: (70000,)

Verifying saved MNIST files:
  data.npy exists
  target.npy exists
  description.txt exists


In [14]:
def fetch_and_save_fashion_mnist():
    """
    Fetch Fashion MNIST data from OpenML and save it locally.
    """
    print("Fetching Fashion MNIST data from OpenML...")
    fashion_mnist = fetch_openml('Fashion-MNIST', version=1, as_frame=False)

    print("Fashion MNIST data fetched. Saving locally...")

    fashion_path = Path('fashion_mnist')
    fashion_path.mkdir(exist_ok=True)

    np.save(fashion_path / 'data.npy', fashion_mnist.data.astype(np.uint8))
    np.save(fashion_path / 'target.npy', fashion_mnist.target.astype(np.uint8))

    with open(fashion_path / 'description.txt', 'w') as f:
        f.write(fashion_mnist.DESCR)

    # Save label dictionary
    label_dict = {
        0: 'T-shirt/top', 1: 'Trouser', 2: 'Pullover', 3: 'Dress', 4: 'Coat',
        5: 'Sandal', 6: 'Shirt', 7: 'Sneaker', 8: 'Bag', 9: 'Ankle boot'
    }
    pd.Series(label_dict).to_csv(fashion_path / 'label_dict.csv', header=False)

    print("Fashion MNIST data saved locally in 'fashion_mnist' directory.")
    print(f"Data shape: {fashion_mnist.data.shape}")
    print(f"Target shape: {fashion_mnist.target.shape}")

# Fetch and save Fashion MNIST data
fetch_and_save_fashion_mnist()

# Verify saved files
fashion_path = Path('fashion_mnist')
if fashion_path.exists():
    print("\nVerifying saved Fashion MNIST files:")
    for file in ['data.npy', 'target.npy', 'description.txt', 'label_dict.csv']:
        if (fashion_path / file).exists():
            print(f"  {file} exists")
        else:
            print(f"  {file} is missing")
else:
    print("\nFashion MNIST directory not found. Data may not have been saved correctly.")

Fetching Fashion MNIST data from OpenML...
Fashion MNIST data fetched. Saving locally...
Fashion MNIST data saved locally in 'fashion_mnist' directory.
Data shape: (70000, 784)
Target shape: (70000,)

Verifying saved Fashion MNIST files:
  data.npy exists
  target.npy exists
  description.txt exists
  label_dict.csv exists


In [15]:
import pandas_datareader.data as web
from datetime import datetime
import pandas as pd

def fetch_bond_price_indexes(start_date='2000-01-01', end_date=None):
    """
    Fetch bond price indexes from FRED.

    Args:
    start_date (str): Start date for data fetch (default: '2000-01-01')
    end_date (str): End date for data fetch (default: current date)

    Returns:
    pd.DataFrame: DataFrame of bond price indexes
    """
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')

    print(f"Fetching bond price indexes from {start_date} to {end_date}...")

    securities = {
        'BAMLCC0A0CMTRIV'   : 'US Corp Master TRI',
        'BAMLHYH0A0HYM2TRIV': 'US High Yield TRI',
        'BAMLEMCBPITRIV'    : 'Emerging Markets Corporate Plus TRI',
        'DGS10'             : '10-Year Treasury CMR',
    }

    dfs = []

    for ticker, name in securities.items():
        try:
            df = web.DataReader(ticker, 'fred', start=start_date, end=end_date)
            df.columns = [name]
            dfs.append(df)
            print(f"Successfully fetched data for {name}")
        except Exception as e:
            print(f"Error fetching data for {name}: {str(e)}")

    if not dfs:
        print("Failed to fetch data for any securities.")
        return None

    # Combine all dataframes
    combined_df = pd.concat(dfs, axis=1)

    # Resample to business days and forward fill missing values
    combined_df = combined_df.resample('B').ffill()

    print("\nBond Price Indexes Info:")
    print(combined_df.info())

    print("\nSample data:")
    print(combined_df.head())

    # Store the data in HDF5 format
    with pd.HDFStore(DATA_STORE) as store:
        store.put('fred/assets', combined_df)
        print(f"\nData stored in {DATA_STORE} under 'fred/assets'")

    return combined_df

# Fetch bond price indexes
bond_price_indexes = fetch_bond_price_indexes()

if bond_price_indexes is not None:
    print("\nBond price indexes fetched and stored successfully.")
else:
    print("\nFailed to fetch bond price indexes. Please check your internet connection and try again.")

Fetching bond price indexes from 2000-01-01 to 2024-07-25...
Successfully fetched data for US Corp Master TRI
Successfully fetched data for US High Yield TRI
Successfully fetched data for Emerging Markets Corporate Plus TRI
Successfully fetched data for 10-Year Treasury CMR

Bond Price Indexes Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6407 entries, 2000-01-03 to 2024-07-23
Freq: B
Data columns (total 4 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   US Corp Master TRI                   6329 non-null   float64
 1   US High Yield TRI                    6329 non-null   float64
 2   Emerging Markets Corporate Plus TRI  6329 non-null   float64
 3   10-Year Treasury CMR                 6143 non-null   float64
dtypes: float64(4)
memory usage: 250.3 KB
None

Sample data:
            US Corp Master TRI  US High Yield TRI  Emerging Markets Corporate Plus TRI  10-Year Treasury CMR
D

In [16]:
from google.colab import drive
import shutil
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define the source and destination paths
source_path = 'assets.h5'
destination_folder = '/content/drive/MyDrive/ML4T'
destination_path = os.path.join(destination_folder, 'assets.h5')

# Create the ML4T folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Copy the file
try:
    shutil.copy(source_path, destination_path)
    print(f"Successfully copied {source_path} to {destination_path}")
except FileNotFoundError:
    print(f"Error: The source file {source_path} was not found.")
except PermissionError:
    print(f"Error: Permission denied. Unable to copy to {destination_path}")
except Exception as e:
    print(f"An error occurred: {str(e)}")

# Verify the file was copied
if os.path.exists(destination_path):
    print(f"Verified: {destination_path} exists in Google Drive")
else:
    print(f"Warning: {destination_path} was not found in Google Drive after copying")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully copied assets.h5 to /content/drive/MyDrive/ML4T/assets.h5
Verified: /content/drive/MyDrive/ML4T/assets.h5 exists in Google Drive
