In [60]:
import yfinance as yf
import pandas as pd

nasdaq_periods = [
    {"phase": "Bear", "start": "2000-03-01", "end": "2002-10-31"},
    {"phase": "Bull", "start": "2002-11-01", "end": "2007-10-31"},
    {"phase": "Bear", "start": "2007-10-01", "end": "2009-03-31"},
    {"phase": "Bull", "start": "2009-04-01", "end": "2020-02-28"},
    {"phase": "Bear", "start": "2020-02-01", "end": "2020-03-31"},
    {"phase": "Bull", "start": "2020-04-01", "end": "2021-11-30"},
    {"phase": "Bear", "start": "2021-11-01", "end": "2022-06-30"},
    {"phase": "Bull", "start": "2022-07-01", "end": "2100-12-31"},  # Keep end far in the future
]
sp500_periods = [
    {"phase": "Bear", "start": "1956-08-01", "end": "1957-10-31"},
    {"phase": "Bull", "start": "1957-11-01", "end": "1961-12-31"},
    {"phase": "Bear", "start": "1961-12-01", "end": "1962-06-30"},
    {"phase": "Bull", "start": "1962-07-01", "end": "1966-02-28"},
    {"phase": "Bear", "start": "1966-02-01", "end": "1966-10-31"},
    {"phase": "Bull", "start": "1966-11-01", "end": "1968-12-31"},
    {"phase": "Bear", "start": "1968-12-01", "end": "1970-05-31"},
    {"phase": "Bull", "start": "1970-06-01", "end": "1973-01-31"},
    {"phase": "Bear", "start": "1973-01-01", "end": "1974-10-31"},
    {"phase": "Bull", "start": "1974-11-01", "end": "1980-11-30"},
    {"phase": "Bear", "start": "1980-11-01", "end": "1982-08-31"},
    {"phase": "Bull", "start": "1982-09-01", "end": "1987-08-31"},
    {"phase": "Bear", "start": "1987-08-01", "end": "1987-12-31"},
    {"phase": "Bull", "start": "1988-01-01", "end": "1990-07-31"},
    {"phase": "Bear", "start": "1990-07-01", "end": "1990-10-31"},
    {"phase": "Bull", "start": "1990-11-01", "end": "2000-03-31"},
    {"phase": "Bear", "start": "2000-03-01", "end": "2002-10-31"},
    {"phase": "Bull", "start": "2002-11-01", "end": "2007-10-31"},
    {"phase": "Bear", "start": "2007-10-01", "end": "2009-03-31"},
    {"phase": "Bull", "start": "2009-04-01", "end": "2020-02-28"},
    {"phase": "Bear", "start": "2020-02-01", "end": "2020-03-31"},
    {"phase": "Bull", "start": "2020-04-01", "end": "2022-01-31"},
    {"phase": "Bear", "start": "2022-01-01", "end": "2022-10-31"},
    {"phase": "Bull", "start": "2022-11-01", "end": "2100-12-31"},  # Keep end far in the future
]
def label_market_historical(data: pd.DataFrame, periods: list) -> pd.DataFrame:
    """
    Label S&P 500 or NASDAQ data based on known historical bull/bear phases.
    phases: [
        {
          "phase": "Bear" or "Bull",
          "start": "YYYY-MM-DD",
          "end": "YYYY-MM-DD"
        },
        ...
    ]
    
    Returns a DataFrame with an additional 'Market_Label' column.
    """
    # Ensure Date is a datetime type
    data['Date'] = pd.to_datetime(data['Date'])

    # Initialize to 0 (Neutral or unknown)
    data['Market_Label'] = 1
    
    # Loop through the known periods and label accordingly
    for p in periods:
        # Convert date strings to Timestamps
        start_date = pd.to_datetime(p['start'])
        end_date   = pd.to_datetime(p['end'])
        
        # Create a mask of rows that fall within [start_date, end_date]
        mask = (data['Date'] >= start_date) & (data['Date'] <= end_date)
        
        # If phase == 'Bull', label = 1, if 'Bear', label = -1
        label_value = 1 if p['phase'].lower() == 'bull' else -1
        
        data.loc[mask, 'Market_Label'] = label_value

    return data


# Define stock indices to collect
indices = {"S&P 500": "SPY", "NASDAQ": "QQQ"}

# Define time range
start_date = "1900-01-01"
end_date = "2024-12-31"

# Download and save data
for name, ticker in indices.items():
    data = yf.download(ticker, start=start_date, end=end_date)
    
    # Flatten the MultiIndex into single-level column names
    data.columns = ['_'.join(col).strip() for col in data.columns.values]
    data.reset_index(inplace=True)
    data.rename(columns={'Close_SPY': 'Close', 'High_SPY': 'High', 'Low_SPY': 'Low', 
                   'Open_SPY': 'Open', 'Volume_SPY': 'Volume'}, inplace=True)
    data.rename(columns={'Close_QQQ': 'Close', 'High_QQQ': 'High', 'Low_QQQ': 'Low', 
                   'Open_QQQ': 'Open', 'Volume_QQQ': 'Volume'}, inplace=True)
    
    # Convert the 'Date' column to datetime
    data['Date'] = pd.to_datetime(data['Date'])

    # Verify the change
    print(data.dtypes)
    print(data.head())
    print(data.columns)
    data.to_csv(f"../data/raw/{name.replace(' ', '_').lower()}.csv", index=True)
    print(f"{name} data saved successfully.")

from fredapi import Fred
import pandas as pd

fred = Fred(api_key='e916710d165717e6348556cdce8111f3')

# Define economic indicators to collect
indicators = {
    "GDP Growth": "GDP",
    "Unemployment Rate": "UNRATE",
    "CPI": "CPIAUCSL",
    "PPI": "PPIACO",
    "M2 Money Supply": "M2",
    "Interest Rate": "DGS10"
}

# Fetch and save data
for name, series_id in indicators.items():
    data = fred.get_series(series_id)
    data = data.reset_index()
    data.columns = ['Date', name]
    #change CPI and PPI values as yoy percentage rate
    if name == 'CPI':
        data['CPI'] = data['CPI'].pct_change(periods=12)*100
    if name == 'PPI':
        data['PPI'] = data['PPI'].pct_change(periods=12)*100
    if name == 'GDP Growth':
        data['GDP Growth'] = data['GDP Growth'].pct_change(periods=12)*100
    if name == 'M2 Money Supply':
        data['M2 Money Supply'] = data['M2 Money Supply'].pct_change(periods=12)*100
    data.to_csv(f"../data/raw/{name.replace(' ', '_').lower()}.csv", index=False)
    print(f"{name} data saved successfully.")

# Load your S&P 500 data
sp500 = pd.read_csv("../data/raw/s&p_500.csv", parse_dates=['Date'])

# Label using historical phases
sp500_labeled = label_market_historical(sp500, sp500_periods)

# Save the labeled dataset
sp500_labeled.to_csv("../data/processed/labeled_sp500.csv", index=False)

print(sp500_labeled.head())
print(sp500_labeled.tail())

# Load your NASDAQ data
nasdaq = pd.read_csv("../data/raw/nasdaq.csv", parse_dates=['Date'])

# Label using historical phases
nasdaq_labeled = label_market_historical(nasdaq, nasdaq_periods)

# Save the labeled dataset
nasdaq_labeled.to_csv("../data/processed/labeled_qqq.csv", index=False)

print(nasdaq_labeled.head())
print(nasdaq_labeled.tail())


# Load labeled market data
labeled_sp500 = pd.read_csv("../data/processed/labeled_sp500.csv", parse_dates=['Date'])
labeled_qqq = pd.read_csv("../data/processed/labeled_qqq.csv", parse_dates=['Date'])

# Load economic data
gdp = pd.read_csv("../data/raw/gdp_growth.csv", parse_dates=['Date'])
cpi = pd.read_csv("../data/raw/cpi.csv", parse_dates=['Date'])
interest_rate = pd.read_csv("../data/raw/interest_rate.csv", parse_dates=['Date'])
m2 = pd.read_csv("../data/raw/m2_money_supply.csv", parse_dates=['Date'])
ppi = pd.read_csv("../data/raw/ppi.csv", parse_dates=['Date'])
unemployment_rate = pd.read_csv("../data/raw/unemployment_rate.csv", parse_dates=['Date'])

# Merge datasets on Date: S&P500
merged_data_sp500 = labeled_sp500.merge(gdp, on='Date', how='left')
merged_data_sp500 = merged_data_sp500.merge(cpi, on='Date', how='left')
merged_data_sp500 = merged_data_sp500.merge(interest_rate, on='Date', how='left')
merged_data_sp500 = merged_data_sp500.merge(m2, on='Date', how='left')
merged_data_sp500 = merged_data_sp500.merge(ppi, on='Date', how='left')
merged_data_sp500 = merged_data_sp500.merge(unemployment_rate, on='Date', how='left')

# Save final combined dataset: S&P500
merged_data_sp500.to_csv("../data/processed/combined_data_sp500.csv", index=False)
print("Combined dataset saved successfully.")

# Merge datasets on Date: NASDAQ
merged_data_nasdaq = labeled_qqq.merge(gdp, on='Date', how='left')
merged_data_nasdaq = merged_data_nasdaq.merge(cpi, on='Date', how='left')
merged_data_nasdaq = merged_data_nasdaq.merge(interest_rate, on='Date', how='left')
merged_data_nasdaq = merged_data_nasdaq.merge(m2, on='Date', how='left')
merged_data_nasdaq = merged_data_nasdaq.merge(ppi, on='Date', how='left')
merged_data_nasdaq = merged_data_nasdaq.merge(unemployment_rate, on='Date', how='left')

# Save final combined dataset: NASDAQ
merged_data_nasdaq.to_csv("../data/processed/combined_data_nasdaq.csv", index=False)
print("Combined dataset saved successfully.")
print(merged_data_sp500.head())
print(merged_data_nasdaq.head())



[*********************100%***********************]  1 of 1 completed


Date      datetime64[ns]
Close            float64
High             float64
Low              float64
Open             float64
Volume             int64
dtype: object
        Date      Close       High        Low       Open   Volume
0 1993-01-29  24.526073  24.543517  24.421410  24.543517  1003200
1 1993-02-01  24.700510  24.700510  24.543515  24.543515   480500
2 1993-02-02  24.752853  24.770297  24.630746  24.683078   201300
3 1993-02-03  25.014505  25.031949  24.770291  24.787735   529400
4 1993-02-04  25.119164  25.171495  24.822618  25.101720   531500
Index(['Date', 'Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')
S&P 500 data saved successfully.


[*********************100%***********************]  1 of 1 completed


Date      datetime64[ns]
Close            float64
High             float64
Low              float64
Open             float64
Volume             int64
dtype: object
        Date      Close       High        Low       Open   Volume
0 1999-03-10  43.417278  43.496992  42.752999  43.470421  5232000
1 1999-03-11  43.629845  43.988555  42.779568  43.736129  9688600
2 1999-03-12  42.567005  43.496996  42.221580  43.470425  8743600
3 1999-03-15  43.789280  43.842422  42.434151  42.885860  6369000
4 1999-03-16  44.161274  44.347272  43.496995  43.975276  4905800
Index(['Date', 'Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')
NASDAQ data saved successfully.
GDP Growth data saved successfully.
Unemployment Rate data saved successfully.
CPI data saved successfully.
PPI data saved successfully.
M2 Money Supply data saved successfully.
Interest Rate data saved successfully.
   Unnamed: 0       Date      Close       High        Low       Open   Volume  \
0           0 1993-01-29  24.526073 

######