In [13]:
import os

# Build one enriched CSV per market listed in df_markets: each trading-day row
# of the ticker's price file gets aggregate statistics of the earthquakes in
# df_eq that occurred between the previous row's close and that row's close.
#
# Names expected from earlier cells: df_markets, df_eq, tf, timezone,
# geodesic, pd, np.

# Ensure the output directory exists
output_dir = 'data/test/linear/'
os.makedirs(output_dir, exist_ok=True)

# Timezone-aware view of the earthquake timestamps, used for every window
# comparison below. A timezone-naive column is treated as UTC; comparing a
# naive column with the timezone-aware market timestamps would raise TypeError.
eq_dates = df_eq['date']
if eq_dates.dt.tz is None:
    eq_dates = eq_dates.dt.tz_localize('UTC')

# Process each ticker in the markets.csv file
for _, market_row in df_markets.iterrows():
    ticker = market_row['Ticker']

    # Convert latitude and longitude to float
    try:
        latitude = float(market_row['Latitude'])
        longitude = float(market_row['Longitude'])
    except ValueError as e:
        raise ValueError(f"Invalid latitude or longitude for ticker {ticker}. Ensure they are numeric values.") from e

    # Validate latitude and longitude ranges
    if not (-90.0 <= latitude <= 90.0):
        raise ValueError(f"Latitude {latitude} for ticker {ticker} is out of bounds. Must be between -90.0 and 90.0.")
    if not (-180.0 <= longitude <= 180.0):
        raise ValueError(f"Longitude {longitude} for ticker {ticker} is out of bounds. Must be between -180.0 and 180.0.")

    # Parse the close time. errors='coerce' turns unparseable values into NaT,
    # so the NaT check must happen on the parsed Timestamp *before* .time() is
    # called (a datetime.time object is never NaN).
    try:
        parsed_close = pd.to_datetime(market_row['close'], format='%H:%M', errors='coerce')
    except ValueError as e:
        raise ValueError(f"Invalid close time format for ticker {ticker}. Ensure it is in 'HH:MM' format.") from e
    if pd.isna(parsed_close):
        raise ValueError(f"Invalid close time format for ticker {ticker}. Ensure it is in 'HH:MM' format.")
    close_time = parsed_close.time()

    # Offset from midnight to the market close; invariant for this ticker
    close_offset = pd.Timedelta(hours=close_time.hour, minutes=close_time.minute)

    # Find the time zone based on latitude and longitude
    timezone_str = tf.timezone_at(lat=latitude, lng=longitude)
    if timezone_str is None:
        raise ValueError(f"Could not determine time zone for ticker {ticker} at coordinates ({latitude}, {longitude})")
    tz = timezone(timezone_str)

    # Read the stock market data for the current ticker
    df_ticker = pd.read_csv(f'data/test/{ticker}.csv')
    df_ticker.rename(columns={'Date': 'date'}, inplace=True)  # Rename 'Date' column to 'date'

    # Convert df_ticker date column to the respective time zone
    df_ticker['date'] = pd.to_datetime(df_ticker['date']).dt.tz_localize(tz)

    # Earthquake day (in the market's time zone) shifted to the close time.
    # This column is not read anywhere in this cell; it is still written to
    # df_eq in case later cells use it.
    df_eq['date_close'] = eq_dates.dt.tz_convert(tz).dt.normalize() + close_offset

    # Define the coordinates for the current market
    market_coords = (latitude, longitude)

    # Initialize lists for new columns
    num_list, sum_list, max_list, avg_list, min_dist_list, max_dist_list, sum_tsunami = [], [], [], [], [], [], []

    # Close timestamp of every trading-day row, in file order
    close_timestamps = df_ticker['date'] + close_offset

    # Walk the rows in order, carrying the previous close forward. This does
    # not depend on df_ticker's index labels being 0..n-1.
    prev_close = None
    for curr_close in close_timestamps:
        # Time window: after the previous row's close, up to and including
        # the current row's close.
        # NOTE(review): the first row has no lower bound, so it aggregates
        # every earthquake up to the first close - confirm this is intended.
        in_window = eq_dates <= curr_close
        if prev_close is not None:
            in_window = in_window & (eq_dates > prev_close)
        eq_filtered = df_eq[in_window]
        prev_close = curr_close

        num_list.append(len(eq_filtered))

        if eq_filtered.empty:
            # No earthquakes in the window: NaN placeholders, zero-filled below
            sum_list.append(np.nan)
            max_list.append(np.nan)
            avg_list.append(np.nan)
            sum_tsunami.append(np.nan)
            min_dist_list.append(np.nan)
            max_dist_list.append(np.nan)
            continue

        magnitudes = eq_filtered['magnitudo']
        sum_list.append(magnitudes.sum())
        max_list.append(magnitudes.max())
        avg_list.append(magnitudes.mean())
        sum_tsunami.append(eq_filtered['tsunami'].sum())

        # Distances (km) from each earthquake to the market's location
        distances = eq_filtered.apply(lambda row: geodesic((row['latitude'], row['longitude']), market_coords).km, axis=1)
        min_dist_list.append(distances.min())
        max_dist_list.append(distances.max())

    # Create the combined df
    df = df_ticker.copy()
    df['num'] = num_list
    df['sum'] = sum_list
    df['max'] = max_list
    df['avg'] = avg_list
    df['min_dist'] = min_dist_list
    df['max_dist'] = max_dist_list
    df['tsunami'] = sum_tsunami

    # Final preparations of the dataframe: windows without earthquakes get 0
    stat_columns = ['sum', 'max', 'avg', 'min_dist', 'max_dist', 'tsunami']
    df[stat_columns] = df[stat_columns].fillna(0)
    # Inverse of the nearest distance; a min_dist of 0 maps to 0 rather than
    # dividing by zero (0 is also the fill value for "no earthquakes").
    df['rev_dist'] = df['min_dist'].apply(lambda x: 0 if x == 0 else 1 / x)

    # Save the dataframe to a CSV file named according to the ticker
    output_file = os.path.join(output_dir, f'{ticker}.csv')
    df.to_csv(output_file, index=False)
    print(f"Saved data for ticker {ticker} to {output_file}")

Saved data for ticker ^NYA to data/test/linear/^NYA.csv
Saved data for ticker ^IXIC to data/test/linear/^IXIC.csv
Saved data for ticker ^FTSE to data/test/linear/^FTSE.csv
Saved data for ticker ^NSEI to data/test/linear/^NSEI.csv
Saved data for ticker ^BSESN to data/test/linear/^BSESN.csv
Saved data for ticker ^N225 to data/test/linear/^N225.csv
Saved data for ticker 000001.SS to data/test/linear/000001.SS.csv
Saved data for ticker ^N100 to data/test/linear/^N100.csv
Saved data for ticker ^DJI to data/test/linear/^DJI.csv
Saved data for ticker ^GSPC to data/test/linear/^GSPC.csv
