In [None]:
import re

def replace_currency_symbols(text):
    """Swap every known currency symbol in ``text`` for its three-letter acronym.

    Each acronym in the mapping carries a trailing space, so ``'£50'``
    becomes ``'GBP 50'``. Characters that are not one of the listed symbols
    are left untouched.

    Args:
        text: String to scan for currency symbols.

    Returns:
        A new string with every listed symbol replaced.
    """
    symbol_to_code = {
        '£': 'GBP ',   # British Pound
        '€': 'EUR ',   # Euro
        '$': 'USD ',   # US Dollar
        '¥': 'JPY ',   # Japanese Yen
        '₹': 'INR ',   # Indian Rupee
        '₽': 'RUB ',   # Russian Ruble
        '₩': 'KRW ',   # South Korean Won
        '฿': 'THB ',   # Thai Baht
        '₺': 'TRY ',   # Turkish Lira
        '₪': 'ILS '    # Israeli Shekel
    }

    # One alternation matching any listed symbol; escaping keeps '$' literal
    # instead of letting it act as the end-of-string anchor.
    any_symbol = '|'.join(map(re.escape, symbol_to_code))

    # Look each matched symbol up in the mapping to get its replacement.
    return re.sub(any_symbol, lambda found: symbol_to_code[found.group(0)], text)
def clean_text(text):
    """Expand currency symbols, set punctuation apart, and tidy whitespace.

    Args:
        text: Raw string (e.g. a headline) to normalise.

    Returns:
        The text with currency symbols replaced by acronyms, each of
        ``. , ! ? ; :`` surrounded by spaces, and all whitespace runs
        collapsed to single spaces with no leading or trailing space.
    """
    expanded = replace_currency_symbols(text)

    # Pad each punctuation mark with a space on both sides.
    padded = re.sub(r'([.,!?;:])', r' \1 ', expanded)

    # Turn newlines and tabs into plain spaces.
    flattened = padded.replace('\n', ' ').replace('\t', ' ')

    # Collapse the extra spaces introduced above and trim the ends.
    return re.sub(r'\s+', ' ', flattened).strip()
# Example usage
# Every acronym in the mapping ends with a space, so the symbol is replaced
# by the acronym followed by a space before the amount.
text = "The price is £50, €30, and $20."
result = replace_currency_symbols(text)
print(result)  # Output: "The price is GBP 50, EUR 30, and USD 20."


In [None]:
import pandas as pd

# Read the raw headline export of each news source into its own DataFrame.
cnbc, guardian, reuters = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/raw_data/cnbc_headlines.csv',
        'data/raw_data/guardian_headlines.csv',
        'data/raw_data/reuters_headlines.csv',
    )
)


In [None]:
# Total number of rows across the three sources.
print(sum(len(frame) for frame in (cnbc, guardian, reuters)))

In [None]:
# Convert each source's 'Time' column to 'YYYY-MM-DD' strings. Values that
# cannot be parsed are coerced to missing rather than raising.
#
# The 'ET ' marker is stripped with the string accessor: Series.replace only
# swaps cells whose entire value equals 'ET ', so it never removed the marker
# from inside a timestamp. regex=False treats 'ET ' as literal text.
cnbc['Time'] = pd.to_datetime(
    cnbc['Time'].str.replace('ET ', '', regex=False), errors='coerce'
).dt.strftime('%Y-%m-%d')
guardian['Time'] = pd.to_datetime(guardian['Time'], errors='coerce').dt.strftime('%Y-%m-%d')
reuters['Time'] = pd.to_datetime(reuters['Time'], errors='coerce').dt.strftime('%Y-%m-%d')

In [None]:
# Discard every row that has a missing value in any column, per source.
cnbc, guardian, reuters = (
    frame.dropna() for frame in (cnbc, guardian, reuters)
)

In [None]:
# Total number of rows across the three sources at this point.
print(sum(len(frame) for frame in (cnbc, guardian, reuters)))

In [None]:
import numpy as np
# Sorted array of the distinct 'Time' values found in any of the sources.
all_dates = np.unique(
    np.concatenate([frame['Time'].unique() for frame in (cnbc, guardian, reuters)])
)

In [None]:
# Stack the 'Time' and 'Headlines' columns of the three sources, in the order
# cnbc, guardian, reuters, keeping each source's original row labels.
combined_news = pd.concat(
    [frame[['Time', 'Headlines']] for frame in (cnbc, guardian, reuters)],
    axis=0,
)

In [None]:
# sort_values returns a new frame, so its result must be assigned for the
# ordering to take effect (the bare call left the frame unsorted).
#
# Rows are ordered newest-first: a later cell folds the headline of any row
# without a SPY close into the row that follows it, and with this order those
# headlines land on the preceding trading day. 'Time' holds 'YYYY-MM-DD'
# strings here, so text order equals date order. The stable sort keeps the
# existing relative order of rows that share a date.
#
# The index is rebuilt because concatenating the sources left repeated row
# labels.
combined_news = (
    combined_news
    .sort_values('Time', ascending=False, kind='stable')
    .reset_index(drop=True)
    .rename(columns={'Time': 'date', 'Headlines': 'headlines'})
)

In [None]:
# Clean every headline in a single pass and assign the result back to the
# column. Writing through `combined_news['headlines'].iloc[i] = ...` is
# chained assignment: it can modify a temporary copy and leave the frame
# unchanged, and it is slow when done one row at a time.
combined_news['headlines'] = combined_news['headlines'].map(clean_text)

In [None]:
# Persist the combined, cleaned headlines without the row index.
combined_news.to_csv(
    path_or_buf='data/raw_data/combined_headlines.csv',
    index=False,
)

In [None]:
import yfinance as yf
from datetime import date, timedelta

def get_stock_data(ticker='AAPL', start_date=None, end_date=None):
    """Download price data for ``ticker`` through ``yf.download``.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start_date: Start of the range as a 'YYYY-MM-DD' string. When omitted,
            defaults to 365 days before the day of the call.
        end_date: End of the range as a 'YYYY-MM-DD' string. When omitted,
            defaults to the day of the call.

    Returns:
        Whatever ``yf.download`` returns for the requested range.
    """
    # Resolve the date defaults at call time. Expressions in the signature are
    # evaluated once, when the function is defined, so "today" would go stale
    # in a kernel that stays open across days.
    today = date.today()
    if start_date is None:
        start_date = (today - timedelta(days=365)).strftime('%Y-%m-%d')
    if end_date is None:
        end_date = today.strftime('%Y-%m-%d')

    # Load the stock data
    return yf.download(ticker, start=start_date, end=end_date)

In [None]:
# Offset, in rows of the SPY price table, between a day's close and the
# later close it is paired with further down.
prediction_period = 1

# Take the download window from the earliest and latest headline dates rather
# than from the last and first rows: the last row is only the final row of the
# source concatenated last, which need not be the earliest date overall.
news_dates = pd.to_datetime(combined_news['date'])
start_date = news_dates.min().strftime('%Y-%m-%d')
# Extend the window 14 days past the latest headline date.
end_date = (news_dates.max() + timedelta(days=14)).strftime('%Y-%m-%d')
sp500 = get_stock_data('SPY', start_date=start_date, end_date=end_date)

In [None]:
# Write the downloaded SPY data to disk, index included.
sp500.to_csv(path_or_buf='data/raw_data/spy_data.csv')

In [None]:
# Reload the saved SPY data and keep only the 'Date' and 'Close' columns.
spy_saved = pd.read_csv('data/raw_data/spy_data.csv')
spy = spy_saved[['Date', 'Close']]

In [None]:
# Show the column labels of the reloaded SPY frame.
spy.columns

In [None]:
import numpy as np
# Add the two price columns, filled with NaN until prices are attached.
combined_news = combined_news.assign(
    spy_close=np.nan,
    spy_future_close=np.nan,
)

In [None]:
# For each SPY row that has a row `prediction_period` positions after it,
# stamp that day's close and the later close onto every headline whose date
# matches. The last `prediction_period` SPY rows have no later close, so
# their dates are left as NaN.
#
# Values are written with .loc[mask, column]. The previous
# `combined_news[column][mask] = value` form is chained assignment, which can
# write to a temporary copy and leave the frame unchanged.
for i in range(prediction_period, len(spy)):
    base_pos = i - prediction_period
    on_base_date = combined_news['date'] == spy['Date'].iloc[base_pos]
    combined_news.loc[on_base_date, 'spy_close'] = spy['Close'].iloc[base_pos]
    combined_news.loc[on_base_date, 'spy_future_close'] = spy['Close'].iloc[i]

In [None]:
# Absolute change from the day's close to the later close.
combined_news['price_movement'] = combined_news['spy_future_close'].sub(
    combined_news['spy_close']
)

In [None]:
# The same change expressed as a percentage of the day's close.
combined_news['move_percent'] = (
    combined_news['price_movement'].mul(100).div(combined_news['spy_close'])
)

In [None]:
# Carry the headline of every row without a SPY close into the row that
# follows it, so the text is kept once rows with missing prices are dropped.
# Carried text accumulates across consecutive rows without a close.
#
# The work is done on plain lists and assigned back in one step: writing via
# `combined_news['headlines'].iloc[i + 1] = ...` is chained assignment, which
# can modify a temporary copy and leave the frame unchanged.
headline_texts = combined_news['headlines'].tolist()
lacks_close = combined_news['spy_close'].isna().tolist()
for i in range(len(headline_texts) - 1):
    if lacks_close[i]:
        # Join with a space so the last token of one headline is not fused
        # with the first token of the next.
        headline_texts[i + 1] = headline_texts[i] + ' ' + headline_texts[i + 1]
combined_news['headlines'] = headline_texts

In [None]:
# Keep only rows in which no column is missing.
combined_news = combined_news.dropna(axis=0, how='any')

In [None]:
# Write the final headline/price table without the row index.
combined_news.to_csv(
    path_or_buf='data/1day_news_with_spy.csv',
    index=False,
)