# V1: Base Model

Methodology:
- Distilbert model (open source huggingface)
- SNP 5.3GB dataset (news) -> identified only top 15 tickers in weight
- Generate predictions for each Article title
- Group by day, and create a score (sum) for each day
- Optimise to find a threshold, if above buy if below sell.
- To test for buy sell amount

In [None]:
import pandas as pd

read_df = pd.read_csv('data/FNSPID dataset/snp 5.3gb.csv')

In [None]:
selected_symbols = ['MSFT', 'NVDA', 'AAPL', 'AMZN', 'META', 'AVGO', 'BRK.B', 'TSLA', 'GOOG', 'GOOGL', 'WMT', 'JPM', 'LLY', 'V', 'ORCL', 'NFLX', 'XOM', 'MA', 'COST', 'PG']

read_df = read_df[read_df['Stock_symbol'].isin(selected_symbols)]

In [None]:
df = pd.DataFrame({
    'Date': read_df['Date'],
    'Title': read_df['Article_title'],
    'StockSymbol': read_df['Stock_symbol']
})

df['Date'] = pd.to_datetime(df['Date'], utc=True).dt.date

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    if not isinstance(text, str) or text is None:
        return ""
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    test_sentence = " ".join(cleaned_tokens)
    return test_sentence

In [None]:
df['Title'] = df['Title'].apply(preprocess)

In [None]:
from transformers import pipeline, set_seed
set_seed(999)

In [None]:
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [None]:
predictions = classifier(df['Title'].tolist())

df['Sentiment'] = [
    pred['score'] if pred['label'] == 'POSITIVE' else -pred['score'] if pred['label'] == 'NEGATIVE' else 0
    for pred in predictions
]

In [None]:
grouped_data = df.groupby('Date').agg({
    'Sentiment': 'sum'
})
grouped_data.index = pd.to_datetime(grouped_data.index)
grouped_data.to_csv('data/grouped_data/V1 grp_df.csv')

In [None]:
# import yfinance as yf
#
# # Download OHLCV data for SPY (adjust ticker as needed)
# price_data = yf.download('AAPL', start='2008-08-20', end='2011-06-30')
#
# if isinstance(price_data.columns, pd.MultiIndex):
#     price_data.columns = price_data.columns.droplevel(1)
# price_data.reset_index(inplace=True)  # Move Date to a column
# price_data['Date'] = pd.to_datetime(price_data['Date']).dt.strftime('%Y-%m-%d')
#
# # Save to CSV with clean format
# price_data.to_csv('data/Training Data/SNP 08-11.csv', index=False)

In [None]:
import pandas as pd
grouped_data = pd.read_csv('data/grouped_data/V1 grp_df.csv', index_col='Date')
grouped_data.index = pd.to_datetime(grouped_data.index)

In [None]:
# # optimise threshold
# cerebro.optstrategy(SentimentStrategy, sentiment_threshold=[0.0, 0.5, 1.0])
#
# # Set initial cash
# cerebro.broker.setcash(1000000.0)
#
# # Add commission (e.g., 0.1% per trade)
# cerebro.broker.setcommission(commission=0.001)
#
# # Run backtest
# results = cerebro.run()
#
# for strategy in results:
#     print('Sentiment Threshold:', strategy.params.sentiment_threshold)
#     print('Final Portfolio Value: %.2f' % cerebro.broker.getvalue())

In [69]:
import backtrader as bt

# Initialize Cerebro
cerebro = bt.Cerebro()

# Load price data
price_feed = bt.feeds.YahooFinanceCSVData(
    dataname='data/Training Data/SNP 08-11.csv',
    fromdate=pd.to_datetime('2009-08-07'),
    todate=pd.to_datetime('2010-06-10'),
    datetime=0,  # Date is first column
    open=1,
    high=2,
    low=3,
    close=4,
    volume=5,
    dateformat='%Y-%m-%d'
)
cerebro.adddata(price_feed, name='price')

# Load sentiment data
sentiment_feed = SentimentData(
    dataname=grouped_data,
    datetime=None,  # Use index as datetime
    sentiment='Sentiment',
    fromdate=pd.to_datetime('2009-08-07'),
    todate=pd.to_datetime('2010-06-10')
)
cerebro.adddata(sentiment_feed, name='sentiment')

class SentimentData(bt.feeds.PandasData):
    lines = ('sentiment',)
    params = (('sentiment', 'Sentiment'), ('datetime', None))

class SentimentStrategy(bt.Strategy):
    params = (
        ('size', 100),  # Number of shares to trade
        ('sentiment_threshold', 0.2),  # Threshold for sentiment signal
    )

    def __init__(self):
        self.sentiment = self.datas[1].sentiment  # Access sentiment line explicitly
        self.order = None
        self.last_date = None  # Track last processed date

    def next(self):
        date = self.datas[0].datetime.date(0)
        if self.last_date == date:
            return
        sentiment = self.sentiment[0]
        print(f"{date}: Sentiment={sentiment}, Close={self.datas[0].close[0]}, Position={self.position.size}, OrderPending={self.order is not None}")

        if self.order:
            self.close()
            print(f'{date}: Close')

        if sentiment > self.params.sentiment_threshold:
            self.order = self.buy(size=int(self.params.size))
            print(f"{date}: Buy {self.params.size} shares at {self.datas[0].close[0]}")
        elif sentiment < -self.params.sentiment_threshold:
            self.order = self.sell(size=int(self.params.size))
            print(f"{date}: Sell {self.params.size} shares at {self.datas[0].close[0]}")
        else:
            print(f"{date}: No trade (Sentiment={sentiment} within threshold)")

        self.last_date = date  # Update last processed date

class DebugStrategy(bt.Strategy):
    params = (
        ('size', 100),  # Number of shares to trade
        ('sentiment_threshold', 0.5),  # Threshold for sentiment signal
    )

    def __init__(self):
        self.sentiment = self.datas[1].sentiment  # Access sentiment line explicitly
        self.order = None
        self.last_date = None  # Track last processed date

    def next(self):
        date = self.datas[0].datetime.date(0)
        if self.last_date == date:
            return
        sentiment = self.sentiment[0]
        print(f'{date}: {sentiment}, {self.last_date}')
        self.last_date = date

In [73]:
# Add strategy
cerebro.addstrategy(SentimentStrategy, size=1, sentiment_threshold=0.5)

initial_cash = 100000
commission = 0.001

# Set initial cash
cerebro.broker.setcash(initial_cash)

# Add commission (e.g., 0.1% per trade)
cerebro.broker.setcommission(commission=commission)

# Add analyzers
cerebro.addanalyzer(bt.analyzers.SharpeRatio, _name='sharpe')
cerebro.addanalyzer(bt.analyzers.Returns, _name='returns')
cerebro.addanalyzer(bt.analyzers.TradeAnalyzer, _name='trades')

# Run backtest
results = cerebro.run()
strategy = results[0]

# Print results
print(strategy.analyzers.sharpe.get_analysis())
print(strategy.analyzers.returns.get_analysis())
print(f'Initial Portfolio Value: %.2f {initial_cash}, commission: {commission}')
print('Final Portfolio Value: %.2f' % cerebro.broker.getvalue())
print('Sharpe Ratio:', strategy.analyzers.sharpe.get_analysis()['sharperatio'])
print('Total Return:', strategy.analyzers.returns.get_analysis()['rtot'])
trade_analysis = strategy.analyzers.trades.get_analysis()
print('Total Trades:', trade_analysis['total']['total'])

# Plot results
# cerebro.plot()

2009-08-07: Sentiment=4.878258645534515, Close=387354800.0, Position=0, OrderPending=False
2009-08-07: Buy 1 shares at 387354800.0
2009-08-07: Sentiment=4.878258645534515, Close=387354800.0, Position=0, OrderPending=False
2009-08-07: Buy 1 shares at 387354800.0
2009-08-07: Sentiment=4.878258645534515, Close=387354800.0, Position=0, OrderPending=False
2009-08-07: Buy 1 shares at 387354800.0
2009-08-07: Sentiment=4.878258645534515, Close=387354800.0, Position=0, OrderPending=False
2009-08-07: Buy 1 shares at 387354800.0
2009-08-10: Sentiment=-1.0101189613342283, Close=300294400.0, Position=0, OrderPending=True
2009-08-10: Close
2009-08-10: Sell 1 shares at 300294400.0
2009-08-10: Sentiment=-1.0101189613342283, Close=300294400.0, Position=0, OrderPending=True
2009-08-10: Close
2009-08-10: Sell 1 shares at 300294400.0
2009-08-10: Sentiment=-1.0101189613342283, Close=300294400.0, Position=0, OrderPending=True
2009-08-10: Close
2009-08-10: Sell 1 shares at 300294400.0
2009-08-10: Sentiment=-