## Read in db file

In [1]:
import pandas as pd
import sqlite3

# Connect to db
conn = sqlite3.connect('reddit-sqlite.db')

query = "SELECT * FROM comments"
df = pd.read_sql_query(query, conn)
print(f"Comments df: {len(df)}")
df.to_excel("reddit_data_raw.xlsx")

query = "SELECT * FROM sentiment"
sent_df = pd.read_sql_query(query, conn)
print(f"Sentiment df: {len(sent_df)}")
sent_df.to_excel("sentiment_data.xlsx")

# Close connection
conn.close()


Comments df: 815
Sentiment df: 815


## Remove author column by copying data to new table, excluding comment author column

In [15]:
conn = sqlite3.connect('reddit-sqlite.db')
cursor = conn.cursor()

cursor.execute("ALTER TABLE comments DROP COLUMN comment_author")

conn.close()

## Topic modelling
Testing extracting 7 days of comments and modelling the topics. Converting ticker symbols to ful names aswell. 

In [19]:
from datetime import datetime, timedelta

# Filter past 7 days
df['comment_date'] = pd.to_datetime(df['comment_date'])

# Calculate the date 7 days ago from today
seven_days_ago = datetime.now() - timedelta(days=7)

# Filter the DataFrame for dates within the last 7 days
filtered_df = df[df['comment_date'] >= seven_days_ago]

mapping_df = pd.read_csv('firms.csv')
# Function to replace ticker symbols with full company names
def replace_tickers(comment):
    for index, row in mapping_df.iterrows():
        if isinstance(row['ticker'], str):  # Check if the ticker symbol is not empty
            comment = comment.replace(row['ticker'], row['name'])
        if isinstance(row['altticker'], str):  # Check if the alternate ticker symbol is not empty
            comment = comment.replace(row['altticker'], row['name'])
    return comment

# Apply the function to the 'comment' column
filtered_df['comment'] = filtered_df['comment'].apply(replace_tickers)
filtered_df = filtered_df['comment']

filtered_df.to_csv("seven_days_comments.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['comment'] = filtered_df['comment'].apply(replace_tickers)


## Topic Modelling

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

vectoriser = CountVectorizer(ngram_range=(1,2), stop_words="english")
model = BERTopic(
    vectorizer_model = vectoriser,
    language = 'english',
    calculate_probabilities = True,
    verbose = True
)

text = """



"""
topics, probs = model.fit_transform(text)

## Summarisation

In [15]:
condition1 = df['matched_phrase'] == 'JPMorgan'
condition2 = df['matched_phrase'] == 'jpm'
condition3 = df['matched_phrase'] == 'Jpm'
condition4 = df['matched_phrase'] == 'JPM'
condition5 = df['matched_phrase'] == 'jpmorgan'
condition6 = df['matched_phrase'] == 'JPMorgan'
condition7 = df['matched_phrase'] == 'JPMorgan chase'

filtered_df = df[condition1 | condition2 | condition3 | condition4 | condition5 | condition6 | condition7]

jpmorgan_comments = filtered_df['comment'].to_list()
print(len(jpmorgan_comments))
jpmorgan_comments = '. '.join(jpmorgan_comments)
jpmorgan_comments

122




In [17]:
from pathlib import Path

# Related third party imports
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch


model_path = Path.cwd().parent / 'models' / 'bart-cnn-large'
summariser = BartForConditionalGeneration.from_pretrained(model_path)
tokeniser = BartTokenizer.from_pretrained(model_path)

inputs = tokeniser([jpmorgan_comments], max_length=1024, return_tensors='pt', truncation=True)
summary_ids = summariser.generate(inputs['input_ids'], 
                                    max_length=20, 
                                    min_length=10,
                                    early_stopping=True,
                                    num_beams=4,
                                    temperature=1.0,
                                    repetition_penalty=5.0
                                    )
summary = tokeniser.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

JPM 0te puts are pretty cheap, just saying.. Inverse JPMorgan index?


In [19]:
from transformers import BartTokenizer, BartForSequenceClassification

tokenizer = BartTokenizer.from_pretrained(model_path)
classifier = BartForSequenceClassification.from_pretrained(model_path)
inputs = tokeniser([jpmorgan_comments], max_length=1024, return_tensors='pt', truncation=True)
labels = torch.tensor([1]).unsqueeze(0) 
outputs = classifier(inputs['input_ids'], labels=labels)
loss, logits = outputs[:2]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at c:\Users\333866\Documents\dev\models\bart-cnn-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
