In [None]:
from google.colab import drive
drive.mount('/content/drive')

Cleaning Dataset

In [None]:
import os
import pandas as pd
from datetime import datetime

In [None]:
base_path = '/content/drive/MyDrive/Final_Project/ACL19_Release'

In [None]:
# Remove Audio Files from earnings call folders

# Walk every earnings-call folder and delete the .mp3 clips in its "CEO" sub-folder.
for company_folder in os.listdir(base_path):
    ceo_audio_folder = os.path.join(base_path, company_folder, "CEO")

    # Entries without a CEO directory have no audio to remove
    if not os.path.isdir(ceo_audio_folder):
        continue

    mp3_files = [entry for entry in os.listdir(ceo_audio_folder) if entry.endswith(".mp3")]
    for file in mp3_files:
        try:
            os.remove(os.path.join(ceo_audio_folder, file))
            print(f"Deleted: {file}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next file
            print(f"Error deleting {file}: {e}")

In [None]:
#Separate company from date

rows = []

for folder in os.listdir(base_path):
    folder_path = os.path.join(base_path, folder)
    if not os.path.isdir(folder_path):
        continue

    try:
        # Folder names are parsed as "<company>_<YYYYMMDD>", split on the last underscore
        company_name, date_str = folder.rsplit('_', 1)
        earnings_date = datetime.strptime(date_str, '%Y%m%d').date()

        # Only folders that contain a TextSequence.txt transcript produce a row
        text_file = os.path.join(folder_path, "TextSequence.txt")
        if not os.path.isfile(text_file):
            continue

        with open(text_file, 'r', encoding='utf-8') as f:
            text = f.read().strip()

        rows.append({
            'Company_Name': company_name,
            'Earnings_Call_Date': earnings_date,
            'Text': text
        })
    except Exception as e:
        # A malformed folder name or unreadable file is reported and skipped
        print(f"Error processing {folder}: {e}")

# Save results to CSV (one row per transcript)
df = pd.DataFrame(rows)
df.to_csv("earnings_calls_cleaned.csv", index=False)

In [None]:
df.to_csv("/content/drive/MyDrive/Final_Project/earnings_calls_cleaned.csv", index=False)

In [None]:
earnings_call_dataset = pd.read_csv('/content/drive/MyDrive/Final_Project/earnings_calls_cleaned.csv')

In [None]:
earnings_call_dataset.head()

In [None]:
earnings_call_dataset.info()

Adding Company Codes and Industries to Dataset

In [None]:
!pip install fuzzywuzzy[speedup] pandas

In [None]:
from fuzzywuzzy import fuzz

fuzz.ratio('A.O. Smith Corp', 'A.O. Smith Corporation Common Stock')

In [None]:
nasdaq_df = pd.read_csv('/content/drive/MyDrive/Final_Project/nasdaq_screener_1746290920721.csv')

# Market-data columns that are not needed for matching names to tickers/sectors
unused_columns = ['Last Sale', 'Net Change', '% Change', 'Market Cap', 'IPO Year', 'Volume']

nasdaq_df = (
    nasdaq_df
    # Strip the literal "Common Stock" from listing names, then trim whitespace
    .assign(Name=nasdaq_df['Name'].str.replace('Common Stock', '', regex=False).str.strip())
    .drop(columns=unused_columns)
)

nasdaq_df.head(10)

In [None]:
nasdaq_df.to_csv("/content/drive/MyDrive/Final_Project/nasdaq.csv", index=False)

In [None]:
earnings_df = pd.read_csv('/content/drive/MyDrive/Final_Project/earnings_calls_cleaned.csv')
reference_df = pd.read_csv('/content/drive/MyDrive/Final_Project/nasdaq.csv')

In [None]:
from fuzzywuzzy import process

# Normalise both name columns the same way (lower-case, trimmed) before comparing
earnings_df['Company_Name_cleaned'] = earnings_df['Company_Name'].str.lower().str.strip()
reference_df['Name_cleaned'] = reference_df['Name'].str.lower().str.strip()

# Candidate pool for the fuzzy lookup
reference_names = reference_df['Name_cleaned'].tolist()

matched_names = []
match_scores = []

# Best fuzzy candidate per earnings-call company; kept only at a score of 85 or more
for name in earnings_df['Company_Name_cleaned']:
    match, score = process.extractOne(name, reference_names)
    accepted = score >= 85
    matched_names.append(match if accepted else None)
    match_scores.append(score if accepted else None)

earnings_df['Matched_Name'] = matched_names
earnings_df['Match_Score'] = match_scores

# Left join: companies without an accepted match keep empty reference columns
merged_df = pd.merge(
    earnings_df,
    reference_df,
    left_on='Matched_Name',
    right_on='Name_cleaned',
    how='left'
)

# Columns carried into the enriched export
output_columns = [
    'Company_Name', 'Earnings_Call_Date', 'Text',
    'Symbol', 'Country', 'Sector', 'Industry', 'Match_Score'
]
final_df = merged_df[output_columns].rename(columns={'Symbol': 'Stock_Ticker'})


final_df.to_csv('/content/drive/MyDrive/Final_Project/enriched_earnings_calls.csv', index=False)

In [None]:
final_df

Not getting great results with fuzzy matching; attempting TF-IDF vector similarities instead

In [None]:
# Clean company name columns
import re

def clean_name(name):
    """Normalise a company name for matching.

    Lower-cases the name, deletes every character that is not a-z, 0-9 or a
    space, removes the stand-alone legal suffixes inc/co/corp/ltd/plc/llc and
    collapses the whitespace that is left behind.

    Args:
        name: Raw company name. Non-string values (for example the NaN pandas
            uses for a missing name) are treated as an empty name.

    Returns:
        The cleaned name, or "" when there is nothing to clean.
    """
    if not isinstance(name, str):
        return ''

    name = name.lower()
    name = re.sub(r'[^a-z0-9 ]', '', name)         # remove punctuation
    name = re.sub(r'\b(inc|co|corp|ltd|plc|llc)\b', '', name)  # remove suffixes
    # Dropping a suffix from the middle of a name leaves a double space, which
    # would make otherwise equal names compare unequal; collapse and trim
    return ' '.join(name.split())

earnings_df['Company_Name_cleaned'] = earnings_df['Company_Name'].apply(clean_name)
reference_df['Name_cleaned'] = reference_df['Name'].apply(clean_name)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fit TF-IDF on both sets of names
all_names = earnings_df['Company_Name_cleaned'].tolist() + reference_df['Name_cleaned'].tolist()
vectorizer = TfidfVectorizer()
vectorizer.fit(all_names)

earnings_vecs = vectorizer.transform(earnings_df['Company_Name_cleaned'])
reference_vecs = vectorizer.transform(reference_df['Name_cleaned'])

# Rows = earnings-call companies, columns = reference companies
similarity_matrix = cosine_similarity(earnings_vecs, reference_vecs)
best_match_idx = similarity_matrix.argmax(axis=1)
best_scores = similarity_matrix.max(axis=1)

# Accept the closest reference name only above the 0.85 similarity cut-off
reference_cleaned = reference_df['Name_cleaned']
matched_names = [
    reference_cleaned.iloc[ref_pos] if best_score > 0.85 else None
    for ref_pos, best_score in zip(best_match_idx, best_scores)
]

earnings_df['Matched_Name'] = matched_names

In [None]:
# Several listings can collapse to the same cleaned name. Joining on a
# non-unique key would emit one output row per duplicate and silently
# multiply earnings calls, so keep a single reference row per cleaned name.
reference_unique = reference_df.drop_duplicates(subset='Name_cleaned', keep='first')

merged_df = earnings_df.merge(reference_unique, left_on='Matched_Name', right_on='Name_cleaned', how='left')

# A left join against a unique key must preserve the number of earnings calls
assert len(merged_df) == len(earnings_df), "merge changed the number of earnings-call rows"

merged_df.info()

In [None]:
df_view = merged_df.drop(columns=['Text'])

df_view

In [None]:
merged_df = merged_df.drop(columns=['Company_Name_cleaned', 'Matched_Name', 'Name', 'Name_cleaned'])

In [None]:
merged_df = merged_df.rename(columns={'Symbol': 'Stock_Ticker'})

In [None]:
merged_df = merged_df.dropna()

merged_df

In [None]:
merged_df.to_csv('/content/drive/MyDrive/Final_Project/enriched_earnings_calls.csv', index=False)

In [None]:
unique_count = merged_df['Company_Name'].nunique()
print(f"Number of unique companies: {unique_count}")

In [None]:
company_counts = merged_df['Company_Name'].value_counts()
print(company_counts)

In [None]:
company_counts_df = company_counts.reset_index()
company_counts_df.columns = ['Company_Name', 'Row_Count']
company_counts_df

In [None]:
valid_companies = company_counts[company_counts >= 2].index

filtered_df = merged_df[merged_df['Company_Name'].isin(valid_companies)]

filtered_df

In [None]:
import matplotlib.pyplot as plt

# Count unique companies by sector
companies_by_sector = (
    merged_df
    .groupby('Sector')['Company_Name']
    .nunique()
    .sort_values(ascending=False)
)

# Bar chart, largest sector first
fig, ax = plt.subplots(figsize=(12, 6))
companies_by_sector.plot(kind='bar', ax=ax)
ax.set_title('Number of Unique Companies by Sector')
ax.set_xlabel('Sector')
ax.set_ylabel('Unique Company Count')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

Retrieve Stock Data

In [None]:
!pip install yfinance
!pip install yfinance tqdm

In [None]:
merged_df = pd.read_csv('/content/drive/MyDrive/Final_Project/enriched_earnings_calls.csv')

merged_df

In [None]:
import yfinance as yf
ticker = yf.Ticker("ABBV")
hist = ticker.history(start="2017-07-20", end="2017-08-01")
print(hist)

In [None]:
from datetime import timedelta
import yfinance as yf
import pandas as pd
import time

records = []

# For every earnings call, download daily closes from 5 calendar days before
# the call up to (per yfinance's `end` handling) 5 calendar days after it.
for idx, row in merged_df.iterrows():
    ticker = row['Stock_Ticker']
    call_date = pd.to_datetime(row['Earnings_Call_Date'])

    if pd.notnull(ticker) and pd.notnull(call_date):
        try:
            start = call_date - timedelta(days=5)
            end = call_date + timedelta(days=5)
            data = yf.download(ticker, start=start, end=end, progress=False)

            if not data.empty:
                close = data['Close']
                # Depending on the yfinance version, the columns can be
                # multi-level (field, ticker); data['Close'] is then a
                # one-column DataFrame. Reduce it to a plain Series so each
                # record stores a number rather than the repr of a Series.
                if isinstance(close, pd.DataFrame):
                    close = close.iloc[:, 0]

                for date, close_value in close.items():
                    records.append({
                        'Ticker': ticker,
                        'Call_Date': call_date.date(),
                        'Price_Date': date.date(),
                        'Close': float(close_value)
                    })
            # Pause between requests to stay polite to the data provider
            time.sleep(1)
        except Exception as e:
            print(f"[{ticker} on {call_date}] Failed: {e}")

In [None]:
price_df = pd.DataFrame(records)

In [None]:
import re

def use_regex(input_text):
    """Return the first decimal number found in ``input_text`` as a float.

    The argument is converted with ``str()`` first. Only numbers written with
    a decimal point and at least one digit after it are recognised (a leading
    sign is not part of the match); ``None`` is returned when there is none.
    """
    found = re.search(r"[0-9]*\.[0-9]+", str(input_text))
    return float(found.group(0)) if found else None

In [None]:
price_df['Close_Clean'] = price_df['Close'].apply(use_regex)

In [None]:
price_df.drop(columns=['Close'], inplace=True)

In [None]:
price_df.to_csv('/content/drive/MyDrive/Final_Project/stock_price_data.csv', index=False)

In [None]:
price_df = pd.read_csv('/content/drive/MyDrive/Final_Project/stock_price_data.csv')

In [None]:
price_df.head()

In [None]:
import pandas as pd

# Convert dates
price_df['Call_Date'] = pd.to_datetime(price_df['Call_Date'])
price_df['Price_Date'] = pd.to_datetime(price_df['Price_Date'])

metrics = []

# One group per (ticker, earnings call): the closes downloaded around that call
grouped = price_df.groupby(['Ticker', 'Call_Date'])

for (ticker, call_date), group in grouped:
    group = group.sort_values('Price_Date')

    # Coerce closes to numbers (unparseable values become NaN) so the
    # arithmetic below yields NaN instead of raising; this replaces the
    # former blanket `except: pass` blocks, which hid every kind of error.
    closes = pd.to_numeric(group['Close_Clean'], errors='coerce')

    # Closes strictly before the call date, and closes on/after it
    pre_closes = closes[group['Price_Date'] < call_date]
    post_closes = closes[group['Price_Date'] >= call_date]

    price_change_1d = None
    avg_change = None

    # Relative change from the last pre-call close to the SECOND close in the
    # on/after-call window (index 1, i.e. the row after the first post row)
    if not pre_closes.empty and len(post_closes) >= 2:
        pre_close = pre_closes.iloc[-1]
        post_close = post_closes.iloc[1]
        price_change_1d = (post_close - pre_close) / pre_close

    # Volatility: standard deviation of up to the first 3 close-to-close
    # returns inside the on/after-call window (NaN with fewer than 2 returns).
    # It is exported as 'Volatility_2d' because later cells read that name.
    post_returns = post_closes.pct_change().dropna()
    volatility_3d = post_returns.head(3).std()

    # Average price change: mean of the last (up to) 3 pre-call closes vs.
    # mean of the first (up to) 3 on/after-call closes; needs 2+ on each side
    avg_pre_prices = pre_closes.tail(3)
    avg_post_prices = post_closes.head(3)
    if len(avg_pre_prices) >= 2 and len(avg_post_prices) >= 2:
        avg_pre = avg_pre_prices.mean()
        avg_post = avg_post_prices.mean()
        avg_change = (avg_post - avg_pre) / avg_pre

    metrics.append({
        'Stock_Ticker': ticker,
        'Earnings_Call_Date': call_date,
        'Price_Change_1d': price_change_1d,
        'Volatility_2d': volatility_3d,
        'Avg_Price_Change': avg_change
    })

price_metrics_df = pd.DataFrame(metrics)

In [None]:
price_metrics_df.info()

Preprocessing Earnings Calls

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, names
import string

# One-time downloads
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('names')

# Initialize components
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

#Remove names
name_list = set([name.lower() for name in names.words()])

# Add custom stopwords
custom_stopwords = set(['quarter', 'guidance', 'call', 'fiscal', 'update', 'thank', 'thanks', 'questions', 'indiscernible', 'inaudible', 'okay', 'today', 'year', 'wa', 'think', 'million', 'billion', 'one', 'two', 'three', 'well', 'would', 'weve', 'going', 'really'])
stop_words.update(custom_stopwords)

# Acronym mapping
acronym_map = {
    'eps': 'earnings_per_share',
    'gaap': 'accounting_standard',
    'ebit': 'operating_income'
}

def my_preprocessor(text):
    """Normalise a transcript into a space-separated string of cleaned tokens.

    Steps, in order: lower-case; drop bracketed tags that mention
    "technical difficulty"; delete every character that is not a-z or
    whitespace; tokenize; replace acronyms via ``acronym_map``; drop tokens of
    two characters or fewer; lemmatize; drop tokens in ``stop_words``; drop
    tokens in ``name_list``.

    Args:
        text: Raw transcript text. Non-string values (e.g. NaN) yield "".

    Returns:
        The surviving tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    # Lowercase and remove known tags
    text = text.lower()
    # [^\[\]]* cannot cross a bracket, so only the tag that actually contains
    # "technical difficulty" is removed. The previous lazy `.*?` could start
    # at an earlier "[" on the same line and swallow the transcript text
    # between that bracket and the tag. (No IGNORECASE needed: text is
    # already lower-cased.)
    text = re.sub(r'\[[^\[\]]*technical difficulty[^\[\]]*\]', '', text)

    # Remove numbers, punctuation, and special characters
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Replace acronyms
    tokens = [acronym_map.get(token, token) for token in tokens]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove stopwords (checked after lemmatizing, so entries must be lemmas)
    tokens = [token for token in tokens if token not in stop_words]

    # Remove common names
    tokens = [token for token in tokens if token not in name_list]

    # Return clean string
    return ' '.join(tokens)

In [None]:
merged_df.to_csv('/content/drive/MyDrive/Final_Project/enriched_earnings_calls_processed.csv', index=False)

In [None]:
merged_df = pd.read_csv('/content/drive/MyDrive/Final_Project/enriched_earnings_calls_processed.csv')

In [None]:
merged_df

Topic Modelling

Attempting LDA First

In [None]:
!pip uninstall -y smart_open
!pip install --upgrade smart_open
!pip install --upgrade gensim

In [None]:
!pip install --upgrade gensim
!pip install smart_open==5.2.1

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

# Step 1: TF-IDF vectorization (top 5000 terms, cleaned by my_preprocessor)
# NOTE(review): the matrix is named count_matrix but holds TF-IDF weights, and
# scikit-learn describes LDA in terms of word counts - confirm whether
# CountVectorizer was intended here.
vectorizer = TfidfVectorizer(preprocessor=my_preprocessor, max_features=5000)
count_matrix = vectorizer.fit_transform(merged_df['Text'])

# The vocabulary does not change between models, so look it up once
feature_names = vectorizer.get_feature_names_out()

# Step 2: fit one LDA model per candidate topic count and report on each
topic_models = {}
for num_topics in (3, 4, 5, 6):
    print(f"Training LDA with {num_topics} topics...")

    lda_model = LatentDirichletAllocation(
        n_components=num_topics,
        learning_method='online',
        random_state=42,
        max_iter=10
    ).fit(count_matrix)

    # Keep every fitted model so it can be inspected later
    topic_models[num_topics] = lda_model

    perplexity = lda_model.perplexity(count_matrix)
    print(f"Perplexity for {num_topics} topics: {perplexity:.2f}")

    # Ten highest-weighted terms per topic, strongest first
    for idx, topic in enumerate(lda_model.components_):
        strongest = topic.argsort()[::-1][:10]
        top_words = [feature_names[i] for i in strongest]
        print(f"Topic {idx}: {', '.join(top_words)}")
    print("\n" + "-"*60 + "\n")

In [None]:
best_num_topics = 4  # Higher perplexity score than 3, but slightly more diversity in topics

lda_model = LatentDirichletAllocation(n_components=best_num_topics,
                                           learning_method='online',
                                           random_state=42,
                                           max_iter=10)

# Fit the model with count matrix
lda_best = lda_model.fit_transform(count_matrix)

In [None]:
!pip install pyLDAvis
!pip install --upgrade numpy

In [None]:
import pyLDAvis
import pyLDAvis.lda_model as sklearnvis

pyLDAvis.enable_notebook()
vis_data = sklearnvis.prepare(lda_model, count_matrix, vectorizer)
pyLDAvis.display(vis_data)

Now seeing if SBERT improves results

In [None]:
!pip install --upgrade pip

# Reinstall numpy first
!pip install --force-reinstall numpy==1.24.4

# Then reinstall scipy to match numpy
!pip install --force-reinstall scipy==1.10.1

# Then reinstall BERTopic cleanly
!pip install --force-reinstall bertopic

# Also reinstall sentence-transformers just in case
!pip install --force-reinstall sentence-transformers

In [None]:
!pip install numpy==1.26.4 --upgrade --force-reinstall

In [None]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# Pass the encoders a plain list of strings rather than a pandas Series, and
# actually neutralise missing transcripts (NaN -> "") instead of only saying so
docs = merged_df['Text'].fillna('').astype(str).tolist()

sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Compute embeddings (with progress bar)
sbert_embeddings = sbert_model.encode(docs, show_progress_bar=True)

In [None]:
import scipy.cluster.hierarchy as sc
import matplotlib.pyplot as plt

sc.dendrogram(sc.linkage(sbert_embeddings, method='ward'))
plt.show()

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage clustering of the SBERT embeddings for k = 3..6, scored by silhouette
for k in (3, 4, 5, 6):
    my_clustering = AgglomerativeClustering(n_clusters=k, linkage='ward')
    SBERT_cluster_labels = my_clustering.fit(sbert_embeddings).labels_
    score = silhouette_score(sbert_embeddings, SBERT_cluster_labels)
    print(f"Clusters: {k}, Silhouette Score: {score:.4f}")

In [None]:
from sklearn.cluster import KMeans

# K-Means on the SBERT embeddings for k = 3..6, scored by silhouette
for k in (3, 4, 5, 6):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    SBERT_cluster_labels_kmeans = kmeans.fit(sbert_embeddings).labels_
    score = silhouette_score(sbert_embeddings, SBERT_cluster_labels_kmeans)
    print(f"K-Means Clusters: {k}, Silhouette Score: {score:.4f}")

USE Model

In [None]:
import tensorflow_hub as hub

use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

use_embeddings = use_model(docs)

sc.dendrogram(sc.linkage(use_embeddings, method='ward'))
plt.show()

In [None]:
# Ward-linkage clustering of the USE embeddings for k = 3..6, scored by silhouette
for k in (3, 4, 5, 6):
    my_clustering = AgglomerativeClustering(n_clusters=k, linkage='ward')
    USE_cluster_labels = my_clustering.fit(use_embeddings).labels_
    score = silhouette_score(use_embeddings, USE_cluster_labels)
    print(f"Clusters: {k}, Silhouette Score: {score:.4f}")

In [None]:
# K-Means on the USE embeddings for k = 3..6, scored by silhouette
for k in (3, 4, 5, 6):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    USE_cluster_labels_kmeans = kmeans.fit(use_embeddings).labels_
    score = silhouette_score(use_embeddings, USE_cluster_labels_kmeans)
    print(f"K-Means Clusters: {k}, Silhouette Score: {score:.4f}")

In [None]:
# Final clustering of the USE embeddings with 5 clusters. n_init matches the
# sweep so the chosen model is fitted the same way as the ones compared.
kmean_best = KMeans(n_clusters=5, random_state=42, n_init=10)
# Fit the model configured above - not the leftover `kmeans` from the sweep,
# which is the last loop iteration (6 clusters) rather than the chosen 5
best_cluster_labels = kmean_best.fit_predict(use_embeddings)

Adding USE and LDA Topics to merged_df

In [None]:
# Hard cluster label per transcript from the USE embeddings
merged_df['USE_Topic'] = best_cluster_labels

# One column per LDA topic holding that topic's weight for each transcript
for i, topic_weights in enumerate(lda_best.T):
    merged_df[f'LDA_Topic_{i}'] = topic_weights

merged_df

Sentiment Analysis

In [None]:
!pip install transformers
!pip install -U sentencepiece

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch

# Load FinBERT (Prosus version fine-tuned on financial text)
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [None]:
def get_finbert_sentiment(text):
    """Score one text with FinBERT and return its class probabilities.

    The text is tokenized and truncated to 512 tokens, run through ``model``
    without gradients, and the logits are soft-maxed. Each probability is
    assigned to its class through ``model.config.id2label`` instead of a
    hard-coded position, so the result stays correct whatever order the
    checkpoint uses for positive / negative / neutral.

    Args:
        text: The text to score.

    Returns:
        A dict with keys FinBERT_Positive, FinBERT_Neutral, FinBERT_Negative
        and FinBERT_Polarity (positive minus negative). All values are None
        when scoring fails (for example when ``text`` is not a string).
    """
    try:
        # Truncate by tokens; slicing the string (text[:512]) would cut at 512
        # characters and waste most of the 512-token window
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            logits = model(**inputs).logits
        scores = softmax(logits.numpy()[0])

        # Map each output index to its label name as declared by the checkpoint
        by_label = {
            str(model.config.id2label[i]).lower(): scores[i]
            for i in range(len(scores))
        }
        positive = by_label['positive']
        neutral = by_label['neutral']
        negative = by_label['negative']

        return {
            'FinBERT_Positive': positive,
            'FinBERT_Neutral': neutral,
            'FinBERT_Negative': negative,
            'FinBERT_Polarity': positive - negative  # Positive - Negative
        }
    except Exception as e:
        # Best effort: report why this text could not be scored, then return
        # an all-None row so the caller's DataFrame keeps its shape
        print(f"FinBERT scoring failed: {e}")
        return {
            'FinBERT_Positive': None,
            'FinBERT_Neutral': None,
            'FinBERT_Negative': None,
            'FinBERT_Polarity': None
        }

In [None]:
sentiment_scores = merged_df['Text'].apply(get_finbert_sentiment)
# Reuse merged_df's index: concat(axis=1) aligns on index labels, so a fresh
# 0..n-1 index would misalign the scores whenever merged_df's index is not
# the default one (e.g. after rows were dropped)
sentiment_df = pd.DataFrame(sentiment_scores.tolist(), index=merged_df.index)

# Merge back into merged_df; dropping any existing sentiment columns first
# keeps the cell safe to re-run without creating duplicate column names
merged_df = pd.concat(
    [merged_df.drop(columns=sentiment_df.columns, errors='ignore'), sentiment_df],
    axis=1
)

Merge all data together now

In [None]:
# Both frames must hold datetimes in the join column, so parse this side first
merged_df = merged_df.assign(
    Earnings_Call_Date=pd.to_datetime(merged_df['Earnings_Call_Date'])
)

# Left join keeps every earnings call, with or without price metrics
join_keys = ['Stock_Ticker', 'Earnings_Call_Date']
merged_df = pd.merge(merged_df, price_metrics_df, how='left', on=join_keys)

In [None]:
merged_df.info()

In [None]:
merged_df.to_csv('/content/drive/MyDrive/Final_Project/final_earnings_calls.csv', index=False)

Seeing how the USE Model predicts topics

In [None]:
industry_labels = pd.get_dummies(merged_df['Sector'])
merged_with_labels = pd.concat([merged_df, industry_labels], axis=1)

In [None]:
topic_to_label = {}
sector_columns = industry_labels.columns

# Label each USE topic with the sector that has the most transcripts in it
for topic in merged_with_labels['USE_Topic'].unique():
    in_topic = merged_with_labels['USE_Topic'] == topic
    sector_totals = merged_with_labels.loc[in_topic, sector_columns].sum()
    topic_to_label[topic] = sector_totals.idxmax()

print("Assigned Topics to Industry:", topic_to_label)

In [None]:
merged_with_labels['Predicted_Industry'] = merged_with_labels['USE_Topic'].map(topic_to_label)

In [None]:
from sklearn.metrics import classification_report

true_labels = merged_with_labels[industry_labels.columns].idxmax(axis=1)
predicted_labels = merged_with_labels['Predicted_Industry']

print(classification_report(true_labels, predicted_labels))

Analysis

In [None]:
merged_df = pd.read_csv('/content/drive/MyDrive/Final_Project/final_earnings_calls.csv')

In [None]:
merged_df[['FinBERT_Polarity', 'Price_Change_1d', 'Volatility_2d', 'Avg_Price_Change']].corr()

Test on Price Volatility

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Binary target: volatility above the median of the available values
merged_df['High_Volatility'] = merged_df['Volatility_2d'] > merged_df['Volatility_2d'].median()

use_topic_dummies = pd.get_dummies(merged_df['USE_Topic'], prefix='Topic')

features = pd.concat([
    merged_df[['FinBERT_Positive', 'FinBERT_Neutral', 'FinBERT_Negative']],
    use_topic_dummies
], axis=1)

# `NaN > median` is False, so calls without a volatility measurement would be
# labelled "low volatility". Train and evaluate only on rows where the target
# was actually measured and every feature is present.
usable = merged_df['Volatility_2d'].notna() & features.notna().all(axis=1)

X = features[usable]
y = merged_df.loc[usable, 'High_Volatility'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
import numpy as np

importances = clf.feature_importances_
feature_names = X.columns

# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importance (Random Forest)")
bar_positions = range(len(feature_names))
ax.bar(bar_positions, importances[indices])
ax.set_xticks(bar_positions)
ax.set_xticklabels(feature_names[indices], rotation=45, ha='right')
fig.tight_layout()
plt.show()

Test on AVG Price Change

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Features
features = pd.concat([
    merged_df[['FinBERT_Positive', 'FinBERT_Neutral', 'FinBERT_Negative']],
    pd.get_dummies(merged_df['USE_Topic'], prefix='Topic')
], axis=1)

# Target
target = merged_df['Avg_Price_Change']

# Avg_Price_Change is missing for calls without enough price data, and the
# estimators fitted on this split reject NaN targets; keep only rows where
# the target and all features are present
usable = target.notna() & features.notna().all(axis=1)
X = features[usable]
y = target[usable]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)

# Evaluate
y_pred = reg.predict(X_test)
print("R²:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train, y_train)

print("R²:", linreg.score(X_test, y_test))

Test on if price moves up or down

In [None]:
def _price_direction(change):
    """Return 1 for a move above +1%, 0 for a move below -1%, NaN otherwise."""
    if change > 0.01:
        return 1
    if change < -0.01:
        return 0
    # Moves within +/-1% (and missing values) get no label
    return np.nan


merged_df['Price_Up'] = merged_df['Price_Change_1d'].apply(_price_direction)
# Keep only the calls that received an up/down label
merged_df = merged_df.dropna(subset=['Price_Up'])

In [None]:
use_topic_dummies = pd.get_dummies(merged_df['USE_Topic'], prefix='Topic')
X = pd.concat([
    merged_df[['FinBERT_Positive', 'FinBERT_Neutral', 'FinBERT_Negative']],
    use_topic_dummies
], axis=1)

y = merged_df['Price_Up'].astype(int)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
importances = clf.feature_importances_
feature_names = X.columns

# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importance (Random Forest)")
bar_positions = range(len(feature_names))
ax.bar(bar_positions, importances[indices])
ax.set_xticks(bar_positions)
ax.set_xticklabels(feature_names[indices], rotation=45, ha='right')
fig.tight_layout()
plt.show()

Test using only sentiment scores

In [None]:
# Define features (sentiment only)
X = merged_df[['FinBERT_Positive', 'FinBERT_Neutral', 'FinBERT_Negative']]

# Define target (adjust for your use case — either Price_Up or High_Volatility)
y = merged_df['Price_Up'].astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Random Forest
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
importances = clf.feature_importances_
feature_names = X.columns

# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importance (Random Forest)")
bar_positions = range(len(feature_names))
ax.bar(bar_positions, importances[indices])
ax.set_xticks(bar_positions)
ax.set_xticklabels(feature_names[indices], rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Define features (sentiment only)
X = merged_df[['FinBERT_Positive', 'FinBERT_Neutral', 'FinBERT_Negative']]

# Define target (adjust for your use case — either Price_Up or High_Volatility)
y = merged_df['High_Volatility'].astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Random Forest
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
merged_df.head()