In [1]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk.tokenize import word_tokenize
import string
import nltk
import gensim.downloader as api
from gensim.models import Word2Vec

# Load the JSON corpus. JSON is UTF-8 by specification, so the encoding is
# given explicitly instead of relying on the platform default.
with open('consolidated_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
# NOTE(review): the code below iterates `data` directly and treats every item
# as a string. If the JSON root is an object (dict), iteration yields its keys
# rather than its values -- confirm this is the intended text source.

# Seed vocabulary of financial terms used by filter_financial_text
seed_words = ['financial', 'bank', 'economy', 'market', 'investment', 'revenue', 'profit', 'capital', 'asset', 'income', 'equity', 'debt', 'loan', 'stock', 'bond', 'dividend']

# Download the NLTK 'punkt' tokenizer data required by word_tokenize
nltk.download('punkt')

# Load the pre-trained Google News Word2Vec vectors through gensim's downloader.
# NOTE(review): `word2vec_model` is not referenced anywhere else in the visible
# code -- confirm it is still needed before paying for the download.
word2vec_model = api.load('word2vec-google-news-300')

# Train a new Word2Vec model from scratch on the tokenized, lower-cased corpus.
# min_count=1 keeps every token that occurs in the corpus in the vocabulary.
financial_sentences = [word_tokenize(text.lower()) for text in data]
financial_word2vec_model = Word2Vec(financial_sentences, vector_size=300, window=5, min_count=1)

# Define a function to filter text data based on financial terms
def filter_financial_text(text, threshold):
    """Return whether *text* is, on average, similar enough to the seed words.

    The text is lower-cased and tokenized, then every (token, seed word) pair
    is scored with the cosine similarity of the corpus-trained Word2Vec model.
    The scores are averaged over all scored pairs and compared to *threshold*.

    Args:
        text: Raw text to classify.
        threshold: Minimum mean cosine similarity for the text to count as
            financial.

    Returns:
        True when the mean pairwise similarity is strictly greater than
        *threshold*. False otherwise, including when nothing can be scored
        (no tokens, or no token / seed word present in the model vocabulary).
    """
    vectors = financial_word2vec_model.wv
    vocabulary = vectors.key_to_index

    # Only words the model knows can be scored; similarity() raises KeyError
    # for anything outside its vocabulary.
    known_tokens = [token for token in word_tokenize(text.lower()) if token in vocabulary]
    known_seeds = [seed_word for seed_word in seed_words if seed_word in vocabulary]

    pair_count = len(known_tokens) * len(known_seeds)
    if pair_count == 0:
        # Nothing to compare: avoid dividing by zero and treat as non-financial.
        return False

    total_similarity = sum(
        vectors.similarity(token, seed_word)
        for token in known_tokens
        for seed_word in known_seeds
    )
    # Divide by the number of scored pairs so the result is a true mean in
    # [-1, 1], independent of how many seed words are configured.
    avg_cosine_similarity = total_similarity / pair_count
    return avg_cosine_similarity > threshold

# Define a function to preprocess the text data
def preprocess_text(text):
    """Normalize *text* into a space-separated string of cleaned tokens.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is tokenized with ``word_tokenize``, and any token
    that contains the substring ``'txt'`` is discarded.
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_stripper)

    kept_words = []
    for word in word_tokenize(cleaned):
        if 'txt' in word:
            continue  # Skip tokens containing 'txt'
        kept_words.append(word)

    return ' '.join(kept_words)

# Run every document through the preprocessing step
preprocessed_texts = list(map(preprocess_text, data))

# Build the TF-IDF document-term matrix from the cleaned documents
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(preprocessed_texts)

# Factorize the TF-IDF matrix into topics with Non-Negative Matrix Factorization (NMF);
# the fixed random_state keeps the factorization reproducible between runs
num_topics = 5  # Number of topics to extract
nmf_model = NMF(n_components=num_topics, random_state=42).fit(X_tfidf)

# Display the topics generated by NMF
def display_topics(model, feature_names, num_top_words):
    """Print a heading and the top-weighted terms for every topic of *model*.

    Each row of ``model.components_`` is treated as one topic. For each topic
    a ``Topic N:`` line (numbered from 1) is printed, followed by a line with
    the ``num_top_words`` terms of highest weight, heaviest first, separated
    by spaces. ``feature_names`` maps a column index to its term.
    """
    for number, weights in enumerate(model.components_, start=1):
        print(f"Topic {number}:")
        # argsort is ascending, so reverse it to rank the heaviest terms first
        ranked_indices = weights.argsort()[::-1]
        top_terms = [feature_names[idx] for idx in ranked_indices[:num_top_words]]
        print(" ".join(top_terms))

# Print the highest-weighted TF-IDF terms of each NMF topic
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()
num_top_words = 10  # How many terms to list per topic
display_topics(nmf_model, feature_names_tfidf, num_top_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Topic 1:
general caseys us dollar delek foods air huntington products national
Topic 2:
american axle republic huntington td old ingalls data home fidelity
Topic 3:
united states us foods delek dollar air products alaska westinghouse
Topic 4:
financial southern citizens securian western pnc services for thrivent us
Topic 5:
of bank packaging corp life group reinsurance intl expeditors labratory
