In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.chunk import RegexpParser
import pandas as pd

In [8]:
# Load the financial dataset from the CSV that sits one directory above the notebook.
data_financial = pd.read_csv('../financial_data.csv')

In [9]:
# Pick a single document to experiment with: label 2 of the 'text' column.
text = data_financial['text'][2]

# Part-of-speech (POS) tagging to identify the grammatical role of each word:
# split the document into word tokens first, then tag every token.
tokens = word_tokenize(text)
tagged = pos_tag(tokens)

In [10]:
# Identify noun phrases in the text: an optional determiner, any number of
# adjectives, then one or more noun tags.
chunk_grammar = r"""
  NP: {<DT>?<JJ>*<NN.*>+}   # chunk noun phrases
"""
cp = RegexpParser(chunk_grammar)
chunked = cp.parse(tagged)

# Exclude prepositions and conjunctions from noun phrases: chunk everything,
# then chink (cut out) the IN / CC tags.
chink_grammar = r"""
  NP:
    {<.*>+}                # match any word
    }<IN|CC>+{             # chink prepositions and conjunctions
"""
cp = RegexpParser(chink_grammar)
# NOTE(review): the chink parser is fed the already-chunked tree, not the flat
# `tagged` list, so its result builds on the first pass -- confirm that this
# cascade is intended.
# (A stray trailing line-continuation backslash was removed from this statement;
# it would have glued any non-blank following line onto the assignment.)
chinked = cp.parse(chunked)

# Render both parse trees for visual comparison.
chunked.draw()
chinked.draw()

## Topic chunking

In [25]:
import nltk
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rimai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
def preprocess_text(text):
    """Split ``text`` into sentences and tokenize each sentence into words.

    Args:
        text: The raw document string.

    Returns:
        A list with one entry per sentence, in document order; each entry is
        the list of word tokens produced for that sentence.
    """
    tokenized_sentences = []
    for sentence in sent_tokenize(text):
        tokenized_sentences.append(nltk.word_tokenize(sentence))
    return tokenized_sentences

In [27]:
def perform_lda(tokenized_sentences, num_topics=5, num_passes=10):
    """Fit an LDA topic model in which every sentence is one document.

    Args:
        tokenized_sentences: List of token lists, one per sentence.
        num_topics: Number of topics requested from the model.
        num_passes: Value forwarded to the model's ``passes`` argument.

    Returns:
        The trained ``LdaModel``. Its ``id2word`` is the dictionary built
        here from ``tokenized_sentences``.
    """
    vocabulary = Dictionary(tokenized_sentences)
    bow_corpus = list(map(vocabulary.doc2bow, tokenized_sentences))

    # Fixed random_state so repeated runs use the same seed.
    return LdaModel(
        corpus=bow_corpus,
        id2word=vocabulary,
        num_topics=num_topics,
        passes=num_passes,
        random_state=42,
    )

In [28]:
def assign_topics(lda_model, tokenized_sentences):
    """List the topic ids the model reports for each sentence.

    Args:
        lda_model: Object exposing ``id2word.doc2bow(tokens)`` and
            ``get_document_topics(bow)``; the latter must yield
            ``(topic, probability)`` pairs.
        tokenized_sentences: List of token lists, one per sentence.

    Returns:
        A list parallel to ``tokenized_sentences``; each entry holds the topic
        ids (probabilities dropped) in the order the model returned them.
    """
    def _topic_ids(sent_tokens):
        # Bag-of-words encoding uses the same dictionary the model was built with.
        bow_vector = lda_model.id2word.doc2bow(sent_tokens)
        return [topic_id for topic_id, _prob in lda_model.get_document_topics(bow_vector)]

    return [_topic_ids(sent_tokens) for sent_tokens in tokenized_sentences]

In [29]:
def topic_based_chunking(tokenized_sentences, topics_per_sentence):
    """Group consecutive sentences that share the same set of topics.

    Args:
        tokenized_sentences: List of token lists, one per sentence.
        topics_per_sentence: List parallel to ``tokenized_sentences`` holding
            the topic ids of each sentence.

    Returns:
        A list of chunks; each chunk is a list of consecutive sentences
        (token lists). A new chunk starts whenever the topic set differs from
        the previous sentence's topic set.

    Note:
        An empty previous topic list is treated like "no previous sentence",
        so the sentence following it always joins the current chunk.
    """
    chunks = []
    open_chunk = []
    last_topics = []

    for sent_tokens, topics in zip(tokenized_sentences, topics_per_sentence):
        continues_chunk = (not last_topics) or set(last_topics) == set(topics)
        if not continues_chunk:
            # Topic set changed: close the running chunk and start a new one.
            chunks.append(open_chunk)
            open_chunk = []
        open_chunk.append(sent_tokens)
        last_topics = topics

    # Flush whatever is still open after the last sentence.
    if open_chunk:
        chunks.append(open_chunk)

    return chunks

In [30]:
# Read the whole extracted report into memory as one UTF-8 string.
# Note: this rebinds `text`, replacing any value assigned to it earlier.
report_path = '../extracted_data/01 01 2023_Goldman Sachs_Caution Heavy Fog.txt'
with open(report_path, encoding='utf-8') as report_file:
    text = report_file.read()

In [31]:
# Sentence-split and word-tokenize the report; yields one token list per sentence.
tokenized_sentences = preprocess_text(text)

In [32]:
# Number of sentences found in the report.
len(tokenized_sentences)

2344

In [33]:

# Fit the topic model on the report's sentences, label every sentence with its
# topics, then merge consecutive sentences that share a topic set.
lda_model = perform_lda(tokenized_sentences, num_topics=5, num_passes=10)
topics_per_sentence = assign_topics(lda_model, tokenized_sentences)
chunks = topic_based_chunking(tokenized_sentences, topics_per_sentence)

# Rebuild readable sentence strings for every chunk and keep all of them.
# Previously `ch` was a lazy generator that was overwritten on each iteration
# and never consumed, so the loop produced no usable result.
chunk_texts = []
for chunk in chunks:
    ch = [" ".join(sent_tokens) for sent_tokens in chunk]
    chunk_texts.append(ch)

## Sentence chunking

In [40]:
def sentence_based_chunking(text, max_chunk_words=200):
    """Pack consecutive sentences into chunks of at most ``max_chunk_words`` tokens.

    Sentences are never split. A single sentence longer than
    ``max_chunk_words`` becomes a chunk of its own, so that chunk exceeds the
    limit.

    Args:
        text: The raw document string.
        max_chunk_words: Upper bound on the summed token count of a chunk, as
            counted by ``nltk.word_tokenize``.

    Returns:
        A list of chunks; each chunk is a non-empty list of sentence strings
        in document order. Empty input yields an empty list.
    """
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_chunk_word_count = 0

    for sent in sentences:
        sent_word_count = len(nltk.word_tokenize(sent))

        if current_chunk_word_count + sent_word_count <= max_chunk_words:
            current_chunk.append(sent)
            current_chunk_word_count += sent_word_count
            continue

        # The sentence does not fit: close the running chunk. The guard stops
        # an empty chunk being emitted when the very first sentence is already
        # over the limit.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = [sent]
        current_chunk_word_count = sent_word_count

    # Flush the last, partially filled chunk.
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [41]:
# Chunk the report by sentence with a per-chunk word budget.
# Note: this rebinds `chunks`, replacing any value assigned to it earlier.
max_chunk_words = 200
chunks = sentence_based_chunking(text, max_chunk_words=max_chunk_words)
print(type(chunks))

<class 'list'>


In [42]:
# Inspect the fifth chunk (index 4): a list of sentence strings.
chunks[4]

['The maximum \ndrawdown in the price of 10-year US Treasuries was 22%.',
 'It seldom happens \nthat 10-year Treasuries—which have half the volatility of US equities—drop \nnearly as much as these equities.',
 'The maximum drawdown of 10-year German bunds was 21%, and that of \n10-year UK gilts, 26%.',
 'This drop in the price of 10-year UK gilts stood in sharp \ncontrast to the 5% total return of UK equities, as measured by the FTSE 100.',
 'The latter’s positive return was driven primarily by the energy sector and the \nmetals and mining sector.',
 'Shorter-term bond benchmarks declined less but declined nonetheless: the \nmaximum drop was 11% for 1- to 10- year US Treasuries, 12% for 1- to 10-year \nGerman bunds and 15% for UK gilts.',
 'Oil and natural gas were thrown into turmoil with the invasion of Ukraine, \nand energy security has been thrust back into the forefront after a long hiatus.']

In [24]:
# Join the sentences of the third chunk (index 2) into one string for display.
separator = " "
separator.join(chunks[2])

'A recession in the \nworld’s largest economy would reverberate globally. The US Congress may repeat \nthe mistakes of 2011 in failing to raise the debt limit in a timely and orderly \nmanner. China’s disorderly abandonment of its “zero-COVID” policy may \nunleash another wave of COVID-19 infections, including new variants, globally. The geopolitical outlook for 2023, too, is foggy and fraught with risk. There is \nno face-saving off-ramp for Russia from Ukraine. China is unlikely to reverse its \nassertive and aggressive posture. North Korea is expected to continue, even step \nup, its ballistic missile tests. Iran may proceed to enrich its uranium to weapons-\ngrade levels, which could elicit a military response by Israel. We proceed with caution. We start with a careful review of the turmoil in financial markets last year \nbecause this backdrop is important in understanding the fog of uncertainty still \nfacing investors. 2\nGoldman Sachs\njanuary 2023\n\n \nAmong financial assets,

In [37]:
import nltk
nltk.download('punkt')
from gensim.models import Word2Vec

# Toy corpus used only to demonstrate the Word2Vec API.
dataset = [
    "This is the first sentence.",
    "And here's the second phrase.",
    "Finally, the third sentence."
]

# Lower-case each sentence and split it into word tokens.
tokenized_dataset = []
for sentence in dataset:
    tokenized_dataset.append(nltk.word_tokenize(sentence.lower()))

# Train the Word2Vec model; min_count=1 keeps every word of the tiny corpus.
# NOTE(review): no seed is set and several workers are used, so the vectors
# may differ between runs -- confirm whether reproducibility matters here.
model = Word2Vec(
    sentences=tokenized_dataset,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Keyed vectors give access to the learned word embeddings.
word_embeddings = model.wv

# Look up the embedding of one word and its nearest neighbours.
query_word = 'sentence'
word_vector = word_embeddings[query_word]
similar_words = word_embeddings.most_similar(query_word)

print("Word Vector for 'sentence':")
print(word_vector)

print("\nSimilar words to 'sentence':")
print(similar_words)


Word Vector for 'sentence':
[ 9.5566749e-05  3.0766914e-03 -6.8137394e-03 -1.3765570e-03
  7.6686372e-03  7.3468201e-03 -3.6746713e-03  2.6434178e-03
 -8.3172321e-03  6.2056510e-03 -4.6381601e-03 -3.1638264e-03
  9.3112551e-03  8.7383972e-04  7.4904407e-03 -6.0747396e-03
  5.1616658e-03  9.9239927e-03 -8.4574483e-03 -5.1358812e-03
 -7.0656319e-03 -4.8641083e-03 -3.7799017e-03 -8.5363444e-03
  7.9560187e-03 -4.8446450e-03  8.4246909e-03  5.2640396e-03
 -6.5507693e-03  3.9591030e-03  5.4700258e-03 -7.4266563e-03
 -7.4070138e-03 -2.4764915e-03 -8.6265476e-03 -1.5821959e-03
 -4.0328439e-04  3.2992417e-03  1.4416119e-03 -8.8057260e-04
 -5.5943532e-03  1.7301576e-03 -8.9744868e-04  6.7944760e-03
  3.9740861e-03  4.5300853e-03  1.4343826e-03 -2.7005500e-03
 -4.3667960e-03 -1.0317774e-03  1.4371726e-03 -2.6462940e-03
 -7.0744529e-03 -7.8055691e-03 -9.1231214e-03 -5.9355209e-03
 -1.8478223e-03 -4.3243608e-03 -6.4619179e-03 -3.7169741e-03
  4.2902827e-03 -3.7389426e-03  8.3780112e-03  1.5343077e

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rimai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
