In [1]:
import os
import sys
from IPython.display import HTML, display

import numpy as np
import pandas as pd
import tensorflow as tf
from math import ceil
from scipy.spatial.distance import cosine

import matplotlib.pyplot as plt
import seaborn as sns

import collections
import random
import time
import string
import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Embedding, Dense, Convolution1D, MaxPooling1D, GlobalMaxPooling1D, Flatten, Dropout, LSTM, Bidirectional, Conv1D, Concatenate
import matplotlib.pyplot as plt

2025-04-15 10:28:51.454559: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jonathonlopes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jonathonlopes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jonathonlopes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Set up directories
def find(name, path):
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)

DIRECTORY = '.'
SUMMARY_PATH = 'NLP_app/MutualFundSummary'
SUMMARY_LABELS_PATH = 'NLP_app/MutualFundLabels.csv'

glove_word2vec = 'NLP_app/word2vec_perso.txt'
our_word2vec = 'NLP_app/word2vec_perso.txt'

# Utility functions
 

In [3]:
# Progress bar
def progress(value, max=100):
    return HTML("""
        <progress
            value='{value}'
            max='{max}',
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

# Save a word2vec dictionary.
def save_word2vec(filename, word2vec):
    with open(os.path.join('./', filename),'a' , encoding='utf-8') as f :
        for k, v in word2vec.items():
            line = k+' '+str(list(v)).strip('[]').replace(',','')+'\n'
            f.write(line)

# Load a word2vec dictionary.
def load_word2vec(filename):
    word2vec = {}
    with open(os.path.join('./', filename), encoding='utf8') as f:
        for line in f:
            try :
                values = line.split()
                word = values[0]
                vec = np.asarray(values[1:], dtype='float32')
                word2vec[word] = vec
            except :
                None
    return word2vec

# read the repo in PATH and append the texts in a list
def get_data(PATH):
    list_dir = os.listdir(PATH)
    texts = []
    fund_names = []
    out = display(progress(0, len(list_dir)-1), display_id=True)
    
    # Uses only funds in rag_preds, which are funds with the 3 relevant strategies
    rag_preds = pd.read_csv("rag_predictions.csv")
    relevant_funds = list(rag_preds['fund_name'])
    
    for ii, filename in enumerate(list_dir) :
        with open(PATH+'/'+filename, 'r', encoding="utf8") as f :
            txt = f.read()
            try :
                txt_split = txt.split('<head_breaker>')
                summary = txt_split[1].strip()
                fund_name = txt_split[0].strip()
            except :
                summary = txt
                fund_name = ''
        if fund_name in relevant_funds:
            texts.append(summary)
            fund_names.append(fund_name)
        out.update(progress(ii, len(list_dir)-1))
    return fund_names, texts

## Tokenize fund summaries

In [4]:
stop_words = set(stopwords.words("english")+list(string.punctuation)+['``',"''"]+["]","[","*"]+['doe', 'ha', 'wa'])

In [5]:
# Get the summaries that use one of the 3 relevant investment strategies
fund_names, summaries = get_data(SUMMARY_PATH)

In [6]:
# Clean and tokenize text - remove whitespace, tokenize words, keep only alphabetic chars, remove stopwords
def tokenizer(txt: str) -> list[str]:
    txt = txt.replace('\n', ' ').replace('\t', ' ').lower()
    word_tokens = word_tokenize(txt)
    filtered_sentence = []
    for w in word_tokens:
        w_clean = re.sub(r'[^a-zA-Z]', '', w)
        if w_clean and w_clean not in stop_words:
            filtered_sentence.append(w_clean)
    
    return filtered_sentence

In [7]:
# Make one list of all tokenized words from each summary
text_words = np.concatenate([tokenizer(summary) for summary in summaries])

## Set up for skip-gram model

In [8]:
# Training Parameters
batch_size = 128 # The model will be trained batch per batch and one batch contains 128 rows
num_epochs = 2 # The model will go through all the data twice

In [9]:
# Word2Vec Parameters
embedding_size = 50 # Dimension of the embedding vector, also size of hidden layer
max_vocabulary_size = 5000 # Total number of different words in the vocabulary
min_occurrence = 10 # Remove all words that does not appears at least n times
skip_window = 3 # How many words to consider left and right
num_skips = 4 # How many times to reuse an input to generate a label

In [10]:
# Makes dictionary of words to word frequency for frequency > min_occurrence
count = [('UNK', -1)]
# Retrieve the most common words
count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))
# Remove samples with less than 'min_occurrence' occurrences
for i in range(len(count) - 1, -1, -1):
    if count[i][1] < min_occurrence:
        count.pop(i)
    else:
        # The collection is ordered, so stop when 'min_occurrence' is reached
        break

In [11]:
# Create id's for each word and create 2 dictionary to access in both directions
word2id = dict()
for i, (word, _)in enumerate(count):
    word2id[word] = i
id2word = dict(zip(word2id.values(), word2id.keys()))
vocab_size = len(id2word)   

In [12]:
# Create data - list of text_words where words are converted to their id's
data = list()
unk_count = 0
for word in text_words:
    # Retrieve a word id, or assign it index 0 ('UNK') if not in dictionary
    index = word2id.get(word, 0)
    if index == 0:
        unk_count += 1
    data.append(index)
count[0] = ('UNK', unk_count)

In [13]:
# Build OneHot vector from index
def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp

In [14]:
# Generate training batch for the skip-gram model
def batch_generator(batch_size, num_skips, skip_window, vocab_size):
    data_index = 0
    while True :
        assert batch_size % num_skips == 0
        assert num_skips <= 2 * skip_window
        # batch is filled with 128 inputs
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)
        # labels is filled with 128 outputs 
        labels = np.ndarray(shape=(batch_size), dtype=np.int32)
        span = 2 * skip_window + 1
        # buffer keep track of the visited indexes visited
        buffer = collections.deque(maxlen=span)
        if data_index + span > len(data):
            data_index = 0
            # We stop the loop when we went through all the corpus
            break
        buffer.extend(data[data_index:data_index + span])
        data_index += span
        for i in range(batch_size // num_skips):  
            # Take the context current word
            context_words = [w for w in range(span) if w != skip_window]
            # Randomly select num_skips words in the context
            words_to_use = random.sample(context_words, num_skips)
            for j, context_word in enumerate(words_to_use):
                # Creates one raw data
                batch[i * num_skips + j] = buffer[skip_window]
                labels[i * num_skips + j] = buffer[context_word]
            if data_index == len(data):
                buffer.extend(data[0:span])
                data_index = span
            else:
                buffer.append(data[data_index])
                data_index += 1
        # Backtrack a little bit to avoid skipping words in the end of a batch
        data_index = (data_index + len(data) - span) % len(data)

        # translate word index to on-hot representation
        batch_one_hot = np.array([to_one_hot(b, vocab_size) for b in batch])
        labels_one_hot = np.array([to_one_hot(l, vocab_size) for l in labels])

        # output one batch
        yield batch_one_hot, labels_one_hot

## Train the skip-gram model

In [15]:
# Create and compile the Autoencoder
def creat_word2vec_model():
    # One-hot encoded vector input vector
    input_word = Input(shape=(vocab_size,))
    # Embedded layer - maps input vector to embedding vector
    encoded = Dense(embedding_size, activation='linear')(input_word)
    # Output layer - maps embedding vector to probability of a context word
    decoded = Dense(vocab_size, activation='softmax')(encoded)

    # The autoencoder is the whole model with hidden layer contected to the output layer.
    autoencoder = Model(input_word, decoded)
    # The encoder is just the input layer connected to the hidden layer. One the Autoencoder will be trained we will use
    # the encoder to create our word vectors 
    encoder = Model(input_word, encoded)
    
    autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
    return encoder, autoencoder

In [16]:
# Create word2vec model
encoder, autoencoder = creat_word2vec_model()
autoencoder.summary()

In [17]:
# Train model using batch generator
autoencoder.fit(batch_generator(batch_size, num_skips, skip_window, vocab_size), steps_per_epoch=ceil(len(data) / batch_size), epochs=num_epochs)

Epoch 1/2
[1m6363/6363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 10ms/step - loss: 0.0611
Epoch 2/2
[1m6363/6363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 10ms/step - loss: 0.0024


<keras.src.callbacks.history.History at 0x13bcb5640>

## Use encoder

In [18]:
# Word to one_hot vecotor to dense vector
def vectorize(word):
    word_one_hot = to_one_hot(word2id[word], vocab_size)
    return encoder.predict(np.array([word_one_hot]))[0]

In [19]:
# Create the word2vec dictionary - dictionary of word to dense vector
word2vec = {w : vectorize(w) for w in word2id.keys()}


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34

In [20]:
# We can save the word2vec dictionary to reuse it later.
save_word2vec(our_word2vec, word2vec)

# C- Sentence extraction

<h3> 1_ Create a knowledge base </h3>

In [21]:
equity_keywords = [
    "equity", "equities", "stock", "stocks", "shares", "common", "capital",
    "appreciation", "growth", "dividends", "midcap",
    "technology", "healthcare", "consumer", "industrials", "portfolio"
]
fixed_income_keywords = [
    "bond", "bonds", "debt", "coupon", "yield", "duration", "interest", "income",
    "fixed", "maturity", "treasury", "municipal", "corporate", "principal", "credit", "rates", "stable"
]
balanced_keywords = [
    "balanced", "diversified", "allocation", "blend", "mix", "strategy",
    "equities", "bonds", "stocks", "fixed", "income", "growth", "capital", 
    "preservation", "stability", "moderate", "hybrid", "rebalancing"
]

In [22]:
def get_n_closer(w, n, word2vec):
    vect = word2vec[w]
    distances_dict = {k: cosine(v, vect) for k, v in word2vec.items()}
    
    closer_words = []
    for _ in range(n):
        min_key = min(distances_dict.keys(), key=lambda k: distances_dict[k])
        closer_words.append(min_key)
        del distances_dict[min_key]
    return closer_words

In [23]:
# Creates the knwoledge base by taking the num_neighbors closes neighbors of each key_words in word2vec
def create_knowledge_base(num_neighbors, word2vec, key_words):
    knowledge_base = set()
    out = display(progress(0, len(key_words)-1), display_id=True)
    for ii, key_word in enumerate(key_words) :
        knowledge_base.add(key_word)
        neighbors = []
        try :
            neighbors = get_n_closer(key_word, num_neighbors, word2vec)
        except :
            print(key_word + ' not in word2vec')

        knowledge_base.update(neighbors)
        
        out.update(progress(ii, len(key_words)-1))
    return knowledge_base

In [24]:
#word2vec = load_word2vec(our_word2vec)
eq_knowledge_base = create_knowledge_base(5, word2vec, equity_keywords)
fi_knowledge_base = create_knowledge_base(5, word2vec, fixed_income_keywords)
bal_knowledge_base = create_knowledge_base(5, word2vec, balanced_keywords)

In [25]:
df_extraction = pd.DataFrame({'fund_name' : fund_names, 'summary':summaries})
df_label = pd.read_csv('rag_predictions.csv')
df = df_label.merge(df_extraction, on='fund_name', how='left').dropna()

In [26]:
# Takes a summary, the knowledge base and some hyper parameters and returns the "num_sent" sentences
# of the summary that are closer to the the knowledge base in term of spacial distances.
def extract_sentence_distance(summary, knowledge, n_closer, n_reject, num_sent):
    # Split the summary into sentences.
    sentences = sent_tokenize(summary)
    sentence_scores = []
    # Loop over the sentences.
    for j, sentence in enumerate(sentences):
        # we tokenize and clean the sentence
        tokens = tokenizer(sentence)

        sentence_barycentre = np.zeros(embedding_size)
        effective_len = 0
        # Compute the barycentre of the sentence
        for token in tokens :
            try :
                sentence_barycentre += np.array(word2vec[token])
                effective_len += 1
            except KeyError :
                pass
            except :
                raise
        
        # Reject sentences with less than n_reject words in our word2vec map
        if effective_len <= n_reject :
            sentence_scores.append(1)    

        else :
            sentence_barycentre = sentence_barycentre/effective_len
            # Compute the distance sentece_barycentre -> words in our knowledge base
            barycentre_distance = [cosine(sentence_barycentre, word2vec[key_word]) for key_word in knowledge]
            barycentre_distance.sort()
            # Create the score of the sentence by averaging the "n_closer" smallest distances
            score = np.mean(barycentre_distance[:n_closer])
            sentence_scores.append(score)
    # Select the "num_sent" sentences that have the smallest score (smallest distance score with the knowledge base)
    sentence_scores, sentences = zip(*sorted(zip(sentence_scores, sentences)))
    top_sentences = sentences[:num_sent]
    return ' '.join(top_sentences)

In [27]:
df['eq_sentences'] = df.apply(lambda x : extract_sentence_distance(x['summary'], eq_knowledge_base, n_closer=10, n_reject=5, num_sent=5), axis=1)
df['fi_sentences'] = df.apply(lambda x : extract_sentence_distance(x['summary'], fi_knowledge_base, n_closer=10, n_reject=5, num_sent=5), axis=1)
df['bal_sentences'] = df.apply(lambda x : extract_sentence_distance(x['summary'], bal_knowledge_base, n_closer=10, n_reject=5, num_sent=5), axis=1)

In [28]:
def combine_extractions(string1, string2, string3):
    li = list(set(sent_tokenize(string1) + sent_tokenize(string2)+ sent_tokenize(string3)))
    return ' '.join(li) 

In [29]:
df['sentences'] = df.apply(lambda x : combine_extractions(x['eq_sentences'], x['fi_sentences'], x['bal_sentences']), axis=1)

In [30]:
df.to_csv("dataset3.csv")

## Prepare data for models

In [51]:
X = df['sentences'].values
# X = df['deriv_sentences_distance'].values # uncomment to use the first sentence extraction method.
# Clean the texts
X = [' '.join(tokenizer(txt)) for txt in X]
X[0][:100]

'company becomes insolvent interests investors owning common stock subordinated interests investors g'

In [52]:
y = df['Predicted Investment Strategy'].values
y = np.array([2 if x=='Equity Long Only (Low Risk)' else (1 if x=='Fixed Income Long Only (Low Risk)' else 0) for x in y])

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

### Preprocessing Hyperparameters

In [54]:
num_words = 2500 # Size of the vocabulary used. we only consider the 2500 most common words. The other words are removed from the texts.
maxlen = 150 # Number of word considered for each document. we cut or lengthen the texts to have texts of 150 words.
word_dimension = 50 # dimension of our word vectors.

word2vec_g = load_word2vec(our_word2vec)

In [55]:
# The Tokenizer provided by the Keras library allows to perform such transformation.
keras_tokenizer = Tokenizer(num_words=num_words)
keras_tokenizer.fit_on_texts(X_train)
# word_index is the dictionary that contains the index of each words in our 2500 long vocabulary.
word_index = keras_tokenizer.word_index
sequences_train = keras_tokenizer.texts_to_sequences(X_train)
sequences_test = keras_tokenizer.texts_to_sequences(X_test)

In [56]:
def average_word2vec(sequences, word2vec_model, dim):
    features = np.zeros((len(sequences), dim))
    for i, seq in enumerate(sequences):
        valid_vecs = [word2vec_model.get(word) for word in seq if word2vec_model.get(word) is not None]
        if valid_vecs:
            features[i] = np.mean(valid_vecs, axis=0)
        else:
            features[i] = np.zeros(dim)
    return features

In [57]:
feature_train = average_word2vec(sequences_train, word2vec_g, word_dimension)
feature_test = average_word2vec(sequences_test, word2vec_g, word_dimension)

In [62]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

clf = LinearSVC(C=1.0)
clf.fit(feature_train, y_train)

# Step 5: Evaluate
y_pred = clf.predict(feature_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.4838709677419355
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           1       0.00      0.00      0.00        28
           2       0.48      1.00      0.65        45

    accuracy                           0.48        93
   macro avg       0.16      0.33      0.22        93
weighted avg       0.23      0.48      0.32        93



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
