# Rodz Andrie Amor

Note: This notebook was also meant to run in Google Colab.

Textual Semantic Similarity Model

## Purpose:
- Given a label or headline, determine whether any words or sequences in the Item 1A (Risk Factors) section of each company's SEC 10-K filings have a high similarity score to the given event label.
- This indicates that the company has mentioned the risk in their filings.

## Specifications:
- The company SEC filings are extracted in another document and saved in CSV form.
- The CSV contains the ticker, filing date, and cleaned risk section text.
- If a date is given, I need to select the relevant filing whose date range contains that date.
- Perform semantic similarity analysis.

### Important Note
- Risk Sections involve a myriad of topics. Even if a headline is unequivocally discussed in the risk section, the similarity models may calculate a low similarity score because it is drowned out by several other topics that are mentioned.
  - Solution: Segment the texts using NLP libraries and use the highest similarity score. Additionally, increase the threshold.

## Ideas:
- RoBERTa model used for semantic similarity.
- Named Entity Recognition (NER)
  - According to Wikipedia:
  - <b>Named-entity recognition (NER)</b> (also known as (named) entity identification, entity chunking, and entity extraction) is a subtask of information extraction that seeks to locate and classify named entities mentioned in unstructured text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.
- Use pretrained NER models
- Other ideas: Risk classification as binary or multi-class identification

In [None]:
!pip install transformers sentence-transformers
!pip install -U spacy
!python -m spacy download en_core_web_sm

import pandas as pd
import numpy as np
import spacy
import re
# import jellyfish
# from semantic_text_similarity.models import WebBertSimilarity
# from concurrent.futures import ProcessPoolExecutor, as_completed
from sentence_transformers import SentenceTransformer, util

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 163.3/163.3 kB 3.3 MB/s eta 0:00:00
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 28.7 MB/s eta 0:00:00
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 kB 13.2 MB/s eta 0:00:00
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14

In [None]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        The parsed ``pandas.DataFrame``, using ``read_csv`` defaults.
    """
    frame = pd.read_csv(file_path)
    return frame

# Shared spaCy pipeline; populated on first use so the model is loaded only once.
_SPACY_PIPELINE = None


def _get_spacy_pipeline():
    """Return the shared ``en_core_web_sm`` pipeline, loading it on first call."""
    global _SPACY_PIPELINE
    if _SPACY_PIPELINE is None:
        _SPACY_PIPELINE = spacy.load("en_core_web_sm")
    return _SPACY_PIPELINE


def segment_text(text):
    """Split ``text`` into sentences using spaCy's ``en_core_web_sm`` pipeline.

    The pipeline is loaded once and reused across calls; loading it on every
    call (once per document) is needlessly expensive.

    Args:
        text: The raw text to segment.

    Returns:
        A list with one whitespace-stripped string per sentence that spaCy
        detects, in document order.
    """
    doc = _get_spacy_pipeline()(text)
    # Splits by sentences
    return [sent.text.strip() for sent in doc.sents]

# "Ensemble" because this runs every model in `models` and reports all of their scores
def ensemble_semantic_analysis(file_path, headlines, models):
    """Score every document against every headline with every model.

    Each document (the "Risk Factors Text" column) is embedded as a whole and
    compared to each headline by cosine similarity.

    Args:
        file_path: CSV to load; it must provide the "Risk Factors Text",
            "Ticker" and "Company Name" columns.
        headlines: Mapping of headline name -> headline text.
        models: Mapping of model label -> identifier passed to
            ``SentenceTransformer``.

    Returns:
        A DataFrame with one row per CSV row. Besides the three input columns
        it holds, for each (headline, model) pair, a
        "<headline> <model> Similarity Score" column and a
        "<headline> <model> Similarity Evaluation" column
        ("High" if score > 0.6, "Medium" if score > 0.3, otherwise "Low").
    """
    data = load_data(file_path)

    all_results = []

    for model_name, model_id in models.items():
        model = SentenceTransformer(model_id)

        # Headline embeddings depend only on the model, so compute them once
        # per model instead of once per document.
        headline_embeddings = {
            headline_name: model.encode(headline_text, convert_to_tensor=True)
            for headline_name, headline_text in headlines.items()
        }

        # Track the row *position* rather than the DataFrame index label, so
        # the lookup into `all_results` is valid for any index.
        for position, (_, row) in enumerate(data.iterrows()):
            document = row["Risk Factors Text"]
            # NOTE(review): the whole risk section is encoded as one input;
            # presumably the model truncates text beyond its maximum sequence
            # length, so only the start may be scored - confirm.
            document_embedding = model.encode(document, convert_to_tensor=True)

            # The first model creates the row; later models extend it.
            if position >= len(all_results):
                result_row = {
                    "Risk Factors Text": document,
                    "Ticker": row["Ticker"],
                    "Company Name": row["Company Name"]
                }
                all_results.append(result_row)
            else:
                result_row = all_results[position]

            for headline_name, label_embedding in headline_embeddings.items():
                similarity = util.pytorch_cos_sim(label_embedding, document_embedding).item()

                # Assign evaluations based on similarity score
                if similarity > 0.6:
                    evaluation = "High"
                elif similarity > 0.3:
                    evaluation = "Medium"
                else:
                    evaluation = "Low"

                # Append model-specific similarity and evaluation
                prefix = f"{headline_name} {model_name}"
                result_row.update({
                    f"{prefix} Similarity Score": similarity,
                    f"{prefix} Similarity Evaluation": evaluation
                })

    return pd.DataFrame(all_results)


def semantic_analysis(file_path, headlines, model_name):
    """Find, per document, the sentence most similar to each headline.

    Each document (the "Risk Factors Text" column) is split into sentences
    with ``segment_text``; every sentence is embedded and compared to each
    headline by cosine similarity, and the best-scoring sentence is reported.

    Args:
        file_path: CSV to load; it must provide the "Risk Factors Text",
            "Ticker" and "Company Name" columns.
        headlines: Mapping of headline name -> headline text.
        model_name: Identifier passed to ``SentenceTransformer``.

    Returns:
        A DataFrame with one row per CSV row holding "Ticker", "Company Name",
        "Model Used" and, for each headline: the headline text, its
        "Highest Similarity Score", its "Similarity Evaluation" ("High" if
        score > 0.6, "Medium" if score > 0.3, otherwise "Low") and the
        "Risk Section Representative Segment". When no sentence scores above
        0 (or the document yields no sentences) the score is 0 and the
        segment is the empty string.
    """
    data = load_data(file_path)
    model = SentenceTransformer(model_name)

    # Headline embeddings do not depend on the document: encode them once.
    headline_embeddings = {
        headline_name: model.encode(headline_text, convert_to_tensor=True)
        for headline_name, headline_text in headlines.items()
    }

    results = []

    for _, row in data.iterrows():
        document = row["Risk Factors Text"]
        segments = segment_text(document)  # Segment the document into sentences

        # Encode all sentences once per document, in a single batch, instead
        # of re-encoding each sentence separately for every headline.
        segment_embeddings = (
            model.encode(segments, convert_to_tensor=True) if segments else None
        )

        # Initialize the result row for this document
        result_row = {
            "Ticker": row["Ticker"],
            "Company Name": row["Company Name"],
            "Model Used": model_name
        }

        for headline_name, headline_text in headlines.items():
            highest_similarity = 0  # Track the highest similarity for this headline
            highest_similarity_segment = ""  # Track the text of the highest similarity segment

            if segment_embeddings is not None:
                # One row of scores: this headline against every sentence.
                scores = util.pytorch_cos_sim(
                    headline_embeddings[headline_name], segment_embeddings
                )[0]
                best_position = int(scores.argmax())
                best_score = scores[best_position].item()

                # Keep the 0 floor: a best score that is not positive is
                # reported as "no match" (score 0, empty segment).
                if best_score > highest_similarity:
                    highest_similarity = best_score
                    highest_similarity_segment = segments[best_position]

            # Assign evaluations based on the highest similarity score found
            if highest_similarity > 0.6:
                evaluation = "High"
            elif highest_similarity > 0.3:
                evaluation = "Medium"
            else:
                evaluation = "Low"

            result_row.update({
                f"{headline_name}": headline_text,
                f"{headline_name} Highest Similarity Score": highest_similarity,
                f"{headline_name} Similarity Evaluation": evaluation,
                f"{headline_name} Risk Section Representative Segment": highest_similarity_segment
            })

        results.append(result_row)

    return pd.DataFrame(results)

In [None]:
# Example headlines: one weather event and one political / oil-market event
weather_headline = "Category 5 Hurricane on course to decimate Florida. Residents and businesses in a scramble to avoid the storm."
political_headline = "United States sanctions against Russia has caused rough economic conditions for the global oil industry."

# Model identifiers keyed by relative size.
# You can find other ones to experiment with as well
all_models = {
    "Small": "all-MiniLM-L6-v2",
    "Medium": "all-MiniLM-L12-v2",
    "Large": "all-roberta-large-v1",
}

# Reuse the variables above so the two copies of each headline cannot drift apart.
headlines = {
    "Weather Headline": weather_headline,
    "Political Headline": political_headline,
}

# Score the XOM risk sections with the "Medium" model and save the results.
XOM_similarity_df = semantic_analysis("XOM_sec10k_risk_corpus.csv", headlines, all_models["Medium"])
XOM_similarity_df.to_csv("XOM Semantic Similarity Results.csv", index=False)

# Alternative: compare every model in `all_models` side by side.
# XOM_similarity_df = ensemble_semantic_analysis("XOM_sec10k_risk_corpus.csv", headlines, all_models)
# XOM_similarity_df.to_csv("XOM Semantic Similarity Results.csv", index=False)

display(XOM_similarity_df)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



Unnamed: 0,Ticker,Company Name,Model Used,Weather Headline,Weather Headline Highest Similarity Score,Weather Headline Similarity Evaluation,Weather Headline Risk Section Representative Segment,Political Headline,Political Headline Highest Similarity Score,Political Headline Similarity Evaluation,Political Headline Risk Section Representative Segment
0,XOM,EXXON MOBIL CORP,all-MiniLM-L12-v2,Category 5 Hurricane on course to decimate Flo...,0.449605,Medium,"For example, hurricanes may damage our offshor...",United States sanctions against Russia has cau...,0.516203,Medium,"World oil, gas, and petrochemical supply level..."
1,XOM,EXXON MOBIL CORP,all-MiniLM-L12-v2,Category 5 Hurricane on course to decimate Flo...,0.449605,Medium,"For example, hurricanes may damage our offshor...",United States sanctions against Russia has cau...,0.517244,Medium,"World oil, gas, and petrochemical supply level..."
2,XOM,EXXON MOBIL CORP,all-MiniLM-L12-v2,Category 5 Hurricane on course to decimate Flo...,0.449605,Medium,"For example, hurricanes may damage our offshor...",United States sanctions against Russia has cau...,0.517014,Medium,"World oil, gas, and petrochemical supply level..."
3,XOM,EXXON MOBIL CORP,all-MiniLM-L12-v2,Category 5 Hurricane on course to decimate Flo...,0.449605,Medium,"For example, hurricanes may damage our offshor...",United States sanctions against Russia has cau...,0.515843,Medium,"World oil, gas, and petrochemical supply level..."
4,XOM,EXXON MOBIL CORP,all-MiniLM-L12-v2,Category 5 Hurricane on course to decimate Flo...,0.449605,Medium,"For example, hurricanes may damage our offshor...",United States sanctions against Russia has cau...,0.502641,Medium,"World oil, gas, and petrochemical supply level..."
5,XOM,EXXON MOBIL CORP,all-MiniLM-L12-v2,Category 5 Hurricane on course to decimate Flo...,0.449605,Medium,"For example, hurricanes may damage our offshor...",United States sanctions against Russia has cau...,0.502641,Medium,"World oil, gas, and petrochemical supply level..."
6,XOM,EXXON MOBIL CORP,all-MiniLM-L12-v2,Category 5 Hurricane on course to decimate Flo...,0.449605,Medium,"For example, hurricanes may damage our offshor...",United States sanctions against Russia has cau...,0.494561,Medium,"World oil, gas, and petrochemical supply level..."
7,XOM,EXXON MOBIL CORP,all-MiniLM-L12-v2,Category 5 Hurricane on course to decimate Flo...,0.449605,Medium,"For example, hurricanes may damage our offshor...",United States sanctions against Russia has cau...,0.494561,Medium,"World oil, gas, and petrochemical supply level..."
8,XOM,EXXON MOBIL CORP,all-MiniLM-L12-v2,Category 5 Hurricane on course to decimate Flo...,0.449605,Medium,"For example, hurricanes may damage our offshor...",United States sanctions against Russia has cau...,0.494561,Medium,"World oil, gas, and petrochemical supply level..."
9,XOM,EXXON MOBIL CORP,all-MiniLM-L12-v2,Category 5 Hurricane on course to decimate Flo...,0.449605,Medium,"For example, hurricanes may damage our offshor...",United States sanctions against Russia has cau...,0.494561,Medium,"World oil, gas, and petrochemical supply level..."


# Named Entity Recognition



In [None]:
# nlp = spacy.load("en_core_web_sm")

# # Extract entities using spaCy's NER
# def extract_entities(headline):
#     doc = nlp(headline)
#     return set(ent.text for ent in doc.ents)

# def keyword_search(text, keywords):
#     for keyword in keywords:
#         if re.search(r'\b' + re.escape(keyword) + r'\b', text, re.IGNORECASE):
#             return True
#     return False

# print(extract_entities(weather_headline))
# print(extract_entities(political_headline))

# Just some old code I used for my SWE job

## Thought it would be a useful reference here since we're doing semantic similarity analysis

In [None]:
# Just some old code I used for my SWE job
# Thought it would be a useful reference here since it also does semantic similarity analysis

# pepsi_applications = pd.read_csv("prod_apps.csv")

# # Scaled [0,5]
# print("Loading RoBERTa Similarity Model")
# web_model = WebBertSimilarity(device="gpu")
# print("Finished Loading Language Model")

# count = 0

# org_name_lengths = [len(str(pepsi_applications.loc[i, "org_name"])) for i in range(len(pepsi_applications))]
# num_org_names = len(pepsi_applications)

# pairs_to_evaluate = []

# batch_size = 64
# # print(f"There will be {len(pairs_to_evaluate)} pairs to evaluate for approximately {len(pairs_to_evaluate) / batch_size} batches.")
# # print(f"There will be {len(pairs_to_evaluate)} pairs to evaluate for approximately {len(pairs_to_evaluate) / batch_size} batches.")

# for i in range(num_org_names):
#     org_name_i = str(pepsi_applications.loc[i, "org_name"])
#     for j in range(i + 1, num_org_names):
#         org_name_j = str(pepsi_applications.loc[j, "org_name"])

#         pairs_to_evaluate.append((i, j, org_name_i, org_name_j))

# def process_batch(batch):
#     local_matches = {}
#     semantic_similarities = web_model.predict([(org_name_i, org_name_j) for _, _, org_name_i, org_name_j in batch])

#     for j, (index_i, index_j, org_name_i, org_name_j) in enumerate(batch):
#         semantic_similarity = semantic_similarities[j]

#         damerau_distance = jellyfish.damerau_levenshtein_distance(org_name_i, org_name_j)
#         text_similarity = 1 - damerau_distance / max(org_name_lengths[index_i], org_name_lengths[index_j])

#         if semantic_similarity > 3 or text_similarity > 0.75:
#             if pepsi_applications.loc[index_i, "application_id"] not in local_matches:
#                 local_matches[pepsi_applications.loc[index_i, "application_id"]] = {
#                     "Current Organization Name": pepsi_applications.loc[index_i, "org_name"],
#                     "Matched Application ID": [pepsi_applications.loc[index_j, "application_id"]],
#                     "Matched Organization Name": [pepsi_applications.loc[index_j, "org_name"]]
#                 }
#             else:
#                 local_matches[pepsi_applications.loc[index_i, "application_id"]]["Matched Application ID"].append(pepsi_applications.loc[index_j, "application_id"])
#                 local_matches[pepsi_applications.loc[index_i, "application_id"]]["Matched Organization Name"].append(pepsi_applications.loc[index_j, "org_name"])

#     return local_matches

# # Main code
# matches_df = {}

# with ProcessPoolExecutor() as executor:
#     futures = [executor.submit(process_batch, pairs_to_evaluate[i:i + batch_size]) for i in range(0, len(pairs_to_evaluate), batch_size)]

#     for future in futures:
#         local_matches = future.result()
#         for key, value in local_matches.items():
#             if key not in matches_df:
#                 matches_df[key] = value
#             else:
#                 matches_df[key]["Matched Application ID"].extend(value["Matched Application ID"])
#                 matches_df[key]["Matched Organization Name"].extend(value["Matched Organization Name"])

#
# matches_df = pd.DataFrame.from_dict(matches_df, orient="index")
# matches_df.index.name = "Current Application ID"
# matches_df.to_csv(f"Prod_Matches.csv")
# print(matches_df)