In [62]:
#Download boolnlp
!pip install booknlp

In [None]:
!python3 -m spacy download en_core_web_sm

In [15]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('news_cleaned_no_spaces.csv', encoding='ISO-8859-1')

# Verify the first few rows to ensure correct loading
print("Verifying the first few rows of the DataFrame:")
print(df.head())

# Open a new text file for writing the articles
with open('articles_for_model_test.txt', 'w', encoding='utf-8') as file:
    # Iterate over the desired range, here assumed to correctly start from the second row of data
    for index in range(0, 100):  # This should correctly select rows 2 to 12 if the first row is the header
        # Write the news_text of each article to the file, adding two newlines for paragraph separation
        file.write(df.iloc[index]['news_text'] + '\n\n')

print("Articles from rows 2 to 13 have been successfully written to articles_for_model_test.txt.")


Verifying the first few rows of the DataFrame:
                                               title  \
0  NVDA: Will These Semiconductor Stocks Deliver ...   
1               3 Cheap Tech Stocks to Buy Right Now   
2  Nvidia's Valuation Sparks Reddit Debate: Echoe...   
3  Spotlight on Cisco Systems: Analyzing the Surg...   
4  If You Like Nvidia, Then You Will Love These 2...   

                                                 url time_published  \
0  https://stocknews.com/news/nvda-tsm-avgo-csco-...            NaN   
1  https://www.fool.com/investing/2024/02/12/3-ch...   2/12/24 0:00   
2  https://www.benzinga.com/trading-ideas/long-id...            NaN   
3  https://www.benzinga.com/insights/options/24/0...            NaN   
4  https://www.fool.com/investing/2024/02/11/if-y...   2/11/24 0:00   

                                    authors  \
0                                       NaN   
1                                   Leo Sun   
2                               Surbhi Jain   
3

In [16]:
from booknlp.booknlp import BookNLP

model_params={
		"pipeline":"entity,quote,supersense,event,coref", 
		"model":"big"
	}
	
booknlp=BookNLP("en", model_params)

{'pipeline': 'entity,quote,supersense,event,coref', 'model': 'big'}
--- startup: 9.180 seconds ---


In [17]:
# Input file to process
input_file="articles_for_model_test.txt"

# Output directory to store resulting files in
output_directory="newsbook"

# File within this directory will be named ${book_id}.entities, ${book_id}.tokens, etc.
book_id="newsbook"

booknlp.process(input_file, output_directory, book_id)

--- spacy: 21.174 seconds ---
--- entities: 185.392 seconds ---
--- quotes: 0.072 seconds ---
--- attribution: 22.194 seconds ---
--- name coref: 1.029 seconds ---
--- coref: 159.905 seconds ---
--- TOTAL (excl. startup): 390.198 seconds ---, 121389 words


In [18]:
import pandas as pd

df = pd.read_csv("newsbook/newsbook.tokens", delimiter="\t")
df = df[["paragraph_ID", "sentence_ID", "word", "lemma", "event"]]
df

Unnamed: 0,paragraph_ID,sentence_ID,word,lemma,event
0,0,0,Despite,despite,O
1,0,0,macroeconomic,macroeconomic,O
2,0,0,challenges,challenge,O
3,0,0,",",",",O
4,0,0,the,the,O
...,...,...,...,...,...
121384,100,4855,for,for,O
121385,100,4855,the,the,O
121386,100,4855,long,long,O
121387,100,4855,haul,haul,O


In [39]:
# TEST
import pandas as pd
import re
from sklearn.metrics import precision_recall_fscore_support

# Adjusted function to handle possessive forms and normalize sentences, keeping apostrophes
# ... [same as before]
def preprocess_and_normalize_sentence(sentence):
    # Handle possessive forms that are split into two tokens (e.g., "Benzinga 's" -> "Benzinga's")
    sentence = re.sub(r"\b(\w+)\s's\b", r"\1's", sentence)
    
    # Convert to lowercase
    sentence = sentence.lower()
    
    # Keep the apostrophe while removing other punctuation
    sentence = re.sub(r"[^\w\s']", '', sentence)
    
    # Remove extra spaces
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    
    return sentence

# Function to extract and normalize golden truth sentences
def get_golden_truth_sentences(golden_df):
    golden_sentences_by_paragraph = {}
    # The row index after skiprows starts from 0, which aligns with paragraph_ID starting from 0
    for index, row in golden_df.iterrows():
        sentences = [sentence.strip() for sentence in row['gold_truth'].split(". ") if sentence]
        golden_sentences_by_paragraph[index] = [preprocess_and_normalize_sentence(s) for s in sentences]
    return golden_sentences_by_paragraph

# Function to extract events from the tokens file and group by paragraph_ID
def extract_events_by_paragraph(file_path):
    df = pd.read_csv(file_path, delimiter="\t")
    # ... [your existing code for updating 'event' column based on financial dictionary]
    #Financial dictionary
    financial_dictionary = [
    "acquisition", "merger", "bankruptcy", "dividend", "earnings", "forecast", "growth", "inflation", "interest", 
    "investment", "liquidity", "margin", "profit", "revenue", "shareholder", "stock", "trade", "valuation", "yield", 
    "default", "devaluation", "expansion", "hedge", "leverage", "option", "portfolio", "rating", "risk", "sector", 
    "volatility", "write-off", "amortization", "arbitrage", "capital", "derivative", "equity", "funding", "index", 
    "IPO", "liquidation", "maturity", "option", "payout", "recession", "split", "stake", "tender", "turnover", 
    "underwriting", "venture", "warrant", "adjustment", "alliance", "bid", "buyout", "collateral", "coupon", "debt", 
    "deficit", "dilution", "divestiture", "endorsement", "exposure", "financing", "gearing", "hedging", "incentive", 
    "joint", "leverage", "moat", "notional", "overhead", "premium", "quota", "refinancing", "short", "speculation", 
    "swap", "tariff", "tranche", "upside", "vesting", "write-down", "zoning", "audit", "bailout", "benchmark", "bubble", "bull", "bear", "capitalization", "ceiling", "clearing", "compliance", 
    "contraction", "conversion", "crash", "credit", "currency", "depreciation", "downturn", "easing", "embargo", 
    "emerging", "equities", "escalation", "exemption", "expatriation", "fee", "fluctuation", "foreclosure", "glitch", 
    "guarantee", "hedge", "impound", "injunction", "insolvency", "integration", "interest", "intermediary", "laundering", 
    "leakage", "lockout", "meltdown", "monopoly", "moratorium", "nominee", "oligopoly", "outlook", "overvaluation", 
    "panic", "parity", "patent", "penalty", "pension", "plunge", "proxy", "rally", "rebound", "recapitalization", 
    "reform", "regulation", "restructuring", "retirement", "rollback", "sanction", "scandal", "shortage", "slump", 
    "spike", "spinoff", "stagnation", "stimulus", "subsidy", "surge", "takeover", "tariff", "taxation", "trend", 
    "underperform", "valuation", "volunteer", "windfall", "withdrawal", "writeup"]


    # Normalize words in the financial dictionary for consistent matching
    financial_dictionary = [word.lower() for word in financial_dictionary]
    
    # Update the 'event' column based on the financial dictionary
    # Check if the lowercase version of each word is in the financial dictionary
    df['event'] = df.apply(lambda row: "EVENT" if row['word'].lower() in financial_dictionary else row['event'], axis=1)
    
    
    # Dictionary to hold lists of sentences (events) keyed by paragraph_ID
    events_by_paragraph = {}
    
    # Iterate over each sentence_ID
    for sentence_id in df['sentence_ID'].unique():
        # Select the entire sentence that contains at least one 'EVENT'
        sentence_df = df[df['sentence_ID'] == sentence_id]
        if 'EVENT' in sentence_df['event'].values:
            paragraph_id = sentence_df.iloc[0]['paragraph_ID']
            # Reconstruct the full sentence
            sentence = " ".join(sentence_df['word'].tolist())
            
            if paragraph_id not in events_by_paragraph:
                events_by_paragraph[paragraph_id] = []
            events_by_paragraph[paragraph_id].append(sentence)
    
    return events_by_paragraph

# Read specific rows (2 to 12) from the CSV file for the golden truth, without skipping the header
golden_df = pd.read_csv('news_cleaned_no_spaces.csv', nrows=100, usecols=['gold_truth'], skiprows=0, encoding='ISO-8859-1')

# Extract and normalize golden truth sentences
golden_sentences_by_paragraph = get_golden_truth_sentences(golden_df)

# After extracting events for each paragraph
events_by_paragraph = extract_events_by_paragraph('newsbook/newsbook.tokens')



In [40]:
# Initialize counters for True Positives (TP), False Positives (FP), False Negatives (FN), and matches per paragraph
total_TP = 0
total_FP = 0
total_FN = 0
matches_per_paragraph = {}  # Track matches for each paragraph ID

# Iterate over each paragraph ID in the model output
for paragraph_id, model_sentences in events_by_paragraph.items():
    # Retrieve the corresponding golden truth sentences for the current paragraph ID
    golden_truth_sentences = golden_sentences_by_paragraph.get(paragraph_id, [])
    
    # For each paragraph, initialize a set to track which golden truth sentences have been matched
    matched_golden_sentences = set()

    # Iterate through each model sentence and check if it is a superset of any golden truth sentence
    for model_sentence in model_sentences:
        found_match = False
        # Apply normalization to the constructed sentence
        normalized_sentence = preprocess_and_normalize_sentence(model_sentence)
        for truth_sentence in golden_truth_sentences:
            # Check if the model sentence is a superset of the golden truth sentence
            if truth_sentence in normalized_sentence:
                found_match = True
                matched_golden_sentences.add(model_sentence)
                break  # Found a match, no need to check the rest of the golden truth sentences for this model sentence
        
        if found_match:
            total_TP += 1  # The model sentence matched a golden truth sentence
        else:
            total_FP += 1  # The model sentence did not match any golden truth sentence

    # Count unmatched golden truth sentences as False Negatives
    total_FN += len(golden_truth_sentences) - len(matched_golden_sentences)
    # Record the number of matches for the current paragraph
    matches_per_paragraph[paragraph_id] = len(matched_golden_sentences)

# Calculate Precision, Recall, and F1-Score
Precision = total_TP / (total_TP + total_FP) if (total_TP + total_FP) > 0 else 0
Recall = total_TP / (total_TP + total_FN) if (total_TP + total_FN) > 0 else 0
F1_Score = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) > 0 else 0

print(f"Precision: {Precision:.4f}, Recall: {Recall:.4f}, F1-Score: {F1_Score:.4f}")

# Find the paragraph ID with the most matches
most_matches_paragraph_id = max(matches_per_paragraph, key=matches_per_paragraph.get)
print(f"Paragraph ID with the most matches: {most_matches_paragraph_id} ({matches_per_paragraph[most_matches_paragraph_id]} matches)")


# Total number of golden truth sentences
total_golden_sentences = sum(len(sentences) for sentences in golden_sentences_by_paragraph.values())

print(f"Total number of golden truth sentences: {total_golden_sentences}")

Precision: 0.1380, Recall: 0.3302, F1-Score: 0.1947
Paragraph ID with the most matches: 14 (18 matches)
Total number of golden truth sentences: 754
