In [1]:
# Libraries for parsing data
import pandas as pd
from io import StringIO
from html.parser import HTMLParser
import os
import spacy
from lxml import etree
import sys
import math

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np




In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns",None)

In [None]:
# Remove the row limit as well (None = no limit).
# NOTE(review): with this set, displaying a large DataFrame renders every row — consider .head() when previewing.
pd.set_option("display.max_rows",None)

Load spaCy model

In [None]:
# Get spaCy parser for sentence segmentation
#nlp = spacy.load('../../Resources/Models/en_core_web_sm-3.2.0/en_core_web_sm/en_core_web_sm-3.2.0')

# Use above commented code line for actual release, this is just for testing on this instance
# NOTE(review): the absolute path below only exists on the test instance; switch back to the
# relative path above before release.
nlp = spacy.load('/home/ec2-user/SageMaker/Getting Started/2022.05.25/Resources/Models/en_core_web_sm-3.2.0/en_core_web_sm/en_core_web_sm-3.2.0')
# Set the pipeline's max_length (spaCy's limit on characters per document) to 2,000,000.
nlp.max_length = 2000000



Load Lexicon file

In [None]:
# VADER tab-delimited lexicon with TOKEN, MEAN-SENTIMENT-RATING, STANDARD DEVIATION, and RAW-HUMAN-SENTIMENT-RATINGS
#wordsentiment = '../../Resources/vader_lexicon.txt'

# Use above commented code line for actual release, this is just for testing on this instance
# (the path below is relative to the current working directory)
wordsentiment = 'vader_lexicon.txt'

In [None]:
# Map each lexicon TOKEN to its MEAN-SENTIMENT-RATING (stored as a float).
wordsent_dict = {}

with open(wordsentiment, encoding='utf8') as f:
    # Iterate over the file directly instead of loading every line with readlines()
    for line in f:
        fields = line.rstrip('\r\n').split('\t')

        # Skip blank or malformed lines (e.g. a trailing empty line) that have
        # no rating column; indexing them would raise IndexError
        if len(fields) < 2:
            continue

        # Retrieve just the TOKEN and MEAN-SENTIMENT-RATING.
        # The rating is parsed once here so a bad value fails at load time
        # rather than in the middle of scoring.
        wordsent_dict[fields[0]] = float(fields[1])

In [None]:
# HTML Stripping Class
class MLStripper(HTMLParser):
    """HTML parser that discards markup and accumulates only the text content."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset is needed.
        # convert_charrefs=True makes the parser decode character references
        # (e.g. &amp;) before the text reaches handle_data().
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Append a run of text found outside of tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Remove HTML tags from the provided HTML text.

    Parameters
    ----------
    html : str
        Text that may contain HTML markup and character references.

    Returns
    -------
    str
        The text content with tags removed and character references decoded.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text that contains an unterminated '&'
    # (e.g. "... AT&T") in case more input follows; close() forces the parser
    # to process everything that is still buffered so no text is lost.
    s.close()
    return s.get_data()


In [None]:
def _first_text(root, *tags):
    """Return the .text of the first of `tags` found anywhere under `root`, or None if none exists."""
    for tag in tags:
        node = root.find(f'.//{tag}')
        if node is not None:
            return node.text
    return None


def getxmlcontent(file_path, strip_html=True):
    """Retrieve metadata and body text from an XML document.

    Parameters
    ----------
    file_path : str
        Path of the XML file to parse.
    strip_html : bool, default True
        When true, HTML markup is removed from the extracted text.

    Returns
    -------
    dict or None
        A dict with the keys "goid", "title", "date", "publisher", "pub_id"
        and "text". A value is None when the corresponding element is absent.
        None is returned (after printing a message) when the file cannot be
        parsed or processed.
    """
    try:
        root = etree.parse(file_path).getroot()

        record = {
            "goid": _first_text(root, 'GOID'),
            "title": _first_text(root, 'Title'),
            "date": _first_text(root, 'NumericDate'),
            "publisher": _first_text(root, 'PublisherName'),
            "pub_id": _first_text(root, 'MpubId'),
            # The body text is taken from the first of these elements that exists
            "text": _first_text(root, 'FullText', 'HiddenText', 'Text', 'AbsText'),
        }

        # Strip html from text portion
        if record["text"] is not None and strip_html:
            record["text"] = strip_tags(record["text"])

    except Exception as e:
        # Report the path that was passed in; the previous message referenced an
        # undefined name and raised NameError instead of returning None.
        print(f"Error while parsing file {file_path}: {e}")
        return None

    return record

In [None]:
# Function to compute sentiment score for a sentence
def compute_sentiment(sentence):
    """Return the average lexicon rating of the tokens in ``sentence``.

    Every item of ``sentence`` is converted to a string and lower-cased, then
    looked up in the module-level ``wordsent_dict``. Items without an entry
    are ignored. Returns 0 when no item has an entry.
    """
    total = 0
    matched = 0

    for token in sentence:
        key = str(token).lower()
        if key not in wordsent_dict:
            continue
        total += float(wordsent_dict[key])
        matched += 1

    # Average over the matched tokens only
    if matched > 0:
        return total / matched
    return 0

In [None]:
from tqdm import tqdm

def process_dataset(dataset_path, max_text_length=2000000, truncate_text=False, log_every=50):
    """Score every XML article under ``dataset_path`` and return one row per article.

    ``dataset_path`` is scanned for sub-directories (one per company); every
    ``.xml`` file inside a company directory is parsed with ``getxmlcontent``,
    split into sentences with the module-level spaCy pipeline ``nlp`` and
    scored with ``compute_sentiment``. The article score is the mean of its
    sentence scores (0 when the article yields no sentences).

    Parameters
    ----------
    dataset_path : str
        Directory containing one sub-directory per company.
    max_text_length : int, default 2000000
        Articles whose text is longer than this many characters are skipped,
        or truncated when ``truncate_text`` is true.
    truncate_text : bool, default False
        Truncate over-long texts to ``max_text_length`` instead of skipping them.
    log_every : int, default 50
        Write a progress message every ``log_every`` files of a company.
        A value of 0 (or less) disables these messages.

    Returns
    -------
    pandas.DataFrame
        Columns: Company, Article, GOID, Title, Date, Publisher, Pub_ID,
        Mean_Sentiment_Score, Num_Sentences. The columns are present even
        when no article could be processed.
    """
    columns = [
        "Company", "Article", "GOID", "Title", "Date",
        "Publisher", "Pub_ID", "Mean_Sentiment_Score", "Num_Sentences",
    ]
    article_records = []

    # Sort so the row order does not depend on the arbitrary order of os.listdir()
    companies = sorted(
        d for d in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, d))
    )

    for company in tqdm(companies, desc="Processing companies"):
        company_path = os.path.join(dataset_path, company)
        files = sorted(f for f in os.listdir(company_path) if f.endswith(".xml"))

        for idx, file in enumerate(tqdm(files, desc=f"Processing {company}", leave=False)):
            file_path = os.path.join(company_path, file)
            data = getxmlcontent(file_path)

            # Messages go through tqdm.write so they do not break the progress bars
            if not data:
                tqdm.write(f"{file} in {company} (empty)")
                continue

            if not data["text"]:
                tqdm.write(f"{file} in {company} (notext)")
                continue

            # Very large texts are either truncated or skipped entirely
            if len(data["text"]) > max_text_length:
                if truncate_text:
                    data["text"] = data["text"][:max_text_length]
                else:
                    tqdm.write(f"Skipping {file} in {company} (text too long)")
                    continue

            # Process with SpaCy
            try:
                doc = nlp(data["text"])
            except Exception as e:
                tqdm.write(f"Error in SpaCy parsing {file}: {e}")
                continue

            scores = [compute_sentiment(sent) for sent in doc.sents]
            mean_score = sum(scores) / len(scores) if scores else 0

            article_records.append({
                "Company": company,
                "Article": file,
                "GOID": data["goid"],
                "Title": data["title"],
                "Date": data["date"],
                "Publisher": data["publisher"],
                "Pub_ID": data["pub_id"],
                "Mean_Sentiment_Score": mean_score,
                "Num_Sentences": len(scores)
            })

            # Optional logging; the log_every > 0 guard avoids a modulo-by-zero
            if log_every > 0 and idx > 0 and idx % log_every == 0:
                tqdm.write(f"[{company}] Processed {idx} articles...")

    # Passing the column list keeps the schema stable for an empty result
    return pd.DataFrame(article_records, columns=columns)


In [None]:

# Run the processing function over every company folder under dataset_path.
# NOTE(review): absolute path is specific to this instance — confirm before running elsewhere.
dataset_path = "/home/ec2-user/SageMaker/data/"  # Change this to your actual dataset path
df_test = process_dataset(dataset_path)

In [7]:
# Load per-article sentiment results from a CSV file; `df` and `df_result` are bound to the same DataFrame object.
# NOTE(review): this reads a saved file rather than using df_test from the processing cell —
# presumably df_results.csv is an earlier export of that output; confirm.
df_result = df = pd.read_csv("/content/df_results.csv")

In [None]:
# Display the loaded per-article results.
df_result

Unnamed: 0.1,Unnamed: 0,Company,Article,GOID,Title,Date,Publisher,Pub_ID,Mean_Sentiment_Score,Num_Sentences\
0,0,DOWLAIS_GROUP_PLC,2814916176.xml,2814916176,Melrose Industries (MRO: GBX490.30) keeps risi...,2023-05-18,News Bites Pty Ltd,2044968,0.578248,275\
1,1,DOWLAIS_GROUP_PLC,2805307362.xml,2805307362,MARKETS [Edition 2],2023-04-25,Daily Telegraph,33554,0.329167,12\
2,2,DOWLAIS_GROUP_PLC,2813868455.xml,2813868455,Schaeffler AG,2023-05-09,JPMorgan Chase & Company,2031016,0.296529,269\
3,3,DOWLAIS_GROUP_PLC,2808277618.xml,2808277618,Daimler Truck Holding AG,2023-04-24,JPMorgan Chase & Company,2031016,0.284610,254\
4,4,DOWLAIS_GROUP_PLC,2804210100.xml,2804210100,"Mining firms hit but Footsie stays above 7,900",2023-04-22,Gannett Media Corp,45319,0.792063,21\
...,...,...,...,...,...,...,...,...,...,...
10950,10950,AOTI_INC,3055851815.xml,3055851815,HONG KONG: CHEMICALS SECTOR PULSE DAILY FRIDAY...,2024-05-17,News Bites Pty Ltd,2044968,0.256840,461\
10951,10951,AOTI_INC,3067825159.xml,3067825159,Financial Services Roundup: Market Talk,2024-06-13,Dow Jones & Company Inc.,2044531,0.657213,58\
10952,10952,AOTI_INC,3074862941.xml,3074862941,"HKD17.50) jumps 6.6% on high volume, hits 34-d...",2024-07-02,News Bites Pty Ltd,2044968,0.702529,291\
10953,10953,AOTI_INC,3055851842.xml,3055851842,HONG KONG: METALS &amp; MINING SECTOR PULSE DA...,2024-05-17,News Bites Pty Ltd,2044968,0.238975,379\


In [8]:
# Load the IPO-level variables (one row per IPO, see the displayed table) from the Excel workbook.
variables = pd.read_excel("/content/variables.xlsx")

In [9]:
# Display the IPO-level variables table.
variables

Unnamed: 0,IPO_Date,IPO_Date.1,IPO_Date +30,IPO_Ticker,IPO_Company_Name,PE/VC_backing,Underwriter_Rating,Age at IPO,Log_IPO_Issue_Size,FTSE_100_IVI,Market_Performance,Technology_Dummy,Healthcare_Dummy,Financials_Dummy,Energy_Utilities_Dummy,Basic_Materials_Dummy,Industrials_Dummy,Consumer_Dummy,RealEstate_Dummy,Founder_Control_dummy,High_Concentration_Dummy,Strategic_Ownership_Dummy,Article_Count,First_Day_Performance,First_Month_performance
0,2021-02-17,2021-02-17,2021-03-19,4BB.L,4BASEBIO_PLC,False,2,1,£16.49,16.36,0.0064,0,1,0,0,0,0,0,0,1,1,1,34,0.046931,-0.120690
1,2021-12-07,2021-12-07,2022-01-06,4GBL.L,4GLOBAL_PLC,False,3,19,£15.11,17.24,0.0228,1,0,0,0,0,0,0,0,0,0,0,39,0.223684,-0.120430
2,2024-07-01,2024-07-01,2024-07-31,AGVI.L,ABERFORTH_GEARED_VALUE_INCOME_TRUST_PLC,False,1,1,£18.90,20.33,-0.0274,0,0,1,0,0,0,0,0,0,1,1,78,0.000000,-0.148649
3,2021-03-29,2021-03-29,2021-04-28,AOM.L,ACTIVEOPS_PLC,True,2,40,£18.14,13.15,-0.0426,1,0,0,0,0,0,0,0,0,0,0,17,0.200000,-0.083333
4,2024-02-09,2024-02-09,2024-03-10,AIRAq.L,AIR_ASTANA_JOINT_STOCK_COMPANY,False,1,23,£14.11,14.24,0.0385,0,0,0,0,0,0,1,0,0,1,1,358,0.125000,-0.066667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,2021-10-08,2021-10-08,2021-11-07,WPS.L,WAG_PAYMENT_SOLUTIONS_PLC,True,1,20,£19.04,20.60,0.0253,0,0,0,0,0,1,0,0,1,1,0,68,-0.082568,-0.129714
153,2021-04-28,2021-04-28,2021-05-28,WIX.L,WICKES_GROUP_PLC,True,1,2,no new issue,17.26,0.0241,0,0,0,0,0,0,1,0,0,1,0,41,0.475000,0.195763
154,2021-12-06,2021-12-06,2022-01-05,WNWD.L,WINDWARD_LTD,True,2,11,£17.36,17.26,0.0241,1,0,0,0,0,0,0,0,0,0,0,86,0.235795,0.200000
155,2021-07-07,2021-07-07,2021-08-06,WISEa.L,WISE_PLC,True,1,10,£18.20,16.16,-0.0028,0,0,0,0,0,1,0,0,1,1,0,92,0.016234,-0.252396


In [10]:
# Rename IPO_Company_Name to Company so it matches the key column used to merge with df_result.
variables = variables.rename(columns={"IPO_Company_Name":"Company"})

In [13]:
# First, make sure both date columns are in datetime format
# (errors="coerce" turns unparseable values into NaT instead of raising)
df_result["Date"] = pd.to_datetime(df_result["Date"], errors="coerce")
variables["IPO_Date"] = pd.to_datetime(variables["IPO_Date"], errors="coerce")

# Merge the IPO info into the article DataFrame by company
# (left join: articles whose company is missing from `variables` get NaT as IPO_Date)
df_merged = pd.merge(df_result, variables, on="Company", how="left")

# Add a new column to classify each article as 'Pre-IPO' or 'Post-IPO'.
# Vectorised instead of a row-wise apply: comparisons involving NaT are False,
# so rows with a missing article date or IPO date fall through to "Unknown".
# Unlike apply(axis=1), this also works when df_merged has no rows.
df_merged["IPO_Status"] = np.select(
    [
        df_merged["Date"] < df_merged["IPO_Date"],
        df_merged["Date"] >= df_merged["IPO_Date"],
    ],
    ["Pre-IPO", "Post-IPO"],
    default="Unknown",
)

# Group by company and IPO_Status and calculate mean sentiment
df_summary = (
    df_merged
    .groupby(["Company", "IPO_Status"])["Mean_Sentiment_Score"]
    .mean()
    .reset_index()
)



In [None]:
data_to_export = "./ian2 (1).ipynb"
!aws s3 cp ""$data_to_export" s3://pq-tdm-studio-results/tdm-ale-data/a2359/results/

/bin/sh: -c: line 0: syntax error near unexpected token `('
/bin/sh: -c: line 0: `aws s3 cp ""./ian2 (1).ipynb" s3://pq-tdm-studio-results/tdm-ale-data/a2359/results/'


Regression 1) Pre-IPO Sentiment x First Day Performance

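$$\text{First\_Day\_Return} = \beta_0 + \beta_1 \cdot \text{Pre\_IPO\_Sentiment} + \beta_2 \cdot \text{PE/VC\_Backing} + \beta_3 \cdot \text{Underwriting\_Rating} + \beta_4 \cdot \text{Log\_of\_Age} + \beta_5 \cdot \text{IPO\_Issue\_Size} + \beta_6 \cdot \text{Market\_Volatility} + \epsilon$$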

Regression 2) Post-IPO Sentiment x First Month Performance

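$$\text{First\_Month\_Return} = \beta_0 + \beta_1 \cdot \text{Post\_IPO\_Sentiment} + \beta_2 \cdot \text{PE/VC\_Backing} + \beta_3 \cdot \text{Underwriting\_Rating} + \beta_4 \cdot \text{Log\_of\_Age} + \beta_5 \cdot \text{IPO\_Issue\_Size} + \beta_6 \cdot \text{Market\_Sentiment\_First\_Month} + \epsilon$$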

Currently, this script takes a folder containing several articles and puts them all together in one big text. It then divides the text into sentences, loops over each sentence, and calculates the sentiment score for each token in the sentence. An average sentiment score is then calculated for each sentence.

Questions: Is this the best option? Or should the average be calculated for each document (the same approach, applied per document)? Alternative: the ratio of positive to negative articles.