In [3]:
#necessary packages
!pip install pandas scikit-learn spacy transformers torch tqdm
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import os
import pandas as pd

def load_data(directory):
    data = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                data.append({'filename': filename, 'text': text})
    return pd.DataFrame(data)

# Load data into dataframes
news_data = load_data('data/cleaned/news')
press_releases_data = load_data('data/cleaned/press_releases')
executive_statements_data = load_data('data/cleaned/executive_statements')

#printing the first couple of rows of dataframes
news_data.head(), press_releases_data.head(), executive_statements_data.head()

(                  filename                                               text
 0  cleaned_article_784.txt  Title: PHP addressed critical RCE flaw potenti...
 1  cleaned_article_790.txt  Title: Sticky Werewolf targets the aviation in...
 2  cleaned_article_747.txt  Title: The power of community helps Cisco Insi...
 3  cleaned_article_753.txt  Title: Decoding Hewlett Packard Enterprise Co ...
 4  cleaned_article_586.txt  Title: Global Sports Technology Market Analysi...,
                   filename                                               text
 0  cleaned_article_586.txt  Title: Paramount stock plummets after Shari Re...
 1  cleaned_article_592.txt  Title: Check Point released hotfix for activel...
 2  cleaned_article_579.txt  Title: Forbes Daily: Musk Gets The Votes To Re...
 3   cleaned_article_58.txt  Title: IoT Chips Present a $1.08 Trillion Oppo...
 4  cleaned_article_223.txt  Title: Micron stock rated Overweight on strong...,
                   filename                       

In [7]:
import spacy
from spacy.matcher import PhraseMatcher
import pandas as pd

# Loading the spaCy model
nlp = spacy.load('en_core_web_sm')

# List of known technology companies (includes the 15 I used in the queries)
tech_companies = [
    "Apple", "Microsoft", "Google", "Amazon", "Facebook", "Tesla", "Intel",
    "Cisco", "NVIDIA", "IBM", "Qualcomm", "Oracle", "Texas Instruments",
    "Adobe", "Salesforce", "SAP", "Sony", "Samsung", "LG", "HP", "Dell",
    "ASML", "Broadcom", "Micron", "Xiaomi", "Huawei", "AMD", "ARM Holdings",
    "TSMC", "Nokia", "Ericsson", "Lenovo", "Western Digital", "Seagate",
    "Microchip Technology", "Analog Devices", "Marvell Technology"
]

# Creating a PhraseMatcher instance
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Converting company names to spaCy documents
patterns = [nlp.make_doc(company) for company in tech_companies]
matcher.add("TECH_COMPANIES", patterns)

def identify_companies(text):
    doc = nlp(text)
    matches = matcher(doc)
    companies = [doc[start:end].text for match_id, start, end in matches]
    return companies

# Applying the function to each dataset so that only instances of technology companies lsited are recorded
news_data['companies'] = news_data['text'].apply(identify_companies)
press_releases_data['companies'] = press_releases_data['text'].apply(identify_companies)
executive_statements_data['companies'] = executive_statements_data['text'].apply(identify_companies)

# Display the first 50 rows of each dataframe
print(news_data.head(50))
print(press_releases_data.head(50))
print(executive_statements_data.head(50))

                   filename  \
0   cleaned_article_784.txt   
1   cleaned_article_790.txt   
2   cleaned_article_747.txt   
3   cleaned_article_753.txt   
4   cleaned_article_586.txt   
5   cleaned_article_592.txt   
6   cleaned_article_579.txt   
7    cleaned_article_58.txt   
8   cleaned_article_223.txt   
9    cleaned_article_70.txt   
10  cleaned_article_545.txt   
11  cleaned_article_551.txt   
12   cleaned_article_64.txt   
13  cleaned_article_237.txt   
14  cleaned_article_394.txt   
15  cleaned_article_380.txt   
16  cleaned_article_419.txt   
17  cleaned_article_357.txt   
18  cleaned_article_431.txt   
19  cleaned_article_425.txt   
20  cleaned_article_343.txt   
21  cleaned_article_814.txt   
22  cleaned_article_800.txt   
23  cleaned_article_828.txt   
24  cleaned_article_196.txt   
25  cleaned_article_182.txt   
26  cleaned_article_169.txt   
27  cleaned_article_633.txt   
28  cleaned_article_155.txt   
29  cleaned_article_141.txt   
30  cleaned_article_627.txt   
31  clea

In [8]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def encode_text(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

#using cosine similarity in this function
def compare_claims_bert(claims, impacts):
    claim_embeddings = np.array([encode_text(claim) for claim in tqdm(claims, desc="Encoding Claims")])
    impact_embeddings = np.array([encode_text(impact) for impact in tqdm(impacts, desc="Encoding Impacts")])
    claim_embeddings = claim_embeddings.reshape(len(claims), -1)  # Ensure embeddings are 2D
    impact_embeddings = impact_embeddings.reshape(len(impacts), -1)  # Ensure embeddings are 2D
    similarities = cosine_similarity(claim_embeddings, impact_embeddings)
    return similarities

#press releases such as reports and executive statements are claims that companies make while their real environmental efforts are seen through their actual impact on the environment
claims = press_releases_data['text'].tolist() + executive_statements_data['text'].tolist()
impacts = news_data['text'].tolist()

#finds the similarity in the text between what companies claim to do for the environment in their reports and public statements versus what they actuallly do recorded in news articles
similarities = compare_claims_bert(claims, impacts)
similarities

  from .autonotebook import tqdm as notebook_tqdm
Encoding Claims: 100%|███████████████████████████████████████| 1078/1078 [01:17<00:00, 13.88it/s]
Encoding Impacts: 100%|████████████████████████████████████████| 886/886 [01:04<00:00, 13.65it/s]


array([[0.76230717, 0.79171824, 0.72687143, ..., 0.7945968 , 0.75089526,
        0.76616   ],
       [0.8802837 , 0.85038215, 0.6963753 , ..., 0.8692026 , 0.8959359 ,
        0.8591518 ],
       [0.8160468 , 0.82062805, 0.79297835, ..., 0.81503415, 0.781049  ,
        0.81577307],
       ...,
       [0.9076627 , 0.9150633 , 0.7698947 , ..., 0.9086882 , 0.90962446,
        0.89794254],
       [0.7588328 , 0.7864812 , 0.7138654 , ..., 0.8014643 , 0.75669825,
        0.7759634 ],
       [0.79767656, 0.82960427, 0.79151654, ..., 0.7965802 , 0.7779137 ,
        0.75014675]], dtype=float32)

In [9]:
def quantify_similarity(similarities):
    return (similarities.mean() * 100).astype(int)

# Quantifies similarity between claims and impact for each company
company_similarity = {}
for company in set(news_data['companies'].explode().dropna()):
    company_news = news_data[news_data['companies'].apply(lambda x: company in x)]
    company_claims = press_releases_data[press_releases_data['companies'].apply(lambda x: company in x)]
    company_claims = pd.concat([company_claims, executive_statements_data[executive_statements_data['companies'].apply(lambda x: company in x)]])

    if not company_news.empty and not company_claims.empty:
        news_claims = company_news['text'].tolist()
        claims = company_claims['text'].tolist()
        similarities = compare_claims_bert(claims, news_claims)
        company_similarity[company] = quantify_similarity(similarities)

# creates dataframe of results
company_similarity_df = pd.DataFrame(list(company_similarity.items()), columns=['Company', 'Similarity'])
print(company_similarity_df)

Encoding Claims: 100%|███████████████████████████████████████████| 31/31 [00:02<00:00, 13.07it/s]
Encoding Impacts: 100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 13.91it/s]
Encoding Claims: 100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.70it/s]
Encoding Impacts: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 13.84it/s]
Encoding Claims: 100%|███████████████████████████████████████████| 41/41 [00:02<00:00, 13.90it/s]
Encoding Impacts: 100%|██████████████████████████████████████████| 19/19 [00:01<00:00, 14.01it/s]
Encoding Claims: 100%|█████████████████████████████████████████████| 3/3 [00:00<00:00,  7.49it/s]
Encoding Impacts: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 14.72it/s]
Encoding Claims: 100%|███████████████████████████████████████████| 77/77 [00:05<00:00, 13.79it/s]
Encoding Impacts: 100%|██████████████████████████████████████████| 66/66 [00:05<00:00, 12.96it/s]
Encoding Claims: 100

            Company  Similarity
0           Samsung          83
1               IBM         100
2            Amazon          83
3             Adobe          94
4         Microsoft          83
5        Salesforce          87
6              Dell          81
7            Huawei          85
8              Sony          83
9             Cisco          84
10           Nvidia          86
11            intel          81
12           Lenovo          92
13           Google          83
14         Qualcomm          87
15             TSMC         100
16         Facebook          95
17  Western Digital         100
18               HP          86
19           NVIDIA          85
20           Oracle          90
21           Micron          83
22            Tesla          86
23              SAP          91
24            Apple          84
25              AMD          85
26            Intel          87





In [11]:
#Saves the company similarity scores to a CSV file 
output_df = pd.DataFrame(similarities)
output_df.to_csv('data/output/claim_verification_results_bert.csv', index=False)
company_similarity_df.to_csv('data/output/company_similarity_results_bert.csv', index=False)