In [5]:
from dash import Dash, html, dcc
from src.components import ids
import base64
from dash.dependencies import Input, Output, State
import pdftotext
import io 
import PyPDF2
from PyPDF2 import PdfReader
import fitz  # Import PyMuPDF as fitz

import ipywidgets as widgets
from IPython.display import display

In [15]:
pdf = './data/2022_Apple_ESG_Report.pdf'
 # Convert the contents (binary string) to bytes




def on_upload(change):
    if change['type'] == 'change' and change['name'] == 'value':
        uploaded_file = change['new']
        file_name, content_dict = next(iter(uploaded_file.items()))

        # Access the content of the uploaded file
        content = content_dict['content']
        decoded_pdf = base64.b64decode(content)

        # Use PyMuPDF (fitz) to extract the text from the file-like object
        text = ""
        with fitz.open(stream=io.BytesIO(decoded_pdf), filetype="pdf") as pdf_document:
            for page_num in range(pdf_document.page_count):
                page = pdf_document.load_page(page_num)
                text += page.get_text()

        # Perform desired operations with the extracted text
        print(f"File Name: {file_name}")
        print("Extracted Text:")
        print(text)

file_upload = widgets.FileUpload()
file_upload.observe(on_upload)
display(file_upload)

FileUpload(value=(), description='Upload')

In [2]:
import numpy as np
import pandas as pd
random_sentences = [
    "This is a random sentence.",
    "Here is another random sentence.",
    "These strings are in Python format.",
    "They can be used in your code.",
    "This sentence has a bullet point at the end.• It splits into two sentences.",
    "We are generating random strings.",
    "12345 is a number.",
    "The quick brown fox jumps over the lazy dog.",
    "These sentences contain special characters: $%^&*()",
    "Remember to use correct punctuation!",
]

df = pd.DataFrame(random_sentences)

In [3]:
df

Unnamed: 0,0
0,This is a random sentence.
1,Here is another random sentence.
2,These strings are in Python format.
3,They can be used in your code.
4,This sentence has a bullet point at the end.• ...
5,We are generating random strings.
6,12345 is a number.
7,The quick brown fox jumps over the lazy dog.
8,These sentences contain special characters: $%...
9,Remember to use correct punctuation!


In [4]:
def split_cell_at_bullets(df: pd.DataFrame) -> pd.DataFrame:
    list_of_strings = list(df.iloc[:,0])
    bullet_point_character = '•'  # Update this with the desired bullet point character
    new_list_of_sentences = []
    for sentence in list_of_strings:
        if bullet_point_character in sentence:
            sentence_split = sentence.split(bullet_point_character)
            for new_sentence in sentence_split:
                new_list_of_sentences.append(new_sentence)
        else:
            new_list_of_sentences.append(sentence)
    return new_list_of_sentences

In [5]:
df

Unnamed: 0,0
0,This is a random sentence.
1,Here is another random sentence.
2,These strings are in Python format.
3,They can be used in your code.
4,This sentence has a bullet point at the end.• ...
5,We are generating random strings.
6,12345 is a number.
7,The quick brown fox jumps over the lazy dog.
8,These sentences contain special characters: $%...
9,Remember to use correct punctuation!


In [39]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("nbroad/ESG-BERT")

model = AutoModelForSequenceClassification.from_pretrained("nbroad/ESG-BERT")

#list of 26 ESG metrics which ESGBert can classify a text into. 
topic_list = ['Business_Ethics',
 'Data_Security',
 'Access_And_Affordability',
 'Business_Model_Resilience',
 'Competitive_Behavior',
 'Critical_Incident_Risk_Management',
 'Customer_Welfare',
 'Director_Removal',
 'Employee_Engagement_Inclusion_And_Diversity',
 'Employee_Health_And_Safety',
 'Human_Rights_And_Community_Relations',
 'Labor_Practices',
 'Management_Of_Legal_And_Regulatory_Framework',
 'Physical_Impacts_Of_Climate_Change',
 'Product_Quality_And_Safety',
 'Product_Design_And_Lifecycle_Management',
 'Selling_Practices_And_Product_Labeling',
 'Supply_Chain_Management',
 'Systemic_Risk_Management',
 'Waste_And_Hazardous_Materials_Management',
 'Water_And_Wastewater_Management',
 'Air_Quality',
 'Customer_Privacy',
 'Ecological_Impacts',
 'Energy_Management',
 'GHG_Emissions']

def score_sentences(list_of_sentences: list) -> dict:
    # the below dictionary is the outer one, where the keys are the sentences, and the values are the inner dictionary (nested)
    topic_probs_outer_dict = {}
    for sentence in list_of_sentences:
        topic_probs_dictionary = {}
        # Get topic probabilities for each document
        encoded_docs = tokenizer.batch_encode_plus([sentence], padding=True, truncation=True, return_tensors='pt')
        outputs = model(encoded_docs['input_ids'], attention_mask=encoded_docs['attention_mask'])
        probs = outputs[0].softmax(dim=1)
        probability_list = list(probs[0].detach().numpy())
        for topic, probability in zip(topic_list, probability_list):
            topic_probs_dictionary[topic] = probability
            topic_probs_outer_dict[sentence] = topic_probs_dictionary
    return topic_probs_outer_dict



In [40]:
string_test = ["Our Full Material Disclosure (FMD) program maps the chemicals used in our products — an effort that includes tens of thousands of parts and assemblies. Then we look at how our products are manufactured. Our Chemical Safety Disclosure (CSD) program engages with supply chain partners to get the most recent information on which materials are in use. This information includes the volume of materials being consumed and how they’re being applied, stored, and handled — as well as the steps being taken to protect employees. More than 1000 supplier facilities have shared their chemical inventories as well as storage and safety protocols as part of our CSD program. Through this process, we’ve identified more than 17,000 chemicals and applications. These efforts contribute to a safer work environment for people across our supply chain." \
               ,"Smarter chemistry The well-being of our employees, customers, people in our supply chain, and the planet is a priority for Apple, which is why we’re committed to using safer materials to create safer products. This commitment requires diligent work — to build a comprehensive picture of chemicals across our supply chain, to insist on rigorous chemical management processes, to promote adoption of safer chemical alternatives, and to innovate through design smarter approaches to making our products. Using safer chemistry in our products also enables recycling and material recovery, so that our products can be the raw materials for the next generation."]
dic = score_sentences(string_test)

In [51]:
nested_dict = score_sentences(string_test)
df = pd.DataFrame.from_dict(nested_dict,  orient = 'index').reset_index().rename(columns={'index':'sentence'})

In [52]:
df

Unnamed: 0,sentence,Business_Ethics,Data_Security,Access_And_Affordability,Business_Model_Resilience,Competitive_Behavior,Critical_Incident_Risk_Management,Customer_Welfare,Director_Removal,Employee_Engagement_Inclusion_And_Diversity,...,Selling_Practices_And_Product_Labeling,Supply_Chain_Management,Systemic_Risk_Management,Waste_And_Hazardous_Materials_Management,Water_And_Wastewater_Management,Air_Quality,Customer_Privacy,Ecological_Impacts,Energy_Management,GHG_Emissions
0,Our Full Material Disclosure (FMD) program map...,0.003879,0.005011,0.008095,0.004541,0.004227,0.002579,0.016142,0.005656,0.004234,...,0.027161,0.026706,0.004515,0.023534,0.001835,0.004354,0.005801,0.004739,0.005854,0.002524
1,Smarter chemistry The well-being of our employ...,0.000781,0.000912,0.001788,0.001985,0.000639,0.001094,0.005809,0.002229,0.001186,...,0.007055,0.003452,0.00061,0.004753,0.001036,0.000798,0.001884,0.00245,0.002434,0.00077


In [61]:
user_selection = 'Business_Ethics'

max_row = df[user_selection].argmax()
sentence = df.loc[max_row,'sentence']
print(sentence)

Our Full Material Disclosure (FMD) program maps the chemicals used in our products — an effort that includes tens of thousands of parts and assemblies. Then we look at how our products are manufactured. Our Chemical Safety Disclosure (CSD) program engages with supply chain partners to get the most recent information on which materials are in use. This information includes the volume of materials being consumed and how they’re being applied, stored, and handled — as well as the steps being taken to protect employees. More than 1000 supplier facilities have shared their chemical inventories as well as storage and safety protocols as part of our CSD program. Through this process, we’ve identified more than 17,000 chemicals and applications. These efforts contribute to a safer work environment for people across our supply chain.


In [62]:
list = ['hi my name', 'is','henry','and']
import random 
random.sample(list,2)

['is', 'hi my name']

In [15]:
import pdftotext
import io

pdf_loc = '../../data/2022_Apple_ESG_Report.pdf'
contents = []
with open(pdf_loc, "rb") as f:
    pdf = pdftotext.PDF(f)

for page in pdf:
    contents.append(page)


In [30]:
len(' '.join(contents).split('\n\n')[0])

62

In [33]:
' '.join(contents).split('\n\n')[0].split(' ')

['Environmental\nSocial\nGovernance\nReport\nApple’s', '2022', 'ESG', 'Report']

In [40]:
new_li = [i for i in ((' '.join(contents).split('\n\n'))) if len(i.split(' '))>10]
new_li

['30\t\x07Workplace practices\nand policies\n33\t\x07Health and safety\nat Apple\nSuppliers\n37\t\x07Our approach\n40\t\x07Labor and human rights\nin the supply chain\n43\t\x07Health, safety,\nand wellness\n44\t\x07Responsible materials\nsourcing',
 '59 Our approach\n60\t\x07Racial Equity\nand Justice Initiative\n62 Education\n64\t\x07Affordable housing\ninitiative\n65 Corporate donations\n67 Employee giving\n68 Job creation',
 'This report contains forward-looking statements and actual results may differ. Numbers and percentages in this report include estimates or approximations and may be based on assumptions. For more information, see “About the report.”',
 'At Apple, we believe the measure of any great innovation is the\npositive impact it has on people’s lives. It’s why we work every\nday to make our technology an even greater force for good.\nToday, our teams around the world infuse Apple’s deeply held values into everything we make. That work can\ntake many forms. But whether we

In [48]:
print(spacy.util.get_data_path())



/Users/henryparemain/opt/anaconda3/envs/Capstone/lib/python3.7/site-packages/spacy/data


In [50]:
import spacy

print(spacy.util.get_data_path())


/Users/henryparemain/opt/anaconda3/envs/Capstone/lib/python3.7/site-packages/spacy/data


In [51]:
import spacy

nlp = spacy.load("/Users/henryparemain/opt/anaconda3/envs/Capstone/lib/python3.7/site-packages/spacy/data")


OSError: [E053] Could not read meta.json from /Users/henryparemain/opt/anaconda3/envs/Capstone/lib/python3.7/site-packages/spacy/data/meta.json

In [52]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')  # Download the Punkt tokenizer models (only once)

text = "Your text with multiple sentences. Each sentence will be split."
sentences = sent_tokenize(text)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/henryparemain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [53]:
sentences

['Your text with multiple sentences.', 'Each sentence will be split.']

In [55]:
test_sentences = ' '.join(new_li)

In [56]:
sentences = sent_tokenize(test_sentences)

In [57]:
sentences

['30\t\x07Workplace practices\nand policies\n33\t\x07Health and safety\nat Apple\nSuppliers\n37\t\x07Our approach\n40\t\x07Labor and human rights\nin the supply chain\n43\t\x07Health, safety,\nand wellness\n44\t\x07Responsible materials\nsourcing 59 Our approach\n60\t\x07Racial Equity\nand Justice Initiative\n62 Education\n64\t\x07Affordable housing\ninitiative\n65 Corporate donations\n67 Employee giving\n68 Job creation This report contains forward-looking statements and actual results may differ.',
 'Numbers and percentages in this report include estimates or approximations and may be based on assumptions.',
 'For more information, see “About the report.” At Apple, we believe the measure of any great innovation is the\npositive impact it has on people’s lives.',
 'It’s why we work every\nday to make our technology an even greater force for good.',
 'Today, our teams around the world infuse Apple’s deeply held values into everything we make.',
 'That work can\ntake many forms.',
 'But

In [64]:
new_li

['30\t\x07Workplace practices\nand policies\n33\t\x07Health and safety\nat Apple\nSuppliers\n37\t\x07Our approach\n40\t\x07Labor and human rights\nin the supply chain\n43\t\x07Health, safety,\nand wellness\n44\t\x07Responsible materials\nsourcing',
 '59 Our approach\n60\t\x07Racial Equity\nand Justice Initiative\n62 Education\n64\t\x07Affordable housing\ninitiative\n65 Corporate donations\n67 Employee giving\n68 Job creation',
 'This report contains forward-looking statements and actual results may differ. Numbers and percentages in this report include estimates or approximations and may be based on assumptions. For more information, see “About the report.”',
 'At Apple, we believe the measure of any great innovation is the\npositive impact it has on people’s lives. It’s why we work every\nday to make our technology an even greater force for good.\nToday, our teams around the world infuse Apple’s deeply held values into everything we make. That work can\ntake many forms. But whether we

In [62]:
test = [sentence.replace('\n', ' ') for sentence in sentences]
new = []
for i in test:
    if '\t' in i:
        new.append(i)

In [63]:
new

['30\t\x07Workplace practices and policies 33\t\x07Health and safety at Apple Suppliers 37\t\x07Our approach 40\t\x07Labor and human rights in the supply chain 43\t\x07Health, safety, and wellness 44\t\x07Responsible materials sourcing 59 Our approach 60\t\x07Racial Equity and Justice Initiative 62 Education 64\t\x07Affordable housing initiative 65 Corporate donations 67 Employee giving 68 Job creation This report contains forward-looking statements and actual results may differ.',
 'Read the Apple Human Rights Policy \t\x07Our supplier requirements contain strict standards for responsible labor recruitment, and apply to all suppliers, protecting workers globally.',
 '\x07 ead the Apple Supplier Code of Conduct R and Supplier Responsibility Standards \t\x07To address forced labor risks at its roots, we know that our work has to begin before people enter our supply chain.',
 'Read our Annual Progress Report \t\x07This disclosure is a specialized filing that focuses specifically on our e

In [68]:
experi = '-––> Continue reading on page 38 Expanded our investment in\neducational opportunities\nthrough our Supplier Employee\nDevelopment Fund\nWe announced our Supplier Employee\nDevelopment Fund, which will expand our\nbest-in-class labor programs in our supply\nchain and establish a global Education Hub\nto scale the expansion of the technical and\nprofessional skills necessary for the jobs of\ntoday and tomorrow in our supply chain and\nsurrounding communities.'
experi = experi.replace('\n',' ')

In [69]:
test = sent_tokenize(experi)

In [70]:
test

['-––> Continue reading on page 38 Expanded our investment in educational opportunities through our Supplier Employee Development Fund We announced our Supplier Employee Development Fund, which will expand our best-in-class labor programs in our supply chain and establish a global Education Hub to scale the expansion of the technical and professional skills necessary for the jobs of today and tomorrow in our supply chain and surrounding communities.']

In [71]:
experi = '-––> Continue reading on page 38 Expanded our investment in\neducational opportunities\nthrough our Supplier Employee\nDevelopment Fund\nWe announced our Supplier Employee\nDevelopment Fund, which will expand our\nbest-in-class labor programs in our supply\nchain and establish a global Education Hub\nto scale the expansion of the technical and\nprofessional skills necessary for the jobs of\ntoday and tomorrow in our supply chain and\nsurrounding communities.'
print(experi)

-––> Continue reading on page 38 Expanded our investment in
educational opportunities
through our Supplier Employee
Development Fund
We announced our Supplier Employee
Development Fund, which will expand our
best-in-class labor programs in our supply
chain and establish a global Education Hub
to scale the expansion of the technical and
professional skills necessary for the jobs of
today and tomorrow in our supply chain and
surrounding communities.


Patterns to look out for and remove:

* number of words between \n characters (4 or less words between \n characters and delete)
* non 

In [72]:
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

In [76]:
sentences = [
    "Honeywell has spent more than $4 billion over the last 18 years to remediate and restore approximately 3,000 acres to productive community use including roughly 2,800 acres of biodiverse habitat which is extensively monitored.",
    "These biodiverse acres include preserved, restored, enhanced, and created wetlands; water and land habitat restoration, enhancement, creation, and preservation; and creation of green spaces with native ecosystems.",
    "The river has now become an environmental, economic, and community resource which includes nine acres of habitat restoration with over 143,000 native plantings.",
    "More than 285 wildlife species are now calling these areas home, and more than 120 unique bird species have been identified in and around Onondaga Lake.",
    "About 1,800 acres of wetlands have already been restored and preserved and about 1.1 million native plants have been planted.",
    "The project received resounding support from both the federal and Ohio Environmental Protection Agencies, and local stakeholders, serving as an example of green remediation.",
    "These historic operations mostly predate the Clean Water Act, the Clean Air Act, Superfund regulations, the Resource Conservation and Recovery Act, and/or the U.S. Environmental Protection Agency and are not a reflection of Honeywell’s current operations and processes.",
    "Additional piloting with two upstream operators – one in the Bakken Basin in North Dakota and the other in the Delaware Basin in Texas – is underway.",
    "Buffalo River, New York: Honeywell served as the private sector lead to restore the “functionally dead” Buffalo River through a unique public-private partnership.",
    "acres remediated and restored as valuable community assets Honeywell’s Remediation and Redevelopment Group (RRG) manages every project and is led by the Global Remediation Vice President, who reports to the Chief Sustainability Officer, a former Assistant Commissioner of New Jersey’s Department of Environmental Protection."
]


In [77]:
def create_list_of_google_searches(string_of_sentences: list(str)) -> list(str):
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words("english"))
    tokenized_words = [word for sentence in sentences for word in word_tokenize(sentence.lower()) if word.isalpha() and word not in stop_words]

    # Calculate word frequencies
    freq_dist = FreqDist(tokenized_words)

    # Extract top keywords
    top_keywords = [word for word, freq in freq_dist.most_common(10)]

    # Combine keywords to create search phrases
    search_phrases = [' '.join(top_keywords[i:i+2]) for i in range(0, len(top_keywords), 2)]
    
    return search_phrases

['acres honeywell', 'environmental community', 'habitat restored', 'native river', 'protection remediation']


In [78]:
' '.join(search_phrases)

'acres honeywell environmental community habitat restored native river protection remediation'

In [164]:
from pygooglenews import GoogleNews


def get_titles(search):
    gn = GoogleNews(country='UK')
    search = gn.search(search)
    news_item = search['entries']
    for item in news_item:
        print(item.link)

get_titles('brexit')

https://news.google.com/rss/articles/CBMiYmh0dHBzOi8vd3d3LnRoZWd1YXJkaWFuLmNvbS9wb2xpdGljcy8yMDIzL2F1Zy8xMy9icmV4aXQtYnJhaW4tc2NpZW5jZS1wc3ljaG9sb2d5LWRpdmlkZWQtdWstZGViYXRl0gFiaHR0cHM6Ly9hbXAudGhlZ3VhcmRpYW4uY29tL3BvbGl0aWNzLzIwMjMvYXVnLzEzL2JyZXhpdC1icmFpbi1zY2llbmNlLXBzeWNob2xvZ3ktZGl2aWRlZC11ay1kZWJhdGU?oc=5
https://news.google.com/rss/articles/CBMiamh0dHBzOi8vd3d3Lm5ld3N0YXRlc21hbi5jb20vdGhlLXdlZWtlbmQtaW50ZXJ2aWV3LzIwMjMvMDgvYWRhbS1wb3Nlbi1pbnRlcnZpZXctYnJleGl0LXRyYWRlLXdhci1pbmZsYXRpb27SAQA?oc=5
https://news.google.com/rss/articles/CBMia2h0dHBzOi8vd3d3LnRoZWdyb2Nlci5jby51ay9icmV4aXQvYnJleGl0LWtleS1jYXVzZS1vZi1leHBvcnRzLWRlY2xpbmUtZ292ZXJubWVudC1kYXRhLXNob3dzLzY4MjA0MC5hcnRpY2xl0gEA?oc=5
https://news.google.com/rss/articles/CBMiYGh0dHBzOi8vd3d3LnNwZWN0YXRvci5jby51ay9hcnRpY2xlL3RoZS1kcm9wLWluLWxhbmd1YWdlLXN0dWRlbnRzLWhhcy1ub3RoaW5nLXRvLWRvLXdpdGgtYnJleGl0L9IBAA?oc=5
https://news.google.com/rss/articles/CBMiZmh0dHBzOi8vd3d3LnRoZW5ld2V1cm9wZWFuLmNvLnVrL2JyZXhpdC1icml0YWlucy1nb3Qtbm8

In [162]:
from pygooglenews import GoogleNews

gn = GoogleNews()
# search for the best matching articles that mention MSFT and 
# do not mention AAPL (over the past 6 month
# s = gn.search('allintext:boeing', when = '12m')
s = gn.search('intitle:honeywell', from_='2022-01-01', to_='2023-01-01')

# s = gn.search('apple':str, helper = True, when = None, from_ = None, to_ = None, proxies=None, scraping_bee=None)

  
list_of_articles = [(i['link']) for i in s['entries']]              
s

{'feed': {'generator_detail': {'name': 'NFE/5.0'},
  'generator': 'NFE/5.0',
  'title': '"intitle:honeywell after:2022-01-01 before:2023-01-01" - Google News',
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': '',
   'value': '"intitle:honeywell after:2022-01-01 before:2023-01-01" - Google News'},
  'links': [{'rel': 'alternate',
    'type': 'text/html',
    'href': 'https://news.google.com/search?q=intitle:honeywell+after:2022-01-01+before:2023-01-01&ceid=US:en&hl=en-US&gl=US'}],
  'link': 'https://news.google.com/search?q=intitle:honeywell+after:2022-01-01+before:2023-01-01&ceid=US:en&hl=en-US&gl=US',
  'language': 'en-US',
  'publisher': 'news-webmaster@google.com',
  'publisher_detail': {'email': 'news-webmaster@google.com'},
  'rights': '2023 Google Inc.',
  'rights_detail': {'type': 'text/plain',
   'language': None,
   'base': '',
   'value': '2023 Google Inc.'},
  'updated': 'Sun, 13 Aug 2023 20:08:43 GMT',
  'updated_parsed': time.struct_time(tm_year=202

In [160]:
s['entries']

[{'title': 'Honeywell UOP to Pay Over $160 Million to Resolve Foreign Bribery ... - Department of Justice',
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': '',
   'value': 'Honeywell UOP to Pay Over $160 Million to Resolve Foreign Bribery ... - Department of Justice'},
  'links': [{'rel': 'alternate',
    'type': 'text/html',
    'href': 'https://news.google.com/rss/articles/CBMidmh0dHBzOi8vd3d3Lmp1c3RpY2UuZ292L29wYS9wci9ob25leXdlbGwtdW9wLXBheS1vdmVyLTE2MC1taWxsaW9uLXJlc29sdmUtZm9yZWlnbi1icmliZXJ5LWludmVzdGlnYXRpb25zLXVzLWFuZC1icmF6aWzSAQA?oc=5'}],
  'link': 'https://news.google.com/rss/articles/CBMidmh0dHBzOi8vd3d3Lmp1c3RpY2UuZ292L29wYS9wci9ob25leXdlbGwtdW9wLXBheS1vdmVyLTE2MC1taWxsaW9uLXJlc29sdmUtZm9yZWlnbi1icmliZXJ5LWludmVzdGlnYXRpb25zLXVzLWFuZC1icmF6aWzSAQA?oc=5',
  'id': 'CBMidmh0dHBzOi8vd3d3Lmp1c3RpY2UuZ292L29wYS9wci9ob25leXdlbGwtdW9wLXBheS1vdmVyLTE2MC1taWxsaW9uLXJlc29sdmUtZm9yZWlnbi1icmliZXJ5LWludmVzdGlnYXRpb25zLXVzLWFuZC1icmF6aWzSAQA',
  'guidislink': Fa

In [137]:
test = [
    {
      "label": "Competitive_Behavior",
      "score": 0.2434668093919754
    },
    {
      "label": "Customer_Welfare",
      "score": 0.1371322125196457
    },
    {
      "label": "Customer_Privacy",
      "score": 0.07673874497413635
    },
    {
      "label": "Employee_Engagement_Inclusion_And_Diversity",
      "score": 0.05847414955496788
    },
    {
      "label": "Selling_Practices_And_Product_Labeling",
      "score": 0.04809295013546944
    },
    {
      "label": "Business_Ethics",
      "score": 0.04366432875394821
    },
    {
      "label": "Waste_And_Hazardous_Materials_Management",
      "score": 0.041438959538936615
    },
    {
      "label": "Ecological_Impacts",
      "score": 0.035682909190654755
    },
    {
      "label": "Business_Model_Resilience",
      "score": 0.03285621106624603
    },
    {
      "label": "Systemic_Risk_Management",
      "score": 0.03252149373292923
    },
    {
      "label": "Data_Security",
      "score": 0.03078947402536869
    },
    {
      "label": "Water_And_Wastewater_Management",
      "score": 0.026954304426908493
    },
    {
      "label": "Access_And_Affordability",
      "score": 0.022618690505623817
    },
    {
      "label": "GHG_Emissions",
      "score": 0.02203524112701416
    },
    {
      "label": "Management_Of_Legal_And_Regulatory_Framework",
      "score": 0.02011912129819393
    },
    {
      "label": "Labor_Practices",
      "score": 0.019137129187583923
    },
    {
      "label": "Director_Removal",
      "score": 0.01598607935011387
    },
    {
      "label": "Human_Rights_And_Community_Relations",
      "score": 0.01514087151736021
    },
    {
      "label": "Supply_Chain_Management",
      "score": 0.01306261494755745
    },
    {
      "label": "Product_Design_And_Lifecycle_Management",
      "score": 0.012975852005183697
    },
    {
      "label": "Energy_Management",
      "score": 0.012334146536886692
    },
    {
      "label": "Product_Quality_And_Safety",
      "score": 0.009592123329639435
    },
    {
      "label": "Air_Quality",
      "score": 0.0094348369166255
    },
    {
      "label": "Critical_Incident_Risk_Management",
      "score": 0.00833923276513815
    },
    {
      "label": "Employee_Health_And_Safety",
      "score": 0.007416508160531521
    },
    {
      "label": "Physical_Impacts_Of_Climate_Change",
      "score": 0.00399497477337718
    }
  ]


li = [i['score'] for i in test]
sum(li)

0.9999999697320163

In [150]:
# Function to fetch article text from a given URL
import requests
from bs4 import BeautifulSoup



def remove_after_word(string, word):
    words = string.split()
    try:
        index = words.index(word) - 1
        result = ' '.join(words[:index+1])
    except ValueError:
        result = string
    return result


def remove_before_word(string, word):
    words = string.split()
    try:
        index = words.index(word) 
        result = ' '.join(words[index+1:])
    except ValueError:
        result = string
    return result


def fetch_article_text(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)  
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all <p> tags within the article content
    paragraph_tags = soup.find_all('p')
    # Extract the text from each <p> tag and join them together
    article_text = ' '.join([p.get_text().strip() for p in paragraph_tags])

    #removing noise from end of string 
    # text = article_text
    # word_to_remove_after = 'Source:'
    # word_to_remove_before = 'want'
    # new_text = remove_after_word(text, word_to_remove_after)
    # new_text = remove_before_word(new_text, word_to_remove_before)


    return article_text

# Apply the fetch_article_text function to each URL in the 'url' column and store the result in a new 'article_text' column
# df['article_text'] = df['url'].apply(fetch_article_text)

In [198]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
response = requests.get('https://www.esgtoday.com/?s=honeywell', headers=headers)  
soup = BeautifulSoup(response.text, 'html.parser')

In [200]:
# soup = soup.find('main')
# article = soup.find('article')



article_divs = soup.find_all(class_="post-content")
hrefs = []

for div in article_divs:
    anchor_tag = div.find('a')  # Find the anchor tag within the div
    if anchor_tag:
        hrefs.append(anchor_tag.get('href'))

print(hrefs)

['https://www.esgtoday.com/category/esg-news/companies/', 'https://www.esgtoday.com/category/esg-news/energy-transition/', 'https://www.esgtoday.com/category/esg-news/companies/', 'https://www.esgtoday.com/category/esg-news/companies/', 'https://www.esgtoday.com/category/esg-news/companies/', 'https://www.esgtoday.com/category/esg-news/companies/', 'https://www.esgtoday.com/category/esg-news/companies/', 'https://www.esgtoday.com/category/esg-news/companies/', 'https://www.esgtoday.com/category/esg-news/companies/', 'https://www.esgtoday.com/category/esg-news/companies/', 'https://www.esgtoday.com/esg-today-week-in-review-135/', 'https://www.esgtoday.com/u-s-grants-1-2-billion-to-develop-oxy-climeworks-carbon-removal-projects/', 'https://www.esgtoday.com/canada-releases-clean-electricity-regulation-targeting-a-net-zero-grid-by-2035/', 'https://www.esgtoday.com/fmc-commits-30-million-to-support-global-zero-hunger-goal/', 'https://www.esgtoday.com/cleantech-startup-matter-raises-10-milli

In [209]:
len('https://www.esgtoday.com/category/esg-news/energy-transition/')


61

In [210]:
[i for i in hrefs if len(i)>65]

['https://www.esgtoday.com/u-s-grants-1-2-billion-to-develop-oxy-climeworks-carbon-removal-projects/',
 'https://www.esgtoday.com/canada-releases-clean-electricity-regulation-targeting-a-net-zero-grid-by-2035/',
 'https://www.esgtoday.com/fmc-commits-30-million-to-support-global-zero-hunger-goal/',
 'https://www.esgtoday.com/cleantech-startup-matter-raises-10-million-to-apply-microplastics-removal-tech-to-industry/',
 'https://www.esgtoday.com/blackstone-raises-7-billion-for-largest-ever-private-credit-energy-transition-fund/',
 'https://www.esgtoday.com/sustainable-bond-proceeds-disproportionately-allocated-to-climate-mitigation-over-adaptation-fitch/',
 'https://www.esgtoday.com/nuveen-launches-525-million-sustainable-commercial-real-estate-lending-fund/',
 'https://www.esgtoday.com/barclays-appoints-james-edmonds-as-global-head-of-sustainable-project-finance/',
 'https://www.esgtoday.com/u-s-grants-1-2-billion-to-develop-oxy-climeworks-carbon-removal-projects/',
 'https://www.esgtod

In [152]:
fetch_article_text(list_of_articles[1])

'We use cookies and data to If you choose to “Accept all,” we will also use cookies and data to If you choose to “Reject all,” we will not use cookies for these additional purposes. Non-personalized content is influenced by things like the content you’re currently viewing, activity in your active Search session, and your location. Non-personalized ads are influenced by the content you’re currently viewing and your general location. Personalized content and ads can also include more relevant results, recommendations, and tailored ads based on past activity from this browser, like previous Google searches. We also use cookies and data to tailor the experience to be age-appropriate, if relevant. Select “More options” to see additional information, including details about managing your privacy settings. You can also visit g.co/privacytools at any time.'

In [155]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
response = requests.get('https://news.google.com/rss/articles/CBMiRWh0dHBzOi8vd3d3LmZvb2wuY29tL2ludmVzdGluZy8yMDIyLzExLzEwL2JldHRlci1idXktM20tb3ItaG9uZXl3ZWxsL9IBAA?oc=5', headers=headers)  
soup = BeautifulSoup(response.text, 'html.parser')

In [159]:
response.content



In [161]:


gn = GoogleNews()

# it's a fake API key, do not try to use it
gn.top_news(scraping_bee = 'VDOGU7QSK155K66M9UCYRH6J0HYTJ2ZA6U8NH880587JZ4158AUCY0FAX7PZM55LIUHJAYAIIFLR8L9P')

Exception: ScrapingBee status_code: 400 {"errors":{"query":{"custom_google":["If you wish to scrape Google, use the custom_google=True parameter. ! Each requests                     will costs 20 credits !"]}}}


In [180]:
from requests_html import HTMLSession
session = HTMLSession()
r = session.get(url='https://www.esgtoday.com/?s=honeywell')

In [185]:
r.text

'<!DOCTYPE html>\n<html lang="en-US">\n<head>\n        <meta charset="UTF-8">\n        <meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">\n        <style id="tb_inline_styles" data-no-optimize="1">.tb_animation_on{overflow-x:hidden}.themify_builder .wow{visibility:hidden;animation-fill-mode:both}[data-tf-animation]{will-change:transform,opacity,visibility}.themify_builder .tf_lax_done{transition-duration:.8s;transition-timing-function:cubic-bezier(.165,.84,.44,1)}[data-sticky-active].tb_sticky_scroll_active{z-index:1}[data-sticky-active].tb_sticky_scroll_active .hide-on-stick{display:none}@media(min-width:1046px){.hide-desktop{width:0!important;height:0!important;padding:0!important;visibility:hidden!important;margin:0!important;display:table-column!important;background:0!important}}@media(min-width:769px) and (max-width:1045px){.hide-tablet_landscape{width:0!important;height:0!important;padding:0!important;visibility:hidden!important;margin:0!importa

In [182]:
for title in r.html.find('title'):
    print(title.text)

You searched for honeywell - ESG Today


In [176]:
from gnews import GNews

google_news = GNews()
pakistan_news = google_news.get_news('Pakistan')
print(pakistan_news[0])

{'title': 'Pakistan security forces kill 2 after attack on Chinese convoy - Al Jazeera English', 'description': 'Pakistan security forces kill 2 after attack on Chinese convoy  Al Jazeera EnglishPakistan Militants Attack Convoy of Chinese Engineers, AFP Says  BloombergPakistani militants attack convoy of Chinese engineers  CNN', 'published date': 'Sun, 13 Aug 2023 14:15:00 GMT', 'url': 'https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMic2h0dHBzOi8vd3d3LmFsamF6ZWVyYS5jb20vbmV3cy8yMDIzLzgvMTMvcGFraXN0YW4tc2VjdXJpdHktZm9yY2VzLWtpbGwtdHdvLWFmdGVyLWF0dGFjay1vbi1jaGluZXNlLWNvbnZveS1pbi1nd2FkYXLSAXdodHRwczovL3d3dy5hbGphemVlcmEuY29tL2FtcC9uZXdzLzIwMjMvOC8xMy9wYWtpc3Rhbi1zZWN1cml0eS1mb3JjZXMta2lsbC10d28tYWZ0ZXItYXR0YWNrLW9uLWNoaW5lc2UtY29udm95LWluLWd3YWRhcg?oc%3D5&gl=GB&m=0&pc=n&cm=2&hl=en-US&src=1', 'publisher': {'href': 'https://www.aljazeera.com', 'title': 'Al Jazeera English'}}
