In [5]:
from dash import Dash, html, dcc
from src.components import ids
import base64
from dash.dependencies import Input, Output, State
import pdftotext
import io 
import PyPDF2
from PyPDF2 import PdfReader
import fitz  # Import PyMuPDF as fitz

import ipywidgets as widgets
from IPython.display import display

In [15]:
pdf = './data/2022_Apple_ESG_Report.pdf'
 # Convert the contents (binary string) to bytes




def on_upload(change):
    if change['type'] == 'change' and change['name'] == 'value':
        uploaded_file = change['new']
        file_name, content_dict = next(iter(uploaded_file.items()))

        # Access the content of the uploaded file
        content = content_dict['content']
        decoded_pdf = base64.b64decode(content)

        # Use PyMuPDF (fitz) to extract the text from the file-like object
        text = ""
        with fitz.open(stream=io.BytesIO(decoded_pdf), filetype="pdf") as pdf_document:
            for page_num in range(pdf_document.page_count):
                page = pdf_document.load_page(page_num)
                text += page.get_text()

        # Perform desired operations with the extracted text
        print(f"File Name: {file_name}")
        print("Extracted Text:")
        print(text)

file_upload = widgets.FileUpload()
file_upload.observe(on_upload)
display(file_upload)

FileUpload(value=(), description='Upload')

In [2]:
import numpy as np
import pandas as pd
random_sentences = [
    "This is a random sentence.",
    "Here is another random sentence.",
    "These strings are in Python format.",
    "They can be used in your code.",
    "This sentence has a bullet point at the end.• It splits into two sentences.",
    "We are generating random strings.",
    "12345 is a number.",
    "The quick brown fox jumps over the lazy dog.",
    "These sentences contain special characters: $%^&*()",
    "Remember to use correct punctuation!",
]

df = pd.DataFrame(random_sentences)

In [3]:
df

Unnamed: 0,0
0,This is a random sentence.
1,Here is another random sentence.
2,These strings are in Python format.
3,They can be used in your code.
4,This sentence has a bullet point at the end.• ...
5,We are generating random strings.
6,12345 is a number.
7,The quick brown fox jumps over the lazy dog.
8,These sentences contain special characters: $%...
9,Remember to use correct punctuation!


In [4]:
def split_cell_at_bullets(df: pd.DataFrame) -> pd.DataFrame:
    list_of_strings = list(df.iloc[:,0])
    bullet_point_character = '•'  # Update this with the desired bullet point character
    new_list_of_sentences = []
    for sentence in list_of_strings:
        if bullet_point_character in sentence:
            sentence_split = sentence.split(bullet_point_character)
            for new_sentence in sentence_split:
                new_list_of_sentences.append(new_sentence)
        else:
            new_list_of_sentences.append(sentence)
    return new_list_of_sentences

In [5]:
df

Unnamed: 0,0
0,This is a random sentence.
1,Here is another random sentence.
2,These strings are in Python format.
3,They can be used in your code.
4,This sentence has a bullet point at the end.• ...
5,We are generating random strings.
6,12345 is a number.
7,The quick brown fox jumps over the lazy dog.
8,These sentences contain special characters: $%...
9,Remember to use correct punctuation!


In [39]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("nbroad/ESG-BERT")

model = AutoModelForSequenceClassification.from_pretrained("nbroad/ESG-BERT")

#list of 26 ESG metrics which ESGBert can classify a text into. 
topic_list = ['Business_Ethics',
 'Data_Security',
 'Access_And_Affordability',
 'Business_Model_Resilience',
 'Competitive_Behavior',
 'Critical_Incident_Risk_Management',
 'Customer_Welfare',
 'Director_Removal',
 'Employee_Engagement_Inclusion_And_Diversity',
 'Employee_Health_And_Safety',
 'Human_Rights_And_Community_Relations',
 'Labor_Practices',
 'Management_Of_Legal_And_Regulatory_Framework',
 'Physical_Impacts_Of_Climate_Change',
 'Product_Quality_And_Safety',
 'Product_Design_And_Lifecycle_Management',
 'Selling_Practices_And_Product_Labeling',
 'Supply_Chain_Management',
 'Systemic_Risk_Management',
 'Waste_And_Hazardous_Materials_Management',
 'Water_And_Wastewater_Management',
 'Air_Quality',
 'Customer_Privacy',
 'Ecological_Impacts',
 'Energy_Management',
 'GHG_Emissions']

def score_sentences(list_of_sentences: list) -> dict:
    # the below dictionary is the outer one, where the keys are the sentences, and the values are the inner dictionary (nested)
    topic_probs_outer_dict = {}
    for sentence in list_of_sentences:
        topic_probs_dictionary = {}
        # Get topic probabilities for each document
        encoded_docs = tokenizer.batch_encode_plus([sentence], padding=True, truncation=True, return_tensors='pt')
        outputs = model(encoded_docs['input_ids'], attention_mask=encoded_docs['attention_mask'])
        probs = outputs[0].softmax(dim=1)
        probability_list = list(probs[0].detach().numpy())
        for topic, probability in zip(topic_list, probability_list):
            topic_probs_dictionary[topic] = probability
            topic_probs_outer_dict[sentence] = topic_probs_dictionary
    return topic_probs_outer_dict



In [40]:
string_test = ["Our Full Material Disclosure (FMD) program maps the chemicals used in our products — an effort that includes tens of thousands of parts and assemblies. Then we look at how our products are manufactured. Our Chemical Safety Disclosure (CSD) program engages with supply chain partners to get the most recent information on which materials are in use. This information includes the volume of materials being consumed and how they’re being applied, stored, and handled — as well as the steps being taken to protect employees. More than 1000 supplier facilities have shared their chemical inventories as well as storage and safety protocols as part of our CSD program. Through this process, we’ve identified more than 17,000 chemicals and applications. These efforts contribute to a safer work environment for people across our supply chain." \
               ,"Smarter chemistry The well-being of our employees, customers, people in our supply chain, and the planet is a priority for Apple, which is why we’re committed to using safer materials to create safer products. This commitment requires diligent work — to build a comprehensive picture of chemicals across our supply chain, to insist on rigorous chemical management processes, to promote adoption of safer chemical alternatives, and to innovate through design smarter approaches to making our products. Using safer chemistry in our products also enables recycling and material recovery, so that our products can be the raw materials for the next generation."]
dic = score_sentences(string_test)

In [51]:
nested_dict = score_sentences(string_test)
df = pd.DataFrame.from_dict(nested_dict,  orient = 'index').reset_index().rename(columns={'index':'sentence'})

In [52]:
df

Unnamed: 0,sentence,Business_Ethics,Data_Security,Access_And_Affordability,Business_Model_Resilience,Competitive_Behavior,Critical_Incident_Risk_Management,Customer_Welfare,Director_Removal,Employee_Engagement_Inclusion_And_Diversity,...,Selling_Practices_And_Product_Labeling,Supply_Chain_Management,Systemic_Risk_Management,Waste_And_Hazardous_Materials_Management,Water_And_Wastewater_Management,Air_Quality,Customer_Privacy,Ecological_Impacts,Energy_Management,GHG_Emissions
0,Our Full Material Disclosure (FMD) program map...,0.003879,0.005011,0.008095,0.004541,0.004227,0.002579,0.016142,0.005656,0.004234,...,0.027161,0.026706,0.004515,0.023534,0.001835,0.004354,0.005801,0.004739,0.005854,0.002524
1,Smarter chemistry The well-being of our employ...,0.000781,0.000912,0.001788,0.001985,0.000639,0.001094,0.005809,0.002229,0.001186,...,0.007055,0.003452,0.00061,0.004753,0.001036,0.000798,0.001884,0.00245,0.002434,0.00077


In [61]:
user_selection = 'Business_Ethics'

max_row = df[user_selection].argmax()
sentence = df.loc[max_row,'sentence']
print(sentence)

Our Full Material Disclosure (FMD) program maps the chemicals used in our products — an effort that includes tens of thousands of parts and assemblies. Then we look at how our products are manufactured. Our Chemical Safety Disclosure (CSD) program engages with supply chain partners to get the most recent information on which materials are in use. This information includes the volume of materials being consumed and how they’re being applied, stored, and handled — as well as the steps being taken to protect employees. More than 1000 supplier facilities have shared their chemical inventories as well as storage and safety protocols as part of our CSD program. Through this process, we’ve identified more than 17,000 chemicals and applications. These efforts contribute to a safer work environment for people across our supply chain.


In [62]:
list = ['hi my name', 'is','henry','and']
import random 
random.sample(list,2)

['is', 'hi my name']

In [15]:
import pdftotext
import io

pdf_loc = '../../data/2022_Apple_ESG_Report.pdf'
contents = []
with open(pdf_loc, "rb") as f:
    pdf = pdftotext.PDF(f)

for page in pdf:
    contents.append(page)


In [30]:
len(' '.join(contents).split('\n\n')[0])

62

In [33]:
' '.join(contents).split('\n\n')[0].split(' ')

['Environmental\nSocial\nGovernance\nReport\nApple’s', '2022', 'ESG', 'Report']

In [40]:
new_li = [i for i in ((' '.join(contents).split('\n\n'))) if len(i.split(' '))>10]
new_li

['30\t\x07Workplace practices\nand policies\n33\t\x07Health and safety\nat Apple\nSuppliers\n37\t\x07Our approach\n40\t\x07Labor and human rights\nin the supply chain\n43\t\x07Health, safety,\nand wellness\n44\t\x07Responsible materials\nsourcing',
 '59 Our approach\n60\t\x07Racial Equity\nand Justice Initiative\n62 Education\n64\t\x07Affordable housing\ninitiative\n65 Corporate donations\n67 Employee giving\n68 Job creation',
 'This report contains forward-looking statements and actual results may differ. Numbers and percentages in this report include estimates or approximations and may be based on assumptions. For more information, see “About the report.”',
 'At Apple, we believe the measure of any great innovation is the\npositive impact it has on people’s lives. It’s why we work every\nday to make our technology an even greater force for good.\nToday, our teams around the world infuse Apple’s deeply held values into everything we make. That work can\ntake many forms. But whether we

In [48]:
print(spacy.util.get_data_path())



/Users/henryparemain/opt/anaconda3/envs/Capstone/lib/python3.7/site-packages/spacy/data


In [50]:
import spacy

print(spacy.util.get_data_path())


/Users/henryparemain/opt/anaconda3/envs/Capstone/lib/python3.7/site-packages/spacy/data


In [51]:
import spacy

nlp = spacy.load("/Users/henryparemain/opt/anaconda3/envs/Capstone/lib/python3.7/site-packages/spacy/data")


OSError: [E053] Could not read meta.json from /Users/henryparemain/opt/anaconda3/envs/Capstone/lib/python3.7/site-packages/spacy/data/meta.json

In [52]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')  # Download the Punkt tokenizer models (only once)

text = "Your text with multiple sentences. Each sentence will be split."
sentences = sent_tokenize(text)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/henryparemain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [53]:
sentences

['Your text with multiple sentences.', 'Each sentence will be split.']

In [55]:
test_sentences = ' '.join(new_li)

In [56]:
sentences = sent_tokenize(test_sentences)

In [57]:
sentences

['30\t\x07Workplace practices\nand policies\n33\t\x07Health and safety\nat Apple\nSuppliers\n37\t\x07Our approach\n40\t\x07Labor and human rights\nin the supply chain\n43\t\x07Health, safety,\nand wellness\n44\t\x07Responsible materials\nsourcing 59 Our approach\n60\t\x07Racial Equity\nand Justice Initiative\n62 Education\n64\t\x07Affordable housing\ninitiative\n65 Corporate donations\n67 Employee giving\n68 Job creation This report contains forward-looking statements and actual results may differ.',
 'Numbers and percentages in this report include estimates or approximations and may be based on assumptions.',
 'For more information, see “About the report.” At Apple, we believe the measure of any great innovation is the\npositive impact it has on people’s lives.',
 'It’s why we work every\nday to make our technology an even greater force for good.',
 'Today, our teams around the world infuse Apple’s deeply held values into everything we make.',
 'That work can\ntake many forms.',
 'But

In [64]:
new_li

['30\t\x07Workplace practices\nand policies\n33\t\x07Health and safety\nat Apple\nSuppliers\n37\t\x07Our approach\n40\t\x07Labor and human rights\nin the supply chain\n43\t\x07Health, safety,\nand wellness\n44\t\x07Responsible materials\nsourcing',
 '59 Our approach\n60\t\x07Racial Equity\nand Justice Initiative\n62 Education\n64\t\x07Affordable housing\ninitiative\n65 Corporate donations\n67 Employee giving\n68 Job creation',
 'This report contains forward-looking statements and actual results may differ. Numbers and percentages in this report include estimates or approximations and may be based on assumptions. For more information, see “About the report.”',
 'At Apple, we believe the measure of any great innovation is the\npositive impact it has on people’s lives. It’s why we work every\nday to make our technology an even greater force for good.\nToday, our teams around the world infuse Apple’s deeply held values into everything we make. That work can\ntake many forms. But whether we

In [62]:
test = [sentence.replace('\n', ' ') for sentence in sentences]
new = []
for i in test:
    if '\t' in i:
        new.append(i)

In [63]:
new

['30\t\x07Workplace practices and policies 33\t\x07Health and safety at Apple Suppliers 37\t\x07Our approach 40\t\x07Labor and human rights in the supply chain 43\t\x07Health, safety, and wellness 44\t\x07Responsible materials sourcing 59 Our approach 60\t\x07Racial Equity and Justice Initiative 62 Education 64\t\x07Affordable housing initiative 65 Corporate donations 67 Employee giving 68 Job creation This report contains forward-looking statements and actual results may differ.',
 'Read the Apple Human Rights Policy \t\x07Our supplier requirements contain strict standards for responsible labor recruitment, and apply to all suppliers, protecting workers globally.',
 '\x07 ead the Apple Supplier Code of Conduct R and Supplier Responsibility Standards \t\x07To address forced labor risks at its roots, we know that our work has to begin before people enter our supply chain.',
 'Read our Annual Progress Report \t\x07This disclosure is a specialized filing that focuses specifically on our e

In [68]:
experi = '-––> Continue reading on page 38 Expanded our investment in\neducational opportunities\nthrough our Supplier Employee\nDevelopment Fund\nWe announced our Supplier Employee\nDevelopment Fund, which will expand our\nbest-in-class labor programs in our supply\nchain and establish a global Education Hub\nto scale the expansion of the technical and\nprofessional skills necessary for the jobs of\ntoday and tomorrow in our supply chain and\nsurrounding communities.'
experi = experi.replace('\n',' ')

In [69]:
test = sent_tokenize(experi)

In [70]:
test

['-––> Continue reading on page 38 Expanded our investment in educational opportunities through our Supplier Employee Development Fund We announced our Supplier Employee Development Fund, which will expand our best-in-class labor programs in our supply chain and establish a global Education Hub to scale the expansion of the technical and professional skills necessary for the jobs of today and tomorrow in our supply chain and surrounding communities.']

In [71]:
experi = '-––> Continue reading on page 38 Expanded our investment in\neducational opportunities\nthrough our Supplier Employee\nDevelopment Fund\nWe announced our Supplier Employee\nDevelopment Fund, which will expand our\nbest-in-class labor programs in our supply\nchain and establish a global Education Hub\nto scale the expansion of the technical and\nprofessional skills necessary for the jobs of\ntoday and tomorrow in our supply chain and\nsurrounding communities.'
print(experi)

-––> Continue reading on page 38 Expanded our investment in
educational opportunities
through our Supplier Employee
Development Fund
We announced our Supplier Employee
Development Fund, which will expand our
best-in-class labor programs in our supply
chain and establish a global Education Hub
to scale the expansion of the technical and
professional skills necessary for the jobs of
today and tomorrow in our supply chain and
surrounding communities.


Patterns to look out for and remove:

* number of words between \n characters (4 or less words between \n characters and delete)
* non 