# Sustainability Anlysis

# 1 Configuration

In [1]:
import os
import pandas as pd
import PyPDF2
import string
import unicodedata
from tqdm import tqdm

# 2 Convert Sustainability Reports to Text Files

In [2]:
def report_to_text(report):
    """Create a text string of pdf-file content"""
    pdfFileObj = open(f"Reports/{report}.pdf", "rb")
    pdfReader = PyPDF2.PdfReader(pdfFileObj)
    no_pages = len(pdfReader.pages)

    report_text = ""
    for i in range(0,no_pages):
        pageObj = pdfReader.pages[i]
        report_text += pageObj.extract_text()
    
    return report_text

In [3]:
file_names = os.listdir("Reports")
report_file_names = [f[:-4] for f in file_names if f[-4:] == ".pdf"]

for report in tqdm(report_file_names):
    report_text = report_to_text(report)
    print(report)
    f = open(f"Reports_text/{report}.txt", "a")
    f.write(report_text)
    f.close()

 50%|█████     | 1/2 [00:00<00:00,  1.12it/s]

lean_primer


100%|██████████| 2/2 [00:02<00:00,  1.43s/it]

Haglöfs





# 3 Create Sustainability Report Table 

In [4]:
# Create an empty table
report_table = pd.DataFrame(columns=["report_name", "content"])
# List text files in directory
file_names = os.listdir("Reports_text")
text_file_names = [f[:-4] for f in file_names if f[-4:] == ".txt"]
# Append report name and report text for each report
for text_file_name in text_file_names:
    # Read text from report text file
    text_file = open(f"Reports_text/{text_file_name}.txt", "r")
    content = text_file.read()
    content = content.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
    # print(content)
    text_file.close()    
    # Append report name and text to table
    report_row = pd.DataFrame([[text_file_name, content]], columns=["report_name", "content"])
    report_table = pd.concat([report_table, report_row],  axis=0, ignore_index=True)
# Save table as csv file
report_table.to_csv("reports.csv", index=False)

In [5]:
report_table.head()

Unnamed: 0,report_name,content
0,lean_primer,1 www.leanprimer.com Copyright (c) Craig Larma...
1,Haglöfs,1 Haglöfs Sustainability Report Sustainabilit...


# 4 Read Loughran-McDonald MasterD ictionary

In [6]:
master_dictionary = pd.read_csv("Loughran-McDonald_MasterDictionary_1993-2021.csv")

In [7]:
postive_words = master_dictionary[master_dictionary.Positive != 0]["Word"].to_list()
postive_words[:20]

['ABLE',
 'ABUNDANCE',
 'ABUNDANT',
 'ACCLAIMED',
 'ACCOMPLISH',
 'ACCOMPLISHED',
 'ACCOMPLISHES',
 'ACCOMPLISHING',
 'ACCOMPLISHMENT',
 'ACCOMPLISHMENTS',
 'ACHIEVE',
 'ACHIEVED',
 'ACHIEVEMENT',
 'ACHIEVEMENTS',
 'ACHIEVES',
 'ACHIEVING',
 'ADEQUATELY',
 'ADVANCEMENT',
 'ADVANCEMENTS',
 'ADVANCES']

In [8]:
negative_words = master_dictionary[master_dictionary.Negative != 0]["Word"].to_list()
negative_words[-20:]
negative_words[:20]

['ABANDON',
 'ABANDONED',
 'ABANDONING',
 'ABANDONMENT',
 'ABANDONMENTS',
 'ABANDONS',
 'ABDICATED',
 'ABDICATES',
 'ABDICATING',
 'ABDICATION',
 'ABDICATIONS',
 'ABERRANT',
 'ABERRATION',
 'ABERRATIONAL',
 'ABERRATIONS',
 'ABETTING',
 'ABNORMAL',
 'ABNORMALITIES',
 'ABNORMALITY',
 'ABNORMALLY']

# 5 Count Words in Reports

In [9]:
report_table = pd.read_csv("reports.csv")
report_table_count = report_table.copy()

In [10]:
def count_total_words(text):
    """Counts the total number of words in parameter string named text"""
    return sum([i.strip(string.punctuation).isalpha() for i in text.split()])

In [11]:
def count_words_in_text_from_list(text, word_list):
    """Counts the number of words in word_list in text"""
    # Initialize an empty list to store the word counts
    word_counts = {}
    text = text.upper()
    
    # Iterate through each word in the list
    for word in word_list:
      # Count the occurrences of the word in the text using the count() method
      count = text.count(word)
      # Append the count to the word_counts list
      word_counts[word] = count
    
    return word_counts

In [12]:
report_table_count["word_count"] = report_table_count.content.map(count_total_words)
report_table_count["negative_word_count"] = report_table_count.content.map(lambda content: count_words_in_text_from_list(content, negative_words))
report_table_count["positive_word_count"] = report_table_count.content.map(lambda content: count_words_in_text_from_list(content, postive_words))
report_table_count["total_negative_word_count"] = report_table_count["negative_word_count"].map(lambda d: sum(d.values()))
report_table_count["total_positive_word_count"] = report_table_count["positive_word_count"].map(lambda d: sum(d.values()))

In [13]:
report_table_count = report_table_count.drop(columns="content")

In [14]:
report_table_count.to_csv("report_table_count.csv", index=False)

In [15]:
report_table_count.head()

Unnamed: 0,report_name,word_count,negative_word_count,positive_word_count,total_negative_word_count,total_positive_word_count
0,lean_primer,569251,"{'ABANDON': 45, 'ABANDONED': 0, 'ABANDONING': ...","{'ABLE': 1575, 'ABUNDANCE': 0, 'ABUNDANT': 0, ...",23130,19845
1,Haglöfs,367232,"{'ABANDON': 0, 'ABANDONED': 0, 'ABANDONING': 0...","{'ABLE': 2109, 'ABUNDANCE': 0, 'ABUNDANT': 0, ...",19285,10127


In [16]:
print(report_table_count.negative_word_count.map(lambda d: f"{max(d, key=d.get)}: {d[max(d, key=d.get)]}"))
print(report_table_count.positive_word_count.map(lambda d: f"{max(d, key=d.get)}: {d[max(d, key=d.get)]}"))
print(report_table_count[report_table_count.report_name == "Haglöfs"].negative_word_count.map(lambda d: f"{max(d, key=d.get)}: {d[max(d, key=d.get)]}"))
print(report_table_count[report_table_count.report_name == "Haglöfs"].positive_word_count.map(lambda d: f"{max(d, key=d.get)}: {d[max(d, key=d.get)]}"))

0          ILL: 3870
1    INABILITY: 2945
Name: negative_word_count, dtype: object
0    IMPROVE: 4500
1       ABLE: 2109
Name: positive_word_count, dtype: object
1    INABILITY: 2945
Name: negative_word_count, dtype: object
1    ABLE: 2109
Name: positive_word_count, dtype: object


# 6 Add Ranking

In [18]:
# Gets report table with word counts
report_table_count = pd.read_csv("report_table_count.csv")
report_table_count_ranking = report_table_count.copy()
report_table_count_ranking["report_name"] = report_table_count_ranking.report_name.map(lambda r:  unicodedata.normalize("NFC", r))
report_table_count_ranking["report_name"] = report_table_count_ranking.report_name.map(lambda c: c.upper())

# Gets report table with ranking
ranking = pd.read_csv("sample.csv")
ranking["Company"] = ranking.Company.map(lambda c: c.upper())

# Adds ranking column to the report tabe with word counts
report_table_count_ranking = report_table_count_ranking.merge(ranking, left_on="report_name", right_on="Company", how="inner")
report_table_count_ranking.drop(columns=["negative_word_count", "positive_word_count", "Company"], inplace=True)

In [19]:
report_table_count_ranking.to_csv("report_table_count_ranking.csv", index=False)

In [20]:
report_table_count_ranking.head()

Unnamed: 0,report_name,word_count,total_negative_word_count,total_positive_word_count,Ranking
0,LEAN_PRIMER,569251,23130,19845,2
1,HAGLÖFS,367232,19285,10127,17


# 7 Calculate Statistics

In [None]:
# Comment