## Import Packages

In [2]:
import re, csv, sys, nltk, PyPDF2, glob, os
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import pandas as pd

## Import the CSV file and convert it into two dictionaries: one for unigrams and one for bigrams

In [3]:
csv_file = 'ESGword.csv'
unigram_dict = defaultdict(list)
bigram_dict = defaultdict(list)

# Read the ESG keyword lists from the CSV file into the two dictionaries.
# Each column header starts with the topic name; headers containing the word
# "bigrams" feed bigram_dict, every other column feeds unigram_dict.
# newline='' is what the csv module asks for when it is handed a file object.
with open(csv_file, 'r', newline='') as file:
    reader = csv.DictReader(file)

    for row in reader:
        for column_header, column_value in row.items():
            # DictReader files surplus cells under a None key and fills cells
            # missing from short rows with None; neither holds keywords.
            if column_header is None or column_value is None:
                continue
            # A blank header gives no topic to file the words under.
            if not column_header.strip():
                continue

            # Determine whether it's a bigram or unigram based on the column header
            is_bigram = 'bigrams' in column_header.lower()

            # Extract the key (first word of the column header) and convert to lowercase
            key = column_header.split()[0].lower()

            # Add the non-empty, lowercase values to the appropriate dictionary
            values = [value.strip().lower() for value in column_value.split(',') if value.strip()]
            target_dict = bigram_dict if is_bigram else unigram_dict
            target_dict[key].extend(values)

## Functions

### Extract pdf text 

In [4]:
def extractText(filepath):
    """Return the text of every page of a PDF as a single cleaned string.

    Parameters
    ----------
    filepath : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The page texts, each preceded by one space, with every newline
        replaced by a space and every run of digits replaced by a space.
    """
    # The context manager closes the file even when PyPDF2 raises part-way
    # through; pages are extracted inside it because the reader needs the
    # stream to still be open.
    with open(filepath, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfReader(pdfFileObj)
        # Defensive: a falsy extraction result is treated as empty text so
        # the concatenation below cannot fail.
        page_texts = [page.extract_text() or '' for page in pdfReader.pages]

    # One join instead of repeated string concatenation in a loop.
    combined_text = ''.join(' ' + page_text for page_text in page_texts)

    text = re.sub("\n", " ", combined_text)
    text = re.sub(r'\d+', ' ', text)
    return text


### Print Frequency Distribution

In [5]:
def print_fdist(items, num_common=20):
    """Print the most frequent entries of ``items`` and return the distribution.

    Builds an ``nltk.FreqDist`` over ``items``, prints its ``num_common`` most
    common ``(item, count)`` pairs one per line followed by a blank line, and
    returns the ``FreqDist``.
    """
    distribution = nltk.FreqDist(items)
    top_entries = distribution.most_common(num_common)
    print("\n".join(str(entry) for entry in top_entries))
    print("")
    return distribution

### Function to tokenize all the words

In [6]:
def alltokens(text):
    """Lower-case ``text`` and split it into tokens with ``nltk.word_tokenize``."""
    return nltk.word_tokenize(text.lower())

In [7]:
# Punctuation tokens that should never count as content words.
punct_list = [".", ",", "?", "“", "”", "’", ";", "!", '—', '‘',"''",'``','%','•', "*", '$', ':', '&', '(', ')', '|','<','-','–','/']

# Extra words appended to NLTK's English stop words, in three groups.
extra_general_words = ["much", "like", "one", "many", "though", "without", "upon","also","'s","may","across","part", "percent","could","would",'often','usually']
extra_report_words = ['data','nr','fiscal','table','index','u.s.','reporting','company','percentage','statements','includes','use']
extra_layout_words = ['total','business','continued','content','including','overview','year','number','category','pp','new','gid','/gid','pages','page','nr','p.']

# Kept as a list (not a set) because later code concatenates it with `+`.
stoplist = [
    *stopwords.words('english'),
    *punct_list,
    *extra_general_words,
    *extra_report_words,
    *extra_layout_words,
]

### Create Pie Chart

In [26]:
def autopct_format(values):
    """Build a label formatter that shows a percentage and its absolute count.

    The returned callable takes a percentage ``pct`` and returns a two-line
    string: the percentage with one decimal place, then in parentheses the
    count obtained by applying ``pct`` to ``sum(values)`` and rounding to the
    nearest integer.
    """
    def format_slice(pct):
        absolute = pct * sum(values) / 100.0
        return '{:.1f}%\n({v:d})'.format(pct, v=int(round(absolute)))
    return format_slice
    
def pie_chart(freqDist, keyword_dict, filepath):
    """Save a pie chart of environmental / social / governance keyword counts.

    Parameters
    ----------
    freqDist : iterable
        Entries whose first element is a word (or bigram string) and whose
        second element is its count.
    keyword_dict : mapping
        Maps each of "environmental", "social" and "governance" to a
        collection of lower-case keywords.
    filepath : str
        Where the chart image is written.

    Notes
    -----
    A word listed under several topics is counted for each of them. When no
    keyword matches at all, a message is printed and no file is written,
    because the pie cannot be normalised when the counts sum to zero.
    """
    topics = ["environmental", "social", "governance"]
    # Sets make each membership test constant-time instead of a list scan.
    keyword_sets = {topic: set(keyword_dict[topic]) for topic in topics}
    counts = dict.fromkeys(topics, 0)
    for dist in freqDist:
        word, count = dist[0], dist[1]
        lowered = word.lower()
        for topic in topics:
            if lowered in keyword_sets[topic]:
                counts[topic] += count

    y = np.array([counts[topic] for topic in topics])
    if y.sum() == 0:
        # Normalising all-zero counts divides by zero and yields NaN labels.
        print(f"No ESG keywords found; skipping pie chart {filepath}")
        return

    textprops = {"fontsize":15}
    colors=["forestgreen","cornflowerblue","darkorange"]

    # Explicit figure so the chart never lands on a figure left open elsewhere.
    fig, ax = plt.subplots()
    ax.pie(y, labels=topics, autopct=autopct_format(y), colors=colors, textprops=textprops)
    fig.savefig(filepath)
    plt.close(fig)

### Save WordCloud

In [9]:
def saveWordCloud(text, company_name,types):
    """Render a word cloud of ``text`` and save it as a PNG.

    Parameters
    ----------
    text : str
        Space-separated words to draw.
    company_name : str
        Used as the first part of the output file name.
    types : str
        Sub-folder of ``WordClouds`` to write into; also appended to the
        file name.

    The image is written to ``WordClouds/{types}/{company_name}_{types}.png``.
    """
    wordcloud = WordCloud(max_font_size=40).generate(text)

    filepath = f"WordClouds/{types}/{company_name}_{types}.png"
    # Make sure the target folder exists so saving does not depend on the
    # caller having created it first.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    fig = plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    fig.savefig(filepath,bbox_inches='tight', pad_inches=0)
    plt.close(fig)

### Analyze Unigram with Company Name (Create WordCloud, Frequency Distribution, PieChart)

In [10]:
def analyzeUnigram(tokens, company_name):
    """Print unigram statistics and save a word cloud and an ESG pie chart.

    Parameters
    ----------
    tokens : list of str
        Tokens of one document (or of several documents combined).
    company_name : str
        Label used in the printed output and in the output file names.

    Returns
    -------
    tuple
        ``(no_token, no_type, no_contenttoken, no_lemmas)``: the number of
        tokens, of distinct tokens, of content tokens (not in ``stoplist``
        and containing no digit) and of lemmas. Each content token yields
        exactly one lemma, so the last two numbers are equal.
    """
    no_token = len(tokens)
    print(f"The number of tokens is {no_token}")
    no_type = len(set(tokens))
    print(f"The number of type is {no_type}")

    # Take out Stopwords from Tokens.
    # A set gives constant-time lookups; scanning the stop list for every
    # token is needlessly slow on long reports.
    stop_set = set(stoplist)
    allcontenttokens = []
    for w in tokens:
        lowered = w.lower()
        if lowered not in stop_set and not re.search(r'\d', w):
            allcontenttokens.append(lowered)
    no_contenttoken = len(allcontenttokens)
    print(f"The number of content tokens is {no_contenttoken}")

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    all_lemmas = [lemmatizer.lemmatize(w) for w in allcontenttokens]
    print(f"Unigram analysis for {company_name}")
    no_lemmas = len(all_lemmas)
    print(f"The number of lemmas is {no_lemmas}\n")
    text = " ".join(all_lemmas)

    # Create WordCloud
    saveWordCloud(text, company_name,"With_Company_Name")

    # Create Frequency Distribution (prints the most common lemmas)
    freqdist = print_fdist(all_lemmas)
    freqdist = list(freqdist.items())

    # Create a Pie Chart
    pie_chart(freqdist, unigram_dict, f"PieChartsUnigram/{company_name}.png")
    return no_token, no_type, no_contenttoken, no_lemmas

### Analyze Unigram without Company names (Create WordCloud)

In [11]:
def analyzeUnigram_without_companyname(tokens, company_name,clist):
    """Save a word cloud of the lemmas left after removing the words in ``clist``.

    Parameters
    ----------
    tokens : list of str
        Tokens of one document (or of several documents combined).
    company_name : str
        Label used in the output file name.
    clist : iterable of str
        Lower-case words to exclude; tokens containing a digit are excluded
        as well.
    """
    # A set gives constant-time lookups instead of a scan of clist per token.
    excluded = set(clist)
    allcontenttokens = []
    for w in tokens:
        lowered = w.lower()
        if lowered not in excluded and not re.search(r'\d', w):
            allcontenttokens.append(lowered)

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    all_lemmas = [lemmatizer.lemmatize(w) for w in allcontenttokens]
    text = " ".join(all_lemmas)

    # Create WordCloud
    saveWordCloud(text,company_name,"No_Company_Name")

### Analyze Bigrams (Create Frequency Distribution, PieChart)

In [12]:
def analyzeBigram(tokens, company_name):
    """Print bigram statistics and save an ESG pie chart of bigram counts.

    Parameters
    ----------
    tokens : list of str
        Tokens of one document (or of several documents combined).
    company_name : str
        Label used in the printed output and in the output file name.

    A bigram is kept only when neither of its two tokens is in ``stoplist``.
    The chart is written to ``PieChartsBigram/{company_name}.png``.
    """
    # A set gives constant-time lookups; each bigram needs two of them.
    stop_set = set(stoplist)
    bigramlist_new = [
        bi for bi in nltk.ngrams(tokens, 2)
        if bi[0] not in stop_set and bi[1] not in stop_set
    ]

    print(f"Bigram analysis for {company_name}")
    print(f"The number of content bigrams is {len(bigramlist_new)}")

    bigramlist_stringify = [pair[0] + " " + pair[1] for pair in bigramlist_new]
    # Prints the most common bigrams and returns the full distribution.
    freqdist = print_fdist(bigramlist_stringify)
    freqdist = list(freqdist.items())

    pie_chart(freqdist, bigram_dict, f"PieChartsBigram/{company_name}.png")


### Create a folder and replace it if it exists

In [13]:
import os, shutil
def create_replace_file(foldername):
    """Create ``foldername`` as an empty folder.

    If the path already exists it is removed with everything inside it
    before the folder is created again.
    """
    already_present = os.path.exists(foldername)
    if already_present:
        shutil.rmtree(foldername)
    os.makedirs(foldername)


## Main function

### Create 1 txt file for each industry and each company  

In [17]:
# main function
company_list = []

create_replace_file("Textfiles")
for industry in glob.iglob("Reports/*"):
    # basename works with either path separator, unlike split("/")
    industry_name = os.path.basename(industry)

    # Folder holding one text file per company of this industry
    os.makedirs(os.path.join("Textfiles", industry_name), exist_ok=True)

    # Text file combining the text of all companies of the industry; the
    # context manager guarantees it is flushed and closed.
    with open(f"Textfiles/{industry_name}.txt", "a") as ind_file:
        for company in glob.iglob(f"{industry}/*"):
            # splitext removes only the extension. str.strip(".pdf") strips
            # any of the characters '.', 'p', 'd', 'f' from both ends, which
            # would turn "Gap.pdf" into "Ga".
            company_name = os.path.splitext(os.path.basename(company))[0]
            lowercase_company_name = company_name.lower()
            company_list.append(lowercase_company_name)

            # Extract the Text for each company
            text = extractText(company)
            with open(f"Textfiles/{industry_name}/{company_name}.txt", "w") as file:
                file.write(text)
                print(f"\n\n{company_name} text extracted")
            ind_file.write(text)
    print(f"\n\n{industry_name} text extraction complete")



Cisco text extracted


Apple text extracted


Amazon text extracted


Technology text extraction complete


Ross text extracted


Target text extracted


Tjx text extracted


Retail text extraction complete


GeneralMills text extracted


Hershey text extracted


KraftHeinz text extracted


Food text extraction complete


American Airline text extracted


Delta text extracted


Southwest Airlines text extracted


Airlines text extraction complete


Disney text extracted


Netflix text extracted


Fox text extracted


Media text extraction complete


In [16]:
# main function
pdf_texts = {}
company_list = []

# Rebuild the list of lower-case company names from the report file names
# without extracting the PDFs again. No file is opened here: the earlier
# open(..., "a") handle was never written to and never closed.
for industry in glob.iglob("Reports/*"):
    for company in glob.iglob(f"{industry}/*"):
        # splitext removes only the extension; str.strip(".pdf") strips a
        # character set and can eat letters of the name itself.
        company_name = os.path.splitext(os.path.basename(company))[0]
        company_list.append(company_name.lower())

### Create a list of company names

In [17]:
print(company_list)

# Add name fragments and report artefacts that the file names do not supply.
company_list += ["mills","kraft","heinz","southwest","american","2022","gid","nr"]

# Stop words followed by company names: the exclusion list for the
# "No_Company_Name" word clouds.
stop_and_company_list = [*stoplist, *company_list]

['cisco', 'apple', 'amazon', 'ross', 'target', 'tjx', 'generalmills', 'hershey', 'kraftheinz', 'american airline', 'delta', 'southwest airlines', 'disney', 'netflix', 'fox']


### Create diagrams for each company and industry

In [27]:
# Output folders; created once up front since they do not depend on the loop.
os.makedirs("WordClouds", exist_ok=True)
os.makedirs("PieChartsUnigram", exist_ok=True)
os.makedirs("PieChartsBigram", exist_ok=True)
os.makedirs(os.path.join("WordClouds", "With_Company_Name"), exist_ok=True)
os.makedirs(os.path.join("WordClouds", "No_Company_Name"), exist_ok=True)

comp_list = []      # one statistics record per company
industry_list = []  # one statistics record per industry
for industry in glob.iglob("Textfiles/*"):
    # Only the per-industry folders are analysed here; the combined
    # <industry>.txt files next to them are skipped.
    if not os.path.isdir(industry):
        continue

    print(industry)
    industry_name = os.path.basename(industry)
    industry_tokens = []

    for company in glob.iglob(f"{industry}/*"):
        print(company)

        # splitext keeps dots inside the name; split(".")[0] would cut the
        # name at its first dot.
        company_name = os.path.splitext(os.path.basename(company))[0]

        # get text from txt file of words extracted from pdf reports;
        # open the globbed path itself rather than rebuilding it from parts
        with open(company, "r") as file:
            text = file.read()

        # Save the content tokens for each company
        print(f"Starting analysis for {company_name}")
        text_tokens = alltokens(text)
        industry_tokens.extend(text_tokens)

        comp_token, comp_type, comp_contenttoken, comp_lemmas = analyzeUnigram(text_tokens, company_name)
        comp_list.append({'Industry': industry_name, 'Company': company_name, 'Token': comp_token, 'Type': comp_type, 'Content Token':comp_contenttoken, 'Lemmas':comp_lemmas})

        analyzeUnigram_without_companyname(text_tokens, company_name,stop_and_company_list)
        analyzeBigram(text_tokens, company_name)

    # Same analysis on the tokens of all companies of the industry combined
    industry_token, industry_type, industry_contenttoken, industry_lemmas = analyzeUnigram(industry_tokens, industry_name)
    industry_list.append({'Industry': industry_name, 'Token': industry_token, 'Type': industry_type, 'Content Token':industry_contenttoken, 'Lemmas':industry_lemmas})

    analyzeUnigram_without_companyname(industry_tokens, industry_name, stop_and_company_list)
    analyzeBigram(industry_tokens, industry_name)

#Create CSV file so easier to compare numbers and transfer to Google Slides
comp_df = pd.DataFrame(comp_list)
comp_df.to_csv("company_stats.csv", index=False)

industry_df = pd.DataFrame(industry_list)
industry_df.to_csv("industry_stats.csv", index=False)


print("Done")

Textfiles/Technology
Textfiles/Technology/Cisco.txt
Starting analysis for Cisco
The number of tokens is 24178
The number of type is 3482
The number of content tokens is 12382
Unigram analysis for Cisco
The number of lemmas is 12382

('cisco', 370)
('power', 117)
('inclusive', 108)
('esg', 106)
('employee', 94)
('purpose', 91)
('hub', 89)
('product', 87)
('impact', 83)
('report', 82)
('goal', 76)
('people', 74)
('community', 71)
('u', 70)
('work', 66)
('supplier', 65)
('program', 61)
('global', 60)
('technology', 59)
('future', 57)

Bigram analysis for Cisco
The number of content bigrams is 5286
('purpose report', 59)
('cisco purpose', 56)
('intro power', 56)
('power inclusive', 56)
('inclusive cisco', 55)
('human rights', 26)
('social justice', 24)
('future intro', 23)
('networking academy', 20)
('supply chain', 18)
('circular design', 17)
('inclusive future', 16)
('ghg emissions', 16)
('hybrid work', 16)
('climate change', 15)
('cisco networking', 15)
('community impact', 15)
('fy bas

## TF-IDF for each industry

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import pandas as pd
import glob

# One combined text file per industry; sorted so the document order does not
# depend on the order in which the file system lists the files.
text_files = sorted(glob.glob("Textfiles/*.txt"))
file_names = [Path(text).stem for text in text_files]

tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words='english')
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

# Wide table: one row per document, one column per term.
tfidf_wide = pd.DataFrame(tfidf_vector.toarray(), index=file_names, columns=tfidf_vectorizer.get_feature_names_out())

# Drop the company-name terms, then reshape to one row per (document, term).
# Dropping the columns already removes every term in company_list, so no
# further filtering on 'term' is needed afterwards.
tfidf_df = (
    tfidf_wide
    .drop(columns=company_list, errors='ignore')
    .stack()
    .reset_index()
    .rename(columns={0: 'tfidf', 'level_0': 'document', 'level_1': 'term'})
)

# The ten highest-scoring terms of each document.
top_tfidf = tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10)

In [21]:
## Create a heat map
import altair as alt
import numpy as np


# adding a little randomness to break ties in term ranking; the generator is
# seeded so that re-running the notebook produces the same ranking
top_tfidf_plusRand = top_tfidf.copy()
rng = np.random.default_rng(42)
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + rng.random(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# text labels, white for darker heatmap colors (tfidf of at least 0.23)
text = base.mark_text(baseline='middle', fontSize=14).encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the two superimposed layers: heatmap and text labels
(heatmap + text).properties(width=900,height = 350)