In [1]:
from tabula import read_pdf
from tabulate import tabulate
import pandas as pd
import numpy as np
import re

import pdfplumber
import openpyxl


# Table Extraction and Mapping (SDG-GRI Link)

In [None]:
# Read the table on page 3 of the SDG-GRI mapping PDF. `area` limits the
# extraction to one region of the page (top, left, bottom, right per tabula's
# documentation). The result is indexed as df[0] by the following cells, i.e.
# `df` is a list whose first element is the extracted table.
sdg_gri_pdf = 'ESG-Frameworks/Mapping-Standards/sdg-gri.pdf'
table_area = [80.51, 90.42, 561.96, 814.18]
df = read_pdf(
    sdg_gri_pdf,
    pages=3,
    area=table_area,
    stream=True,
    multiple_tables=False,
)


In [None]:
# Pages 4-72 are extracted with the same settings as page 3 and stacked below
# the table already held in df[0].
# DataFrame.append was removed in pandas 2.0, so the per-page tables are
# collected first and concatenated once with pd.concat; this also avoids
# re-copying the ever-growing frame on each iteration.
page_frames = [
    read_pdf(
        'ESG-Frameworks/Mapping-Standards/sdg-gri.pdf',
        stream=True,
        pages=page,
        area=[80.51, 90.42, 561.96, 814.18],
        multiple_tables=False,
    )[0]
    for page in range(4, 73)
]
df[0] = pd.concat([df[0], *page_frames], ignore_index=True)
    

In [None]:
# Rename the 'Sources' column to 'Source'; rename() leaves the frame unchanged
# when no column carries that name.
df[0] = df[0].rename(columns={'Sources': 'Source'})

In [None]:
# Pages 74-98 are extracted with the same settings and stacked below df[0].
# NOTE(review): page 73 is not read by any cell (ranges are 4-72 and 74-98) --
# presumably intentional; confirm.
# DataFrame.append was removed in pandas 2.0, so the per-page tables are
# collected first and concatenated once with pd.concat.
page_frames = [
    read_pdf(
        'ESG-Frameworks/Mapping-Standards/sdg-gri.pdf',
        stream=True,
        pages=page,
        area=[80.51, 90.42, 561.96, 814.18],
        multiple_tables=False,
    )[0]
    for page in range(74, 99)
]
df[0] = pd.concat([df[0], *page_frames], ignore_index=True)
    

In [None]:
# Persist the combined SDG-GRI table; the row index is written as the first
# CSV column (pandas default).
sdg_gri_csv = 'ESG-Frameworks/Mapping-Standards/SDG-GRI/SDG-GRI-DF.csv'
df[0].to_csv(sdg_gri_csv)

## Structuring Dataframe SDG-GRI

## Solving Disclosure Column Extraction Issue

In [None]:
# df['Target_'] = df['Target']

In [None]:
# After the extraction cells `df` is the list returned by read_pdf (the table
# lives in df[0]); selecting columns on the list itself raises TypeError on a
# fresh Restart & Run All. Unwrap it first -- a DataFrame passes through as is.
if isinstance(df, list):
    df = df[0]

# Keep only the columns used for the target -> disclosure mapping.
df = df[["Target", "Available Business Disclosures", "Disclosure"]]

In [None]:
# Merge rows that belong to the same target. Empty-string 'Target' cells are
# masked to NaN and forward-filled with the last target above them (presumably
# these are continuation rows of a multi-line table cell). The remaining
# columns are then joined with a single space within each target group.
target_key = df['Target'].mask(df['Target'] == '').ffill()
text_columns = df.drop(labels='Target', axis=1)
df = text_columns.groupby(target_key).agg(' '.join).reset_index()



In [None]:
# df.rename(columns = {'Target_':'Target'}, inplace = True)

In [None]:
# Save the restructured target/disclosure table; the row index is written as
# the first CSV column (pandas default).
structured_csv = 'ESG-Frameworks/Mapping-Standards/SDG-GRI#2.csv'
df.to_csv(structured_csv)


## Structuring Dataframe SDG-GRI

## Mapping Collected Dataframe (SDG-GRI)

In [None]:
# Fill column 3 of the "SDG's" sheet with the 'Disclosure' text mapped to the
# SDG target code found in column 1 of the same row, then save the workbook
# under a new name (the input workbook is not overwritten).
pdf_tables_path = 'ESG-Frameworks/Mapping-Standards/SDG-GRI.csv'
excelFile = 'ESG-Frameworks/Outputs/testing.xlsx'
sheet = "SDG's"
# Raw string so '\.' is a regex escape rather than an invalid string escape.
# Matches codes such as "1.1" or "17.a".
regex = r'[+-]?[0-9]+\.-?[0-9a-zA-Z_]+'

wb = openpyxl.load_workbook(excelFile)
ws = wb[sheet]
# ws = wb.active

rows = ws.max_row

# Read target codes as text so a code like "1.10" can never be parsed as 1.1.
pdf_tables = pd.read_csv(pdf_tables_path, dtype={'Target': str})

unmatched = []

# max_row is the 1-based index of the last used row, so it must be included.
for i in range(1, rows + 1):
    target_cell = ws.cell(row=i, column=1).value
    # re.search needs text: empty and non-string cells (numbers, dates) are
    # skipped instead of raising TypeError.
    if not isinstance(target_cell, str):
        continue

    found = re.search(regex, target_cell)
    if found is None:
        continue
    target = found.group()

    disclosures = pdf_tables.loc[pdf_tables['Target'] == target, 'Disclosure']
    # Same policy as before (.item() only succeeds on exactly one match), but
    # the skipped targets are now reported instead of being silently dropped.
    if len(disclosures) != 1:
        unmatched.append(target)
        continue

    ws.cell(row=i, column=3, value=str(disclosures.item()))

if unmatched:
    print(len(unmatched), "target(s) without a unique CSV match were skipped:", unmatched)

wb.save("ESG-Frameworks/Outputs/testing_2.xlsx")
            


        

# wb.save("ESG-Frameworks/Outputs/testing_2.xlsx")

# ws.cell(row=5, column=1).value

## Mapping Collected Dataframe (GRI-SDG)

In [None]:
# Fill column 4 of the "GRI 2016" sheet with the SDG targets linked to the
# disclosure found in column 2 of the same row. The workbook is read from and
# saved back to the same file.
pdf_tables_path = 'ESG-Frameworks/Mapping-Standards/GRI-SDG.csv'
excelFile = 'ESG-Frameworks/Outputs/testing_2.xlsx'
sheet_2 = "GRI 2016"

wb = openpyxl.load_workbook(excelFile)
ws = wb[sheet_2]

rows = ws.max_row

pdf_tables = pd.read_csv(pdf_tables_path)

# Rows 2..max_row are candidates; row 1 is never written.
for row in range(2, rows + 1):
    # NOTE(review): as in the original, a row is only processed when column 2
    # of the row directly above it is non-empty -- confirm this is intended.
    if ws.cell(row=row - 1, column=2).value is None:
        continue

    target_cell = ws.cell(row=row, column=2).value
    if target_cell is None:
        continue

    # Select the 'Target' column of every matching CSV row directly. The old
    # `.squeeze()['Target'].values` raised AttributeError when exactly one row
    # matched (squeeze gave a Series, ['Target'] a plain str), and the bare
    # `except` hid it, so single-match disclosures were never written.
    linked_targets = pdf_tables.loc[pdf_tables['Disclosure'] == target_cell, 'Target'].dropna()
    if linked_targets.empty:
        continue

    ws.cell(row=row, column=4, value=', '.join(linked_targets.astype(str)))

wb.save("ESG-Frameworks/Outputs/testing_2.xlsx")
            


# Table Extraction and Mapping (GRI-COH4B Link)

In [150]:
# Extract the GRI-COH4B mapping table from the PDF, one table per page, stack
# the pages into a single frame and save it as CSV.
FIRST_TABLE_PAGE_INDEX = 12  # zero-based index into pdf.pages where extraction starts

frames = []

# The context manager closes the PDF handle even if extraction fails.
with pdfplumber.open('ESG-Frameworks/Mapping-Standards/GRI-COH4B/gri-coh4b.pdf') as pdf:
    for page in pdf.pages[FIRST_TABLE_PAGE_INDEX:]:
        try:
            table = page.extract_table()
        except Exception as exc:
            # Still best-effort per page, but report the failure instead of
            # hiding it behind a bare `except: pass`.
            print("Skipping page", page.page_number, "- table extraction failed:", exc)
            continue
        # extract_table() returns None when the page holds no table.
        if table is None:
            continue
        frames.append(pd.DataFrame(table))

if not frames:
    raise ValueError("No table could be extracted from gri-coh4b.pdf")

df = pd.concat(frames)
# Dropping exact duplicate rows leaves a single copy of any row repeated
# across pages (presumably the per-page header); the first remaining row is
# then promoted to column names.
df = df.drop_duplicates()
headers = df.iloc[0]
df = pd.DataFrame(df.values[1:], columns=headers)
# The first header cell labels the id column and the header named None takes
# over the 'A. COHBP & \ndefinition' label.
df = df.rename(columns={'A. COHBP & \ndefinition': 'id', None: 'A. COHBP & \ndefinition'})

# Append '.' to every non-empty id; empty or missing ids become NaN, exactly
# as the original str[:-1] + str[-1] + '.' expression produced.
df['id'] = df['id'].where(df['id'].str.len() > 0) + '.'
df.to_csv('ESG-Frameworks/Mapping-Standards/GRI-COH4B/GRI-COH4B.csv')

## Mapping Collected Dataframe (GRI-COH4B)

In [151]:
# Fill columns 3 and 4 of the "COH4B" sheet with the GRI standard and GRI
# disclosure mapped to the numbered id found in column 1 of the same row. The
# workbook is read from and saved back to the same file.
pdf_tables_path = 'ESG-Frameworks/Mapping-Standards/GRI-COH4B/GRI-COH4B.csv'
excelFile = 'ESG-Frameworks/Outputs/testing_2.xlsx'
sheet = "COH4B"
# Raw string so '\.' is a regex escape rather than an invalid string escape.
# Matches an optionally signed integer followed by a dot, e.g. "12.".
regex = r"[+-]?[0-9]+\."

wb = openpyxl.load_workbook(excelFile)
ws = wb[sheet]

rows = ws.max_row

pdf_tables = pd.read_csv(pdf_tables_path)
# Turn ids like "1.0" back into "1." (presumably read_csv parsed the "1."
# style ids as floats -- confirm against the CSV).
pdf_tables['id'] = pdf_tables['id'].astype(str).str.replace('.0', '.', regex=False)

unmatched = []

# max_row is the 1-based index of the last used row, so it must be included.
for i in range(1, rows + 1):
    target_cell = ws.cell(row=i, column=1).value
    # re.search needs text: empty and non-string cells (numbers, dates) are
    # skipped instead of raising TypeError.
    if not isinstance(target_cell, str):
        continue

    found = re.search(regex, target_cell)
    if found is None:
        continue
    target = found.group()

    matches = pdf_tables.loc[pdf_tables['id'] == target]
    # Same policy as before (.item() only succeeds on exactly one match), but
    # the skipped ids are now reported instead of being silently dropped.
    if len(matches) != 1:
        unmatched.append(target)
        continue

    ws.cell(row=i, column=3, value=str(matches['C. GRI \nStandards'].item()))
    ws.cell(row=i, column=4, value=str(matches['D. GRI disclosures'].item()))

if unmatched:
    print(len(unmatched), "id(s) without a unique CSV match were skipped:", unmatched)

wb.save("ESG-Frameworks/Outputs/testing_2.xlsx")
            

## Mapping Collected Dataframe (COH4B-GRI)