In [None]:
# Mount Google Drive at /content/gdrive so later cells can read and write files under that path.
from google.colab  import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# installing necessary libraries
# !pip install  pdfminer.six pdfplumber
!pip install langchain_google_genai
!pip install -U -qq "transformers>=4.35.0" accelerate langchain chromadb "autoawq>=0.1.6" unidic_lite


# 1. The first one perform layout analysis and data parsing
# 2. The second one perform table extraction

Collecting pdfminer.six
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfplumber
  Downloading pdfplumber-0.11.0-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.4/56.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.28.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.six-20231228 pdfplumber-0.11.0 pypdfium2-4.28.0
Collecting langchain_google_genai
  Downloading langchain_google_genai-0.0.9-py3-none-any.whl (17 kB)
Collecting langchain-core<0.2,>=0.1 (from langchain_google_genai)
  Downloading

In [None]:
# Text and table extraction

In [None]:
from pdfminer.high_level import extract_pages,extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure

import pdfplumber
import requests
import pandas as pd
from tqdm import tqdm
import re

In [None]:
def normalize_text(line_texts):
    """Join raw text lines into one normalized string.

    Each line is stripped. A line that is empty after stripping becomes a single
    newline. Otherwise runs of whitespace collapse to one space, and the line is
    followed by a space when its last character is a word character, digit,
    comma or hyphen (the sentence presumably continues on the next line), or by
    a newline in every other case.

    Args:
        line_texts: iterable of strings, one per raw line.

    Returns:
        The concatenation of all normalized lines.
    """
    parts = []
    for line_text in line_texts:
        line_text = line_text.strip()
        if not line_text:
            # Blank line: keep it as a paragraph break.
            parts.append('\n')
            continue
        # Raw strings avoid the invalid-escape warnings of '\s' / '\w' literals.
        line_text = re.sub(r'\s+', ' ', line_text)
        terminator = ' ' if re.search(r'[\w\d,\-]', line_text[-1]) else '\n'
        parts.append(line_text + terminator)
    # Single join instead of repeated string concatenation.
    return ''.join(parts)


def text_extraction(element):
    """Return the element's text, split on newlines and passed through normalize_text."""
    raw_lines = element.get_text().split('\n')
    return normalize_text(raw_lines)


def convert_table(table):
    """Render a table (iterable of rows of cells) as pipe-delimited text.

    Every row becomes ``|cell|cell|...|``. A cell that is None is written as the
    literal text 'None', and newlines inside a cell are replaced by spaces.
    Rows are separated by a newline, with no trailing newline.
    """
    def render_row(row):
        cells = (
            'None' if cell is None else cell.replace('\n', ' ')
            for cell in row
        )
        return '|' + '|'.join(cells) + '|'

    return '\n'.join(render_row(row) for row in table)

In [None]:
def process_page(page, extracted_page, text=True, table=True):
    """Extract the text and tables of one page as a single string.

    Args:
        page: page object providing ``find_tables()``, ``extract_tables()`` and
            ``bbox`` (a pdfplumber page in this notebook).
        extracted_page: layout object whose ``_objs`` are the page's layout
            elements (a pdfminer page in this notebook).
        text: when true, text containers outside a table are extracted.
        table: when true, rectangles trigger table extraction.

    Returns:
        The extracted pieces joined together, with runs of newlines collapsed
        to a single newline.
    """
    content = []

    # Table locations and table cell contents, as found by the `page` object.
    tables = page.find_tables()
    extracted_tables = page.extract_tables()

    table_num = 0
    first_table_element = True
    table_extraction_process = False

    # Visit elements from the largest y1 to the smallest
    # (presumably top of the page first — pdfminer's origin is assumed bottom-left).
    elements = [element for element in extracted_page._objs]
    elements.sort(key=lambda a: a.y1, reverse=True)

    lower_side = 0
    upper_side = 0
    for i, element in enumerate(elements):
        # Text is skipped while we are inside a table so cells are not emitted twice.
        if isinstance(element, LTTextContainer) and not table_extraction_process and text:
            content.append(text_extraction(element))

        if isinstance(element, LTRect) and table:
            if first_table_element and table_num < len(tables):
                # Vertical extent of the current table: the lower bound is derived
                # from the table bbox, the upper bound from the first rectangle met.
                lower_side = page.bbox[3] - tables[table_num].bbox[3]
                upper_side = element.y1

                # Use a dedicated local name: rebinding the `table` flag here would
                # turn it into a list, and an empty extracted table would then
                # disable all further extraction on this page.
                table_rows = extracted_tables[table_num]
                content.append(convert_table(table_rows))
                table_extraction_process = True
                first_table_element = False

            inside_table = element.y0 >= lower_side and element.y1 <= upper_side
            is_last_element = i + 1 >= len(elements)
            # The table ends when a rectangle outside its extent is followed by a
            # non-rectangle element.
            if (
                not inside_table
                and not is_last_element
                and not isinstance(elements[i + 1], LTRect)
            ):
                table_extraction_process = False
                first_table_element = True
                table_num += 1

    # Combine the pieces and collapse consecutive newlines.
    content = re.sub('\n+', '\n', ''.join(content))
    return content

In [None]:
def process_document(pdf_path, text=True, table=True, page_ids=None):
    """Extract the content of a PDF page by page.

    Args:
        pdf_path: path of the PDF file.
        text: forwarded to process_page; enables text extraction.
        table: forwarded to process_page; enables table extraction.
        page_ids: forwarded to ``extract_pages`` as ``page_numbers``.

    Returns:
        dict mapping each extracted page's ``pageid`` to its content string.
    """
    page2content = {}

    # The context manager closes the PDF handle even if a page fails to process;
    # the original left the file open.
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages

        # Layout objects for the requested pages.
        extracted_pages = extract_pages(pdf_path, page_numbers=page_ids)

        for extracted_page in tqdm(extracted_pages):
            page_id = extracted_page.pageid
            # NOTE(review): the page is looked up as pages[pageid - 1]; confirm that
            # pageid still equals the 1-based document page number when page_ids
            # restricts the extraction to a subset of pages.
            page2content[page_id] = process_page(
                pages[page_id - 1], extracted_page, text, table
            )

    return page2content

In [None]:
# Install pypdf (presumably the backend needed by the PyPDFLoader used in the next cell — confirm).
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.1.0-py3-none-any.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.1/286.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-4.1.0


In [None]:
# Disabled alternative: build the documents with the layout-aware
# process_document() pipeline instead of PyPDFLoader.
# from langchain_core.documents.base import Document
# import glob
# docs=[]
# pdfs_folder='/content/gdrive/MyDrive/unifydata/'
# for pdf_file in glob.glob(pdfs_folder + "/*.pdf"):
#   page2content = process_document(pdf_file, page_ids=None)
#   for _,value in page2content.items():
#     docs.append(Document(value))
#   page2content.clear()

import glob
from langchain.document_loaders import PyPDFLoader  # loads a given pdf

# Folder on the mounted Drive holding the PDFs and CSV files; later cells reuse it.
pdfs_folder = '/content/gdrive/MyDrive/unifydata/'

# Load every PDF of the folder into `docs`.
# glob returns files in a file-system dependent order, so sort the paths to make
# the document order reproducible between runs.
docs = []
for pdf_file in sorted(glob.glob(pdfs_folder + "/*.pdf")):
    loader = PyPDFLoader(pdf_file)
    docs.extend(loader.load())



In [None]:
len(docs)
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.5.1


In [None]:
# imports
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores import DocArrayInMemorySearch, Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import torch



# Disabled alternative: initialize embeddings with GoogleGenerativeAIEmbeddings
# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=<key loaded from Colab secrets>)
# SECURITY: the literal API key has been redacted from this comment. Never paste keys
# into the notebook; the key that was committed here should be revoked.
# Use the GPU for the embedding model when one is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Embedding model used by create_vector_store below.
embeddings = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-large",
        model_kwargs={"device": device}
    )


def create_vector_store(data, chunk_size=1000, chunk_overlap=0):
    """Split documents into chunks and index them in a Chroma vector store.

    Args:
        data: documents to index, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: chunk size passed to the text splitter (default 1000, the
            value that was previously hard-coded).
        chunk_overlap: overlap between consecutive chunks (default 0, the value
            that was previously hard-coded).

    Returns:
        The Chroma vector store built from the chunks, embedded with the
        module-level ``embeddings`` object.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    # Split the documents into chunks.
    split_text = text_splitter.split_documents(data)
    # Build the vector store with Chroma (not DocArrayInMemorySearch).
    vectorstore = Chroma.from_documents(split_text, embeddings)
    return vectorstore

vectorstore = create_vector_store(docs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/160k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

In [None]:
from google.colab import userdata
import google.generativeai as genai

# Read the API key from the Colab secret named 'Gemini' and configure the client with it.
GOOGLE_API_KEY = userdata.get('Gemini')
genai.configure(api_key=GOOGLE_API_KEY)

# Print the name of every listed model that supports the 'generateContent' method.
for model_info in genai.list_models():
    if 'generateContent' not in model_info.supported_generation_methods:
        continue
    print(model_info.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-pro
models/gemini-pro-vision


In [None]:
def process_string(s):
    """Convert a model answer into a number, falling back to 0.

    Steps:
      1. Strings longer than 20 characters are treated as "no answer" -> 0.
      2. Leading ASCII letters and all spaces are removed.
      3. If the remainder parses as a float, that value is returned.
      4. A trailing '%' is dropped and ',' is read as the decimal separator.
      5. Otherwise commas are removed and a trailing 'billion' / 'million' word
         is dropped. The number is NOT scaled by the word.
      6. As a last resort every character except digits and '.' is removed.

    Anything that still cannot be parsed yields 0 instead of raising
    ValueError, so one odd answer (e.g. 'N/A', '1.2.3') no longer aborts a
    whole batch of queries.

    Args:
        s: the answer text.

    Returns:
        The parsed float, or 0 when no number can be extracted.
    """
    # Long answers are sentences ("I do not have access..."), not numbers.
    if len(s) > 20:
        return 0

    # Remove alphabetic characters at the beginning of the string, then spaces.
    s = re.sub(r'^[a-zA-Z]*', '', s)
    s = s.replace(' ', '')

    try:
        return float(s)
    except ValueError:
        pass

    candidates = []
    if s.endswith('%'):
        text = s.replace(',', '.').rstrip('%')
        candidates.append(text)
    else:
        text = s.replace(',', '')
        for suffix in ('billion', 'million'):
            if text.endswith(suffix):
                # Slice the suffix off; str.rstrip(suffix) would strip a
                # character set, not the word.
                text = text[:-len(suffix)]
                candidates.append(text)
                break
    # Last resort: keep only digits and dots.
    candidates.append(re.sub(r'[^0-9.]', '', text))

    for candidate in candidates:
        try:
            return float(candidate)
        except ValueError:
            continue
    return 0

In [None]:
from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableMap
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain.chat_models import ChatOpenAI
# These names are used below but were never imported in the notebook.
from langchain_google_genai import ChatGoogleGenerativeAI, HarmBlockThreshold, HarmCategory



# initialize prompt template
prompt_template = """
    You are an AI assistant which analyse reports  of entrepreses to fetch in anuel Environmental, Social, and Governance metrics.
    Your goal is to retrieve the number representing the ESG of a given company, You should only retrieve a number.
    Make sure the retrieved number is for the company as specified in the question below:
    if you don't have information about the question context don't hallucinate.


    Context : \n {context} \n
    Question: \n {question} \n
    Answer:
    """

# initialize an output parser
output_parser = StrOutputParser()

def runnable_map(question):
    """Answer one question with retrieval-augmented Gemini and return a number.

    The question is used to retrieve documents from the module-level
    ``vectorstore``; the documents and the question fill ``prompt_template``;
    the Gemini answer is parsed to a string, printed, and converted with
    ``process_string``.

    Args:
        question: the question text.

    Returns:
        The value returned by ``process_string`` for the model's answer.
    """
    # SECURITY: the key comes from GOOGLE_API_KEY (read from Colab secrets in an
    # earlier cell) instead of a literal pasted into the notebook. The key that
    # was hard-coded here should be revoked.
    model = ChatGoogleGenerativeAI(
        model="gemini-1.0-pro-latest",
        temperature=0,
        google_api_key=GOOGLE_API_KEY,
    )

    # Do not block answers for these two harm categories.
    model.safety_settings = {
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    }

    # Retriever over the vector store, used to fetch context for the question.
    retriever = vectorstore.as_retriever()
    # Create the prompt from the prompt template.
    prompt = ChatPromptTemplate.from_template(prompt_template)
    # Map the input to the prompt variables, then prompt -> model -> string parser.
    chain = RunnableMap({
        "context": lambda x: retriever.get_relevant_documents(x["question"]),
        "question": lambda x: x["question"],
    }) | prompt | model | output_parser

    response = chain.invoke({"question": question})
    # Show the raw answer; the safety-settings debug print was removed because it
    # repeated the same dict once per query.
    print(response)
    return process_string(response)

runnable_map('What is the in B-BBEE Scorecard Level  by the Oceana1&2 group in 2022')

{<HarmCategory.HARM_CATEGORY_HATE_SPEECH: 8>: <HarmBlockThreshold.BLOCK_NONE: 4>, <HarmCategory.HARM_CATEGORY_HARASSMENT: 7>: <HarmBlockThreshold.BLOCK_NONE: 4>}
7


7.0

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the metric synonyms table from the data folder.
Metric_syno = pd.read_csv(pdfs_folder + 'ActivityMetricsSynonyms.csv')

# ID is "<AMKEY>_X_<Group>" for each row.
Metric_syno['ID'] = [
    f"{amkey}_X_{group}"
    for amkey, group in zip(Metric_syno['AMKEY'], Metric_syno['Group'])
]

# One natural-language query per row, built from both metric names and the upper-cased group.
Metric_syno['Query'] = [
    f"What is the  {client_metric} or {activity_metric}  by the {group.upper()} entrepreses in 2022?"
    for client_metric, activity_metric, group in zip(
        Metric_syno['ClientMetric'], Metric_syno['ActivityMetric'], Metric_syno['Group']
    )
]


In [None]:
# Run every query through the retrieval chain; each call returns a number.
result = Metric_syno['Query'].apply(runnable_map)
prediction = pd.DataFrame({'ID': Metric_syno["ID"], 'AMKEY': result})
# Write the predictions into pdfs_folder, which is where the submission cell reads
# 'AMKEY.csv' from. Writing to the working directory left that cell reading a
# missing or stale file.
prediction.to_csv(pdfs_folder + 'AMKEY.csv', index=False)
result.value_counts()

{<HarmCategory.HARM_CATEGORY_HATE_SPEECH: 8>: <HarmBlockThreshold.BLOCK_NONE: 4>, <HarmCategory.HARM_CATEGORY_HARASSMENT: 7>: <HarmBlockThreshold.BLOCK_NONE: 4>}
I do not have access to the information needed to answer this question.
{<HarmCategory.HARM_CATEGORY_HATE_SPEECH: 8>: <HarmBlockThreshold.BLOCK_NONE: 4>, <HarmCategory.HARM_CATEGORY_HARASSMENT: 7>: <HarmBlockThreshold.BLOCK_NONE: 4>}
The provided context does not mention SASOL entrepreses or any information about Nitrogen oxides (NOx) (kilotons) or Air emissions of the following pollutants: (2) NOx (excluding N2O) in 2022, so I cannot answer this question from the provided context.
{<HarmCategory.HARM_CATEGORY_HATE_SPEECH: 8>: <HarmBlockThreshold.BLOCK_NONE: 4>, <HarmCategory.HARM_CATEGORY_HARASSMENT: 7>: <HarmBlockThreshold.BLOCK_NONE: 4>}
The provided context does not mention SASOL entrepreses, so I cannot answer this question from the provided context.
{<HarmCategory.HARM_CATEGORY_HATE_SPEECH: 8>: <HarmBlockThreshold.BLOCK_

0.0        230
11.0         4
4.0          3
31.0         3
10.0         3
          ... 
128.0        1
226.0        1
15163.0      1
3.0          1
83.3         1
Name: Query, Length: 118, dtype: int64

In [None]:
# Load the submission template and the predictions, and give the value column the
# same name ('AMKEY') as in the predictions file.
sample = pd.read_csv(pdfs_folder + 'SampleSubmission.csv').rename(
    columns={'2022_Value': 'AMKEY'}
)
prediction = pd.read_csv(pdfs_folder + 'AMKEY.csv')

# First predicted value for each ID (the row-by-row loop this replaces also used the
# first matching row). Rows without an ID can never match and are dropped.
predicted_by_id = (
    prediction
    .dropna(subset=['ID'])
    .drop_duplicates(subset='ID', keep='first')
    .set_index('ID')['AMKEY']
)

# Overwrite the value only for IDs that have a prediction; other rows keep the
# template value. One vectorised lookup replaces a full scan of `prediction` per row.
has_prediction = sample['ID'].isin(predicted_by_id.index)
sample.loc[has_prediction, 'AMKEY'] = sample.loc[has_prediction, 'ID'].map(predicted_by_id)

# Missing values become 0 before the submission file is written.
sample = sample.fillna(0)
sample.to_csv(pdfs_folder + 'ESV.csv', index=False)

# Last expression so the preview is displayed.
sample.head()