# Statement Extractor
> Statement extraction through claim detection

This Notebook guides you through the *Statement Extractor* module.

In [1]:
from src.utils import pdf_parser
from src import segmenter, detect_claims, highlight_pdf
from config import SAMPLE_PDF_PATH, MODEL_NAME, MODEL_WEIGHTS_PATH

import pandas as pd

2021-12-30 16:48:24.852440: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-30 16:48:24.852459: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## 0. Load data

In [2]:
text = pdf_parser.pdf_to_text(SAMPLE_PDF_PATH)  # load all text from pdf

In [3]:
parse = pdf_parser.parse_text(text)  # parse pdf with specific document parser

## 1. Create segments

In [4]:
title = parse["title"]
date = parse["date"]
passages = parse["passages"]  # this document has natural passages

In [5]:
doc = []  # create empty doc

In [6]:
passage_id = 0

for passage in passages:
    segments = segmenter.split_segments(passage.get("text"), segment_len=4)
    
    for segment in segments:
        segment["passage_id"] = passage_id
        doc.append(segment)
    
    passage_id += 1

### 1.1 Load to table

In [7]:
doc_table = pd.DataFrame(doc)

In [8]:
doc_table.head()

Unnamed: 0,segment_id,sentence,passage_id
0,0,Guten Mittag liebe Journalistinnen und Journal...,0
1,0,Herzlich willkommen hier zu unserem virtuellen...,0
2,1,"Mein Name ist Bastian Zimmermann, und ich bin ...",0
3,1,"Ich freue mich, heute auch unsere drei Experte...",0
4,2,"Man hört immer wieder, wie wichtig Quantentech...",0


### 1.2 Add Metadata

In [9]:
doc_table["title"] = title
doc_table["date"] = date

In [10]:
doc_table["speaker"] = None
doc_table["timestamp"] = None

for idx, passage in enumerate(passages):
    speaker = passage.get("speaker")
    timestamp = passage.get("timestamp")
    
    doc_table['speaker'].loc[doc_table['passage_id'] == idx] = speaker
    doc_table['timestamp'].loc[doc_table['passage_id'] == idx] = timestamp


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)



In [11]:
doc_table.head()

Unnamed: 0,segment_id,sentence,passage_id,title,date,speaker,timestamp
0,0,Guten Mittag liebe Journalistinnen und Journal...,0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00]
1,0,Herzlich willkommen hier zu unserem virtuellen...,0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00]
2,1,"Mein Name ist Bastian Zimmermann, und ich bin ...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00]
3,1,"Ich freue mich, heute auch unsere drei Experte...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00]
4,2,"Man hört immer wieder, wie wichtig Quantentech...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00]


## 2. (Detect main concept)

## 3. Detect claim sentences

In [12]:
detector = detect_claims.claim_detector(MODEL_NAME, MODEL_WEIGHTS_PATH)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.
2021-12-30 16:50:48.951937: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-12-30 16:50:48.951958: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-30 16:50:48.951971: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (4c625dcec243): /proc/driver/nvidia/version does not exist
2021-12-30 16:50:48.952101: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (o

In [13]:
doc_table["claim"] = doc_table.apply(lambda x: detector.is_claim(x["sentence"]), axis=1)

In [14]:
doc_table.head()

Unnamed: 0,segment_id,sentence,passage_id,title,date,speaker,timestamp,claim
0,0,Guten Mittag liebe Journalistinnen und Journal...,0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00],False
1,0,Herzlich willkommen hier zu unserem virtuellen...,0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00],False
2,1,"Mein Name ist Bastian Zimmermann, und ich bin ...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00],False
3,1,"Ich freue mich, heute auch unsere drei Experte...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00],False
4,2,"Man hört immer wieder, wie wichtig Quantentech...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00],False


In [15]:
doc_table.to_csv("cache.csv", index=False)

In [16]:
doc_table = pd.read_csv("cache.csv")

## 4. Return statements

### 4.1 Highlight claim sentences

In [17]:
relevant_claims = doc_table[(doc_table["claim"]==True)&(doc_table["speaker"]!="Moderator")]

In [18]:
claim_sentences = relevant_claims["sentence"].to_list()

In [19]:
highlight_pdf.highlight_text(claim_sentences, SAMPLE_PDF_PATH, "claims.pdf", color=["green", "yellow"])

Error, sentence not found
Error, sentence not found


### 4.2 Highlight statements

In [20]:
c = "yellow"
highlight_pdf.highlight_text([], SAMPLE_PDF_PATH, "statements.pdf")

passage_id_old = None
segment_id_old = None

for row in relevant_claims.iterrows():
    passage_id = row[1]["passage_id"]
    segment_id = row[1]["segment_id"]

    if passage_id_old==passage_id and segment_id_old == segment_id:
        continue
    else:
        sentences = doc_table[(doc_table["passage_id"]==passage_id)&(doc_table["segment_id"]==segment_id)]["sentence"].to_list()
        highlight_pdf.highlight_text(sentences, "statements.pdf", "statements.pdf", color=c)
        passage_id_old = passage_id
        segment_id_old = segment_id

        if c == "yellow":
            c = "red"
        else:
            c = "yellow"
        

Error, sentence not found
Error, sentence not found
Error, sentence not found


### 4.3 Highlight statements with claims

In [24]:
c = "yellow"
highlight_pdf.highlight_text([], SAMPLE_PDF_PATH, "statements_claims.pdf")

passage_id_old = None
segment_id_old = None

for row in relevant_claims.iterrows():
    passage_id = row[1]["passage_id"]
    segment_id = row[1]["segment_id"]

    if passage_id_old==passage_id and segment_id_old == segment_id:
        continue
    else:
        sentences = doc_table[(doc_table["passage_id"]==passage_id)&(doc_table["segment_id"]==segment_id)]["sentence"].to_list()
        highlight_pdf.highlight_text(sentences, "statements_claims.pdf", "statements_claims.pdf", color=c)
        passage_id_old = passage_id
        segment_id_old = segment_id

        if c == "yellow":
            c = "red"
        else:
            c = "yellow"
        

Error, sentence not found
Error, sentence not found
Error, sentence not found


In [25]:
highlight_pdf.highlight_text(claim_sentences, "statements_claims.pdf", "statements_claims.pdf", color="green")

Error, sentence not found
Error, sentence not found
