# Statement Extractor
> Statement extraction through claim detection

This Notebook guides you through the *Statement Extractor* module.

In [5]:
from src.utils import pdf_parser
from src import segmenter, detect_claims
from config import SAMPLE_PDF_PATH, MODEL_NAME, MODEL_WEIGHTS_PATH

import pandas as pd

## 0. Load data

In [6]:
text = pdf_parser.pdf_to_text(SAMPLE_PDF_PATH)  # load all text from pdf

In [7]:
parse = pdf_parser.parse_text(text)  # parse pdf with specific document parser

## 1. Create segments

In [8]:
title = parse["title"]
date = parse["date"]
passages = parse["passages"]  # this document has natural passages

In [9]:
doc = []  # create empty doc

In [10]:
passage_id = 0

for passage in passages:
    segments = segmenter.split_segments(passage.get("text"))
    
    for segment in segments:
        segment["passage_id"] = passage_id
        doc.append(segment)
    
    passage_id += 1

### 1.1 Load to table

In [11]:
doc_table = pd.DataFrame(doc)

In [12]:
doc_table.head()

Unnamed: 0,segment_id,sentence,passage_id
0,0,Guten Mittag liebe Journalistinnen und Journal...,0
1,1,Herzlich willkommen hier zu unserem virtuellen...,0
2,2,"Mein Name ist Bastian Zimmermann, und ich bin ...",0
3,3,"Ich freue mich, heute auch unsere drei Experte...",0
4,0,"Man hört immer wieder, wie wichtig Quantentech...",0


### 1.2 Add Metadata

In [13]:
doc_table["title"] = title
doc_table["date"] = date

In [14]:
doc_table["speaker"] = None
doc_table["timestamp"] = None

for idx, passage in enumerate(passages):
    speaker = passage.get("speaker")
    timestamp = passage.get("timestamp")
    
    doc_table['speaker'].loc[doc_table['passage_id'] == idx] = speaker
    doc_table['timestamp'].loc[doc_table['passage_id'] == idx] = timestamp


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)



In [15]:
doc_table.head()

Unnamed: 0,segment_id,sentence,passage_id,title,date,speaker,timestamp
0,0,Guten Mittag liebe Journalistinnen und Journal...,0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00]
1,1,Herzlich willkommen hier zu unserem virtuellen...,0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00]
2,2,"Mein Name ist Bastian Zimmermann, und ich bin ...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00]
3,3,"Ich freue mich, heute auch unsere drei Experte...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00]
4,0,"Man hört immer wieder, wie wichtig Quantentech...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00]


## 2. (Detect main concept)

## 3. Detect claim sentences

In [16]:
detector = detect_claims.claim_detector(MODEL_NAME, MODEL_WEIGHTS_PATH)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.
2021-12-28 13:04:35.993834: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-12-28 13:04:35.993889: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-28 13:04:35.993919: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (4aeed0470a36): /proc/driver/nvidia/version does not exist
2021-12-28 13:04:35.994686: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (o

In [17]:
doc_table["claim"] = doc_table.apply(lambda x: detector.is_claim(x["sentence"]), axis=1)

In [18]:
doc_table.head()

Unnamed: 0,segment_id,sentence,passage_id,title,date,speaker,timestamp,claim
0,0,Guten Mittag liebe Journalistinnen und Journal...,0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00],False
1,1,Herzlich willkommen hier zu unserem virtuellen...,0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00],False
2,2,"Mein Name ist Bastian Zimmermann, und ich bin ...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00],False
3,3,"Ich freue mich, heute auch unsere drei Experte...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00],False
4,0,"Man hört immer wieder, wie wichtig Quantentech...",0,„Steht der Quantenrechner vor der Tür? Forschu...,12.04.2021,Moderator,[00:00:00],False


## 4. Return statements

In [35]:
statement_ids = doc_table[(doc_table["claim"]==True) & (doc_table["speaker"]!="Moderator")]["segment_id"].unique()

In [30]:
statement_ids[0]

1

In [None]:
doc_table

In [42]:
claim_sentences = doc_table[(doc_table["claim"]==True) & (doc_table["speaker"]!="Moderator")]

In [1]:
import fitz

In [66]:
pdf = fitz.open(SAMPLE_PDF_PATH)

In [67]:
for claim_sentence in claim_sentences["sentence"].to_list():
    text_instances = []
    for page in pdf:
        instances = page.search_for(claim_sentence)
        text_instances+=instances
                
        for inst in instances:
            highlight = page.add_highlight_annot(inst)
            highlight.update()
    if not text_instances:
        print("Error, sentence not found")

Error, sentence not found
Error, sentence not found


In [68]:
pdf.save("output.pdf", garbage=4, deflate=True, clean=True)