# Create the SMC claim dataset
This notebook walks through the dataset creation process. The press briefings are scraped from the SMC website, parsed into structured text, split into sentences, enriched with topics, and imported into an SQLite database.

**Table of contents:**
1. Scrape data
2. Create database
3. Parse data
4. Split sentences
5. Sentence Wikification
6. Topic detection

In [1]:
import os
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk import sent_tokenize

from config import BASE_DIR, BASE_URL, DB_PATH, METADATA_PATH
from src import create_db, load_data, parse_pdf, wikify

### 1. Scrape data

In [4]:
def _absolute_pdf_url(pdf_url):
    """Return an absolute PDF URL, or None when no usable URL is available.

    Relative links are prefixed with BASE_URL. Missing values and links that
    resolve to BASE_URL itself (i.e. an empty link) are mapped to None.
    """
    if not isinstance(pdf_url, str):
        return None
    absolute = pdf_url if pdf_url.startswith("http") else BASE_URL + pdf_url
    return None if absolute == BASE_URL else absolute


# Scrape only while the local PDF folder is still empty, so that re-running the
# notebook does not hit the website again.
if not os.listdir(os.path.join(BASE_DIR, "pdf")):
    print("--- Scraping Press Briefing Data ---")

    pressbriefing_urls = load_data.extrect_all_pressbriefing_links()  # load all urls

    # Save PDFs
    results = []
    for url in pressbriefing_urls:
        responds = requests.get(url)
        bs = BeautifulSoup(responds.content, "html.parser")

        # Each step is best-effort: a failure is reported and recorded as None
        # instead of aborting the whole scrape. `except Exception` (rather than
        # a bare `except`) keeps Ctrl-C / kernel interrupts working.
        try:
            introduction = load_data.extrect_introduction(bs)
        except Exception:
            introduction = None
            print("ERROR: Could not load introduction:", url)

        try:
            pdf_url = load_data.extrect_pdf_url(bs)
        except Exception:
            # Reset pdf_url itself; otherwise the value of the previous
            # briefing would be reused for this one.
            pdf_url = None
            print("ERROR: Could not load pdf path:", url)

        pdf_path = None
        if pdf_url is not None:  # nothing to download without a PDF link
            try:
                pdf_path = load_data.load_pdf_from_url(pdf_url, os.path.join("data", "SMC_dataset", "pdf"))
            except Exception:
                print("ERROR: Could not load pdf:", url)

        results.append({
            "introduction": introduction,
            "pdf_path": pdf_path,
            "pdf_url": pdf_url,
            "url": url,
        })
        time.sleep(random.randint(1, 7))  # random sleep between requests

    # Save metadata.csv
    # Explicit columns keep the frame well-formed even if nothing was scraped.
    df = pd.DataFrame(results, columns=["introduction", "pdf_path", "pdf_url", "url"])
    df["pdf_url"] = df["pdf_url"].map(_absolute_pdf_url)
    df.to_csv(METADATA_PATH, index=False)

### 2. Create database

In [5]:
# Create the database file and its tables only on the first run; an existing
# database is left untouched.
if not os.path.exists(DB_PATH):
    db = create_db.create_connection(DB_PATH)
    try:
        create_db.create_tables(db)
        db.commit()
    finally:
        # Release the handle; the later cells open their own connections.
        db.close()


### 3. Parse data

In [6]:
# Load the scraped metadata and keep only complete rows, i.e. rows without a
# missing value in any column; the index is renumbered afterwards.
metadata = (
    pd.read_csv(METADATA_PATH)
    .dropna()
    .reset_index(drop=True)
)

In [7]:
# File names of transcripts that the parsing loop skips: each row's pdf_path,
# with its directory prefix removed, is compared against this list.
# NOTE(review): the reason these briefings are excluded is not recorded in this
# notebook — confirm and document it.
exclude = [
    "Transkript_Versorgungssituation_Krankenhaeuser_SMC-virtuelles-Press-Briefing_20202_03_11.pdf",
    "Transkript_vPB_Wasserstoffstrategie.pdf",
    "Trankript_Duerre-Landwirtschaft-Waelder_SMC-Press-Briefing_2020-05-05.pdf",
    "Transkript_Corona-und-Klima_SMC-Press-Briefing_2020-04-16.pdf",
    "Transkript_virPB_Kinder_COVID.pdf",
    "Transkript_Heinsberg-Studie_Ergebnisse_SMC-Press-Briefing_2020-05-04.pdf",
    "Transkript_Atomenergie-und-Klimawandel_SMC-PressBriefing_2020-02-26.pdf",
    "Transkript_SMC_Press_Briefing_Machine_Learning_Medizin_180518.pdf",
    "Transkript_gesundeStaedte_vPressBriefing_30112020.pdf",
    "Transkript_CO2-Emissionen-im-Corona-Jahr_SMC-Press-Briefing_2020-12-10.pdf",
    "Transkript_vPB_Mutationen_SARSCoV2.pdf",
    "Transkript_Die-_neue-GAP_SMC-Press-Briefing_20210316.pdf",
    "Transkript_Modellierungen_COVID_SMC_virutelles_Press-Briefing_07-05-2020.pdf",
]

In [8]:
# Parse every downloaded transcript and store the press briefing, its speakers
# and its segments in the database.
# NOTE(review): re-running this cell inserts every briefing again; run it once
# per freshly created database.
for _, row in metadata.iterrows():
    # add metadata
    pdf_path = row["pdf_path"]
    pdf_url = row["pdf_url"]
    introduction_text = row["introduction"]

    # exclude some pdfs; comparing the bare file name avoids depending on a
    # hard-coded directory prefix
    if os.path.basename(pdf_path) in exclude:
        continue

    # read pdf file
    head, body = parse_pdf.read_pdf("../" + pdf_path)  # parse pdf

    # parse head
    head_metadata = parse_pdf.parse_head(head)
    title = head_metadata.get("title")
    date = head_metadata.get("date")
    video_url = head_metadata.get("video_url")

    # Briefings without a title are skipped, so check before doing more work.
    if not title:
        continue

    fulltext = " ".join(body)
    fulltext_clean = parse_pdf.sanetize(fulltext)

    # parse body
    segments = parse_pdf.parse_body(body)
    # Unique, non-empty speakers in order of first appearance. Unlike a set,
    # this order is stable, so Person rows are created in a reproducible order.
    persons = list(dict.fromkeys(
        part.get("speaker") for part in segments if part.get("speaker")
    ))

    # insert parsed press briefing
    connection = create_db.create_connection(DB_PATH)
    try:
        cur = connection.cursor()

        # insert press briefing
        command = ("""
        INSERT INTO Press_Briefing(
            pdf_path, pdf_url, introduction_text, fulltext, fulltext_clean, title, date, video_url)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """)
        cur.execute(command, (pdf_path, pdf_url, introduction_text, fulltext, fulltext_clean, title, date, video_url))
        pb_ID = cur.lastrowid  # id of the briefing that was just inserted

        # insert Person: reuse an existing row for a known name, otherwise add one
        person_ids = {}
        for person in persons:
            known = cur.execute("SELECT person_ID FROM Person WHERE name=?", (person,)).fetchone()
            if known:
                person_ID = known[0]
            else:
                cur.execute("INSERT INTO Person (name) VALUES (?)", (person,))  # add person
                person_ID = cur.lastrowid
            cur.execute("INSERT INTO is_guest (pb_ID, person_ID) VALUES (?, ?)", (pb_ID, person_ID))
            person_ids[person] = person_ID

        # insert segments
        for segment in segments:
            if segment.get("text"):  # db constraint: segments need a text
                cur.execute(
                    "INSERT INTO Segment (pb_ID, speaker, text, timecode) VALUES (?, ?, ?, ?)",
                    (pb_ID, person_ids.get(segment.get("speaker")), segment.get("text"), segment.get("timecode")),
                )

        # One commit per briefing: a failure part-way through leaves no
        # half-inserted briefing behind.
        connection.commit()
    finally:
        # Always release the connection, even if an insert failed.
        connection.close()


### 4. Split sentences

In [9]:
# Split the text of every segment into sentences and store them one row each.
connection = create_db.create_connection(DB_PATH)
try:
    cur = connection.cursor()
    # NOTE(review): rows are indexed by position below — presumably
    # (segment_ID, pb_ID, text, ...); confirm against the Segment table
    # definition in create_db.
    segments = cur.execute("SELECT * FROM Segment").fetchall()

    for segment in segments:
        # NOTE(review): sent_tokenize uses its default language model here; the
        # transcripts appear to be German — consider language="german".
        sentences = sent_tokenize(segment[2])
        cur.executemany(
            "INSERT INTO Sentence (segment_ID, pb_ID, sentence) VALUES (?, ?, ?)",
            [(segment[0], segment[1], sentence) for sentence in sentences],
        )

    connection.commit()
finally:
    # Close the connection even if tokenizing or inserting failed.
    connection.close()

### 5. Sentence Wikification

In [7]:
# Sentence wikification uses the TagMe service (service="tagme" below).
token = os.environ.get("TAGME_TOKEN")

# "wikifyed.txt" records the ids of sentences that are already wikified, so an
# interrupted run can be resumed. The file may not exist yet on the first run,
# and it contains blank lines because every id is written as "\n<id>".
done_ids = set()
if os.path.exists("wikifyed.txt"):
    with open("wikifyed.txt", "r") as save_file:
        for line in save_file:
            line = line.strip()
            if line:
                done_ids.add(int(line))

connection = create_db.create_connection(DB_PATH)  # DB connection
try:
    cur = connection.cursor()
    sentences = cur.execute("SELECT sentence, sentence_ID FROM Sentence").fetchall()

    with open("wikifyed.txt", "a") as save_file:
        for sentence in sentences:
            if sentence[1] in done_ids:
                continue

            concepts = wikify.wifify(sentence[0], service="tagme", token=token)  # wikify
            if concepts == "Error":
                # Stop at the first failed request; the next run resumes here.
                print("ERROR: Wikification failed, stopping at sentence_ID:", sentence[1])
                break

            if concepts:
                cur.executemany(
                    "INSERT INTO Sentence_Wikification (sentence_ID, term, wiki_num, confidence, url) VALUES (?, ?, ?, ?, ?)",
                    [
                        (sentence[1], concept.get("title"), concept.get("id"), concept.get("link_probability"), concept.get("uri"))
                        for concept in concepts
                    ],
                )
                connection.commit()

            # save ids; flush so the file stays in step with the committed rows
            save_file.write("\n" + str(sentence[1]))
            save_file.flush()
finally:
    # Close the connection on success, on the early stop, and on errors alike.
    connection.close()

### 6. Topic detection

In [23]:
# API token for the topic detection requests, read from the environment
# (None when DANDELION_TOKEN is not set).
token = os.getenv("DANDELION_TOKEN")

In [25]:
# Title topic
connection = create_db.create_connection(DB_PATH)  # DB connection
cur = connection.cursor()

title_all = cur.execute("SELECT pb_ID, title FROM Press_Briefing").fetchall()  # get all titles

for title in title_all:
    concepts = wikify.wifify(title[1], service="dandaleon", token=token)  # wikify
    if concepts:
        for concept in concepts:
            cur.execute("INSERT INTO pb_Wikification_title (pb_ID, term, wiki_num, confidence, url) VALUES (?, ?, ?, ?, ?)", 
            (title[0], concept.get("title"), concept.get("id"), concept.get("confidence"), concept.get("uri")))
        connection.commit()
connection.close()

In [92]:
# Introduction topic
connection = create_db.create_connection(DB_PATH)  # DB connection
cur = connection.cursor()

introductions = cur.execute("SELECT pb_ID, introduction_text FROM Press_Briefing").fetchall()

for introduction in introductions:
    text = introduction[1][:2000].replace("https://www.sciencemediacenter.de/alle-angebote", "").replace("\xa0", "").replace("\n", "")
    concepts = wikify.detect_main_concept(text, num_entetys=5, token=token)  # wikify
    if concepts:
        for concept in concepts:
            cur.execute("INSERT INTO pb_Wikification_intro (pb_ID, wiki_num, confidence, url) VALUES (?, ?, ?, ?)",
            (introduction[0], concept.get("id"), concept.get("confidence"), concept.get("uri")))
        connection.commit()   
    else:
        print(introduction[0])
connection.close()