# Data Preparation for Data Labeling

Split the review texts (`patient_review`, `headline`, `pro`, `contra`) into sentences so that each data point is a single sentence.


In [1]:
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.data import load

In [4]:
# Load the review table from the CSV export
reviews_df = pd.read_csv("data/reviews_with_comments_count.csv", sep=",")

# Report the size of the loaded table (r = rows, c = columns)
r = len(reviews_df.index)
c = len(reviews_df.columns)
print(f"The review data has {r} rows and {c} columns")

The review data has 486 rows and 26 columns


In [5]:
def add_space_after_punctuation(text):
    """Insert a missing space between punctuation and the letter that follows it.

    A run of one or more of the characters ``! ? . , ;`` that is directly
    followed by an ASCII letter (a-z, A-Z) gets a single space inserted after
    the run, e.g. ``"Gut.Sehr"`` becomes ``"Gut. Sehr"``. Digits, umlauts and
    other non-ASCII letters after the punctuation are left untouched.

    Missing values (anything ``pd.isna`` reports as NA) are returned unchanged.
    """
    if pd.isna(text):
        return text
    # Group 1 is the punctuation run, group 2 the letter glued to it.
    return re.sub(r'([!?.,;]+)([a-zA-Z])', r'\1 \2', text)

In [None]:
# Dictionary of abbreviations and their replacements.
# Keys are the literal abbreviations as they occur in the review texts
# (including observed typos such as 'ect.'); values are the text that replaces
# them. Most values spell the abbreviation out; a few only drop the trailing
# dot (e.g. 'bds.' -> 'bds') so that the dot is not taken for a sentence end.
# Insertion order is the order in which the replacements are applied, so
# multi-part forms such as 'u. a.' must stay ahead of their prefix 'u.'.
abbreviations = {
    '(bspw.': '(beispielsweise',
    '(zB.': '(zum Beispiel',
    '4Std .': '4 Std',
    'Abt.': 'Abteilung',
    'Apr.': 'April',
    'Aug.': 'August',
    'AugenaerztinDr.': 'Augenaerztin Dr.',
    'bds.': 'bds',
    'besch.': 'besch',
    'Bsp.': 'Beispiel',
    'bspw.': 'beispielsweise',
    'bzgl.': 'bezüglich',
    'bzw.': 'beziehungsweise',
    'ca.': 'circa',
    'Ca.': 'Circa',
    'Chir.': 'Chirurgisch',
    'chirurg.': 'chirurgisch',
    'dgl.': 'dergleichen',
    'd. h.': 'das heißt',
    'd.h.': 'das heißt',
    'Dez.': 'Dezember',
    'Di.': 'Dienstag',
    'diensth.': 'diensthabend',
    'div.': 'diverse',
    'Do.': 'Donnerstag',
    'dtl.': 'dtl',
    'elektr.': 'elektrisch',
    'Entspr.': 'Entsprechend',
    'ect.': 'etc',
    'etc.': 'etc',
    'evtl.': 'eventuell',
    'Fa. 2Med.': 'Fa 2Med',
    'Feb.': 'Februar',
    'Fl.': 'Flasche',
    'Fr.': 'Freitag',
    'frdl.': 'freundlich',
    'freundl.': 'freundliche',
    'ggf.': 'gegebenenfalls',
    'i. B. a.': 'in Bezug auf',
    'i. o.': 'in Ordnung',
    'i. O.': 'in Ordnung',
    'i.B.a.': 'in Bezug auf',
    'i.O.': 'in Ordnung',
    'incl.': 'inclusive',
    'inkl.': 'inklusive',
    'Jan.': 'Januar',
    'Jul.': 'Juli',
    'Jun.': 'Juni',
    'kl.': 'kleine',
    'KollegenDr.': 'Kollegen Dr.',
    'künstl.': 'künstliche',
    'li.': 'linken',
    'Lj.': 'Lj',
    'Lpz.': 'Leipzig',
    'lt.': 'laut',
    'Mai': 'Mai',
    'März': 'März',
    'med.': 'medizinisch',
    'med./pfleg.': 'medizinisch/pflege',
    'mediz.': 'medizinisch',
    'Mi.': 'Mittwoch',
    'min.': 'minute',
    'Min.': 'Minute',
    'Mo.': 'Montag',
    'mögl.': 'möglich',
    'MS.).': 'MS).',
    'Nov.': 'November',
    'o. a.': 'oder anderes',
    'o. g.': 'oben genannte',
    'o. k.': 'okay',
    'o.ä.': 'oder ähnliches',
    'o.a.': 'oder anderes',
    'o.g.': 'oben genannte',
    'o.k.': 'okay',
    'oä.': 'oder ähnliches',
    'oa.': 'oder anderes',
    'Okt.': 'Oktober',
    'op.': 'OP',
    'Op.': 'OP',
    'org.': 'organisatorisch',
    'Pat.': 'Patient',
    'per.': 'per',
    'pers.': 'persönlich',
    'pfl.': 'pfl', # Review 105, row 898
    'Physioth.': 'Physiotherapeut',  # was misspelled 'Physiothearpeut'
    'Prof .': 'Prof.',
    'psych.': 'psychisch',
    '"psych.': '"psychisch',
    're.': 'rechts',
    'Sa.': 'Samstag',
    'Schließl.': 'Schließlich',
    'Schmerztbl.': 'Schmerztablette',
    'Sep.': 'September',
    'So.': 'Sonntag',
    'sog.': 'sogenannte',
    'spez.': 'speziell',
    'SSW.': 'Schwangerschaftswoche',
    'Stat.': 'Station',
    'Std.': 'Stunde',
    'tägl.': 'täglich',
    'teilw.': 'teilweise',
    'Tel.': 'Telefon',
    'Tlw.': 'teilweise',
    'u. a.': 'unter anderem',
    'u. s. w.': 'und so weiter',
    'u.a.': 'unter anderem',
    'u.s.w.': 'und so weiter',
    'ü.': 'über',
    'u.': 'und',
    'und.': 'und',
    'usw.': 'und so weiter',
    'versch.': 'verschieden',
    'wg.': 'wegen',
    'wiss.': 'wissenschaftlich',
    'wzb.': 'wie zum Beispiel',
    'z. B.': 'zum Beispiel',
    'z. b':'zum Beispiel',
    'z. T.': 'zum Teil',
    'z.B.': 'zum Beispiel',
    'z.b':'zum Beispiel',
    'z.T.': 'zum Teil',
    'zb.': 'zum Beispiel',
    'zB.': 'zum Beispiel',
    'Zi.': 'Zimmer',
    'zw.': 'zwischen',
}

# Runs of repeated sentence punctuation, keyed by the single mark each run is
# collapsed to ("!!!" -> "!", "??" -> "?", "..." -> ".")
special_cases = {
    mark: re.compile(run_pattern)
    for mark, run_pattern in (
        ('!', r'!+'),   # one or more exclamation marks
        ('?', r'\?+'),  # one or more question marks
        ('.', r'\.+'),  # one or more dots
    )
}

# One compiled pattern per abbreviation key. The look-arounds require the
# abbreviation to be delimited by whitespace or the start/end of the text on
# both sides, so it is never matched inside a longer token.
abbreviation_patterns = {
    abbr: re.compile(rf'(?<!\S){re.escape(abbr)}(?!\S)')
    for abbr in abbreviations
}

# Function to replace abbreviations and punctuation sequences in text
def replace_abbreviations_and_punctuation(text, abbreviation_patterns, special_cases, replacements=None):
    """Expand abbreviations and collapse repeated punctuation in ``text``.

    Parameters
    ----------
    text : str or missing value
        Text to normalise. Anything ``pd.isna`` reports as NA is returned
        unchanged.
    abbreviation_patterns : dict
        Maps an abbreviation key to the compiled pattern that finds it. The
        patterns are applied in the dict's iteration order.
    special_cases : dict
        Maps a punctuation mark to the compiled pattern matching a run of it;
        every match is replaced by the single mark.
    replacements : dict, optional
        Maps each key of ``abbreviation_patterns`` to its replacement text.
        Defaults to the module-level ``abbreviations`` table, which is what the
        function always used before this parameter existed.

    Returns
    -------
    The normalised text, or the original missing value.

    Raises
    ------
    KeyError
        If a key of ``abbreviation_patterns`` has no entry in ``replacements``.
    """
    if pd.isna(text):
        return text

    if replacements is None:
        replacements = abbreviations

    # Replace abbreviations. A callable is passed to ``sub`` so the replacement
    # is inserted literally instead of being parsed as a regex template
    # (a backslash in a replacement would otherwise be treated as an escape).
    for abbr, pattern in abbreviation_patterns.items():
        expansion = replacements[abbr]
        text = pattern.sub(lambda _match: expansion, text)

    # Replace punctuation sequences with a single mark
    for punct, pattern in special_cases.items():
        text = pattern.sub(lambda _match: punct, text)

    return text

In [None]:
# Text columns to normalise
columns_to_process = ['patient_review', 'headline', 'pro', 'contra']

# Normalise each column that exists in the frame: first repair missing spaces
# after punctuation, then expand abbreviations and collapse punctuation runs
for column in columns_to_process:
    if column not in reviews_df.columns:
        continue
    reviews_df[column] = (
        reviews_df[column]
        .apply(add_space_after_punctuation)
        .apply(lambda x: replace_abbreviations_and_punctuation(x, abbreviation_patterns, special_cases))
    )

'\noutput_path = "updated_dataset.csv"\nreviews_df.to_csv(output_path, index=False)\nprint(f"Updated dataset saved to {output_path}")\n'

In [8]:
# German months for exception handling (dates such as "2. Dezember")
german_months = [
    "Januar", "Februar", "März", "April", "Mai", "Juni",
    "Juli", "August", "September", "Oktober", "November", "Dezember"
]

# Common abbreviations and titles whose dots must not end a sentence.
# Entries without a dot (e.g. "OA", "Herr") pass through the rule unchanged.
common_abbreviations = [
    "Prof.", "Dr.", "Med.", "St.", "ST.", "OA", "Herr", "Herrn", "Frau", "Z.st.", "GG."
]

# Preprocessing function to normalize exceptions
def preprocess_exceptions(text):
    """Mask dots and parentheses that must not be treated as sentence ends.

    Dots that belong to numbers, dates, titles or single-letter abbreviations
    are replaced by the placeholder ``#DOT#``; a closing parenthesis directly
    after ``!`` or ``?`` is replaced by ``#PAREN#``. ``postprocess_exceptions``
    turns the placeholders back into the original characters.

    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if not text:
        return text

    # 1. Dates with German months (e.g., "2. Dezember"). This has to run before
    #    the generic number rule: that rule masks the dot after every number,
    #    so a date rule placed after it can never match. Whitespace between
    #    the dot and the month is normalised to a single space.
    month_pattern = r"(\d+)\.\s*(" + "|".join(german_months) + r")"
    text = re.sub(month_pattern, r"\1#DOT# \2", text)

    # 2. Any remaining number followed by a dot (e.g., 1., 2., 13., 19.30)
    text = re.sub(r"(\d+)\.", r"\1#DOT#", text)

    # 3. Titles and abbreviations (e.g., "Prof.", "Dr."): mask every dot
    #    inside the matched entry
    abbrev_pattern = r"\b(" + "|".join(map(re.escape, common_abbreviations)) + r")"
    text = re.sub(abbrev_pattern, lambda m: m.group(1).replace(".", "#DOT#"), text)

    # 4. Single letters followed by a dot (e.g., "T."). ``[^\W\d_]`` matches
    #    any Unicode letter, so German forms such as "o. ä." are covered too;
    #    an ASCII-only class would leave the dot after "ä" as a sentence end.
    text = re.sub(r"\b([^\W\d_])\.", r"\1#DOT#", text)

    # 5. Punctuation marks followed by closing parenthesis (e.g., !), ?))
    text = re.sub(r"([!?])\)", r"\1#PAREN#", text)

    return text

# Postprocessing function to restore normalized exceptions
def postprocess_exceptions(text):
    """Turn the ``#DOT#`` and ``#PAREN#`` placeholders back into ``.`` and ``)``.

    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if text:
        return text.replace("#DOT#", ".").replace("#PAREN#", ")")
    return text

# Custom sentence tokenizer function
def custom_sentence_tokenizer(text):
    """Split ``text`` into sentences, keeping protected dots intact.

    The text is first passed through ``preprocess_exceptions`` so that dots
    which do not end a sentence are masked. It is then split at every run of
    whitespace that directly follows ``.``, ``!`` or ``?``, and each piece is
    passed through ``postprocess_exceptions`` to restore the masked characters.

    Returns a list of sentence strings; falsy input yields an empty list.
    """
    if not text:
        return []

    # The pattern describes the gaps between sentences, not the sentences
    # themselves, hence gaps=True.
    splitter = RegexpTokenizer(r"(?<=[.!?])\s+", gaps=True)
    masked = preprocess_exceptions(text)
    return [postprocess_exceptions(piece) for piece in splitter.tokenize(masked)]

# Function to split text into sentences and create a new dataset
def create_sentence_dataset(df, column_name):
    """Build a one-row-per-sentence frame from one text column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review_id`` column and the column ``column_name``.
    column_name : str
        Name of the text column to split into sentences.

    Returns
    -------
    pandas.DataFrame
        Columns ``review_id`` and ``sentence``, one row per sentence produced
        by ``custom_sentence_tokenizer``. Rows whose text is missing contribute
        nothing. The two columns are present even when no sentence was
        produced, so callers can always select ``['sentence']``.
    """
    sentence_data = []
    # Walk the two needed columns in parallel instead of materialising a
    # Series per row with iterrows().
    for review_id, text in zip(df['review_id'], df[column_name]):
        if pd.notna(text):  # Check if the text is not NaN
            for sentence in custom_sentence_tokenizer(str(text)):
                sentence_data.append({"review_id": review_id, "sentence": sentence})
    # Explicit columns: an empty record list would otherwise give a frame
    # without any columns.
    return pd.DataFrame(sentence_data, columns=["review_id", "sentence"])

In [9]:
# Function to check if a string contains only punctuation or is empty
def is_only_punctuation_or_empty(text):
    """Return True when ``text`` carries no content worth keeping.

    That is the case when ``text`` is a missing value, is empty or
    whitespace-only, or, after stripping surrounding whitespace, consists
    solely of the characters ``. " , ! ? ; : - ( ) [ ]``.
    """
    if pd.isna(text):
        return True
    stripped = text.strip()
    if not stripped:
        return True
    # Every distinct character must be one of the listed punctuation marks.
    return set(stripped) <= set(".\",!?;:-()[]")

In [10]:
# Process the specific columns and create datasets
columns_to_process = ['patient_review', 'headline', 'pro', 'contra']
datasets = {}

for column in columns_to_process:
    # One row per sentence for this text column
    datasets[column] = create_sentence_dataset(reviews_df, column)
    # Filter out rows that contain only punctuation
    datasets[column] = datasets[column][~datasets[column]['sentence'].apply(is_only_punctuation_or_empty)]

# Save the datasets to CSV files
for column, dataset in datasets.items():
    # The backslashes are escaped explicitly. The unescaped form relied on
    # "\s" and "\{" being invalid escape sequences that Python passes through,
    # which raises a SyntaxWarning on Python 3.12+. The resulting path string
    # is unchanged (Windows-style separators).
    output_path = f"data\\sentence_data\\{column}_sentences.csv"
    dataset.to_csv(output_path, index=False)
    print(f"Dataset for '{column}' saved to {output_path}")

Dataset for 'patient_review' saved to data\sentence_data\patient_review_sentences.csv
Dataset for 'headline' saved to data\sentence_data\headline_sentences.csv
Dataset for 'pro' saved to data\sentence_data\pro_sentences.csv
Dataset for 'contra' saved to data\sentence_data\contra_sentences.csv


In [11]:
# Re-load the sentence-level data written for the 'patient_review' column.
# The backslashes are escaped explicitly ("\s" and "\p" are invalid escape
# sequences and raise a SyntaxWarning on Python 3.12+); the path string itself
# is unchanged.
patient_review_df = pd.read_csv("data\\sentence_data\\patient_review_sentences.csv", sep=",")

# Count how many sentences each review was split into
counts = patient_review_df['review_id'].value_counts()

# Keep only the reviews that produced more than one sentence
reviews_split = counts[counts > 1]

# Display the reviews that were split, with their sentence counts
print("No. of split reviews:", reviews_split.count())
print("\nreviews that were split:")
print(reviews_split)


No. of split reviews: 456

reviews that were split:
reviewID
365    36
288    33
309    31
369    30
363    27
       ..
302     2
3       2
137     2
107     2
144     2
Name: count, Length: 456, dtype: int64


In [12]:
# Show all sentences that were extracted from review 365
patient_review_df.loc[patient_review_df['review_id'].eq(365)]

Unnamed: 0,reviewID,sentence
3445,365,schon bei der terminvergabe wurde mir von der ...
3446,365,so bekam ich drei wochen später (anfang Oktobe...
3447,365,"am tag der aufnahme eine kurze begutachtung, b..."
3448,365,dann kam der feiertag und das wochenende.
3449,365,auch der montag war ohne behandlung vorbei.
3450,365,am dienstag wurde mir gesagt es sei noch 16uhr...
3451,365,dienstag war ich dann gegen 19.30uhr auf dem h...
3452,365,ein schnelles abschlussgespräch fand vorn im e...
3453,365,mein erhöhter blutdruck wurde als nichtig abge...
3454,365,sonst war das abschlussgespräch auch nur schne...


In [13]:
# Show the full review row with review_id 156
reviews_df.loc[reviews_df['review_id'].eq(156)]

Unnamed: 0,reviewID,datePublished,ratingValue,headline,for_hospital_rating,department,author,role,year_of_stay,recommends_hospital,...,administration_and_processes_stars,administration_and_processes,equipment_and_design_stars,equipment_and_design,pro,contra,disease,private_insurance,patient_review,no_comments
155,156,2019-10-21,1,geht es hier überhaupt noch um Patienten?,True,Augen,Hiero02,Patient,2019.0,False,...,2,"weniger zufrieden (umständlich, wahnsinniger P...",2.0,weniger zufrieden (die Uni sollte ja auf dem h...,Möglichkeit alles zu untersuchen,man müsste Intresse dazu haben,Katarakt-OP,False,"zum 2. mal eine Katarakt-OP. Das 1. Mal, ambul...",0


In [14]:
# Show the review row(s) whose author is 'Haus7'
reviews_df.loc[reviews_df['author'].eq('Haus7')]

Unnamed: 0,reviewID,datePublished,ratingValue,headline,for_hospital_rating,department,author,role,year_of_stay,recommends_hospital,...,administration_and_processes_stars,administration_and_processes,equipment_and_design_stars,equipment_and_design,pro,contra,disease,private_insurance,patient_review,no_comments
187,188,2019-02-12,4,Danke an die Station B2.1.,True,Chirurgie,Haus7,Patient,2019.0,True,...,3,zufrieden (Macht einen Chaotischen Eindruck),3.0,zufrieden (Man ist in einem Krankenhaus nicht ...,Tolle und kompetente Ärzteinnen und Pflegerinnen,"Schlechte Küche, aber es is wenigstens ab und ...",Hüft Prothese Wechsel nach 13 Jahren,False,Anwesenheit vom 28.01.19 Es ist Sonntag und wi...,1
