# Imports

In [1]:
import re

import pandas as pd
from git_root import git_root

In [2]:
# Resolved once here; the input and output CSV paths below are built from it.
# NOTE(review): presumably the repository root directory -- confirm against git_root.
my_git_root = git_root()

In [3]:
# Input CSV; the cleaning cells below overwrite its 'text' column step by step.
df_chunked_path = f'{my_git_root}/data/output/documents.csv'
# NOTE(review): read_csv loads empty cells as NaN (a float), and the regex
# cleaners below expect str -- confirm 'text' has no missing values.
df_chunked = pd.read_csv(df_chunked_path)

# Clean step by step

Remove URLs

In [4]:
def remove_urls(text):
    """Replace every http(s) URL in ``text`` with a single space.

    A URL is ``http://`` or ``https://`` followed by the longest run of
    non-whitespace characters, so punctuation glued to the end of a URL
    (for example a closing parenthesis) is removed together with it.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each URL replaced by one space character.
    """
    url_pattern = re.compile(r'https?:\/\/[^\s]+')
    return url_pattern.sub(' ', text)

# Overwrite the 'text' column with its URL-stripped version, one value at a time.
df_chunked['text'] = df_chunked['text'].apply(remove_urls)

Remove markdown images

In [5]:
def remove_images(text):
    """Delete markdown image tags of the form ``![alt](target)`` from ``text``.

    Both the alt text and the target are matched lazily so that every image
    is removed on its own. A greedy match would run from the first ``![`` to
    the last ``)`` on the line and delete all ordinary text in between.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each markdown image tag removed (replaced by nothing).
    """
    # `.` does not cross newlines, so an image tag must sit on a single line.
    return re.sub(r'!\[.*?\]\(.*?\)', '', text)

# Overwrite the 'text' column with markdown image tags removed.
# NOTE(review): the URL-removal step runs first and its pattern consumes a
# trailing ')', so images whose target was an http(s) URL may no longer match
# here -- confirm the intended order of these two steps.
df_chunked['text'] = df_chunked['text'].apply(remove_images)

Flatten HTML tables to plain text (or strip HTML tags when no complete table is present)

In [6]:
def remove_tags_in_table(match):
    """Turn the inner markup of one HTML table into plain text.

    Intended as the replacement callback for ``re.sub``: group 2 of ``match``
    must hold the markup found between ``<table>`` and ``</table>``.

    Cells on a row end up separated by single spaces and rows by newlines.

    Args:
        match: A regex match object whose group 2 is the table body.

    Returns:
        The table body as plain text with row and cell tags removed.
    """
    table_content = match.group(2)
    # Applied strictly in this order; each step relies on the previous one.
    substitutions = (
        (r'</?td>', ' '),        # plain cell tags become separators
        (r'</tr><tr>', '\n'),    # a row boundary becomes a line break
        (r'</?tr>', ''),         # the remaining first/last row tags vanish
        # Cell tags carrying span attributes. One or more attributes are
        # accepted so that <td colspan=".." rowspan=".."> is removed as well
        # instead of leaking into the output.
        (r'<td(?: (?:colspan|rowspan)="\d+")+>', ''),
        (r' +', ' '),            # collapse the runs of spaces created above
        (r'\n ', '\n'),          # drop the space left at the start of a row
    )
    for pattern, replacement in substitutions:
        table_content = re.sub(pattern, replacement, table_content)
    return table_content

def has_table(text):
    """Report whether ``text`` contains a complete HTML table wrapper.

    A table counts as complete when ``<html><body><table>`` is later followed
    by ``</table></body></html>``. ``re.DOTALL`` lets the table body span
    several lines, matching the substitution in ``extract_table``, which also
    uses ``re.DOTALL``. Without the flag, multi-line tables go undetected.

    Args:
        text: The string to inspect.

    Returns:
        True if a complete table wrapper is found anywhere in ``text``.
    """
    return bool(re.search(r'\s*<html><body><table>.*</table></body></html>\s*', text, flags=re.DOTALL))

def extract_table(text):
    """Flatten HTML tables in ``text``, or strip stray tags if there is none.

    When ``has_table`` finds a complete table, every
    ``<html><body><table>...</table></body></html>`` span is replaced by the
    plain-text rendering produced by ``remove_tags_in_table``. Otherwise each
    individual ``<...>`` tag is replaced by a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    if has_table(text):
        return re.sub(
            r"(<html><body><table>)(.*?)(</table></body></html>)",
            remove_tags_in_table,
            text,
            flags=re.DOTALL,
        )
    # Match one tag at a time. A greedy `<.*>` would span from the first '<'
    # to the last '>' on a line and delete the text between separate tags
    # (for example the cell contents of a table cut off by chunking).
    return re.sub(r'<[^<>\n]*>', ' ', text)

# Overwrite the 'text' column with tables flattened to text / HTML tags stripped.
df_chunked['text'] = df_chunked['text'].apply(extract_table)

Remove leading and trailing whitespace

In [7]:
def remove_whitespaces(text):
    """Strip leading and trailing whitespace from ``text``.

    Whitespace inside the string is left untouched.

    Args:
        text: The string to trim.

    Returns:
        ``text`` without whitespace at either end.
    """
    # str.strip() trims the same characters as the `^\s+` / `\s+$` regexes,
    # but in linear time; `\s+$` rescans every inner run of whitespace.
    return text.strip()

# Overwrite the 'text' column with leading/trailing whitespace trimmed.
df_chunked['text'] = df_chunked['text'].apply(remove_whitespaces)

In [8]:
# Write the cleaned frame to a separate CSV; index=False leaves the row index out.
df_chunked.to_csv(f'{my_git_root}/data/output/documents_cleaned.csv', index=False)