In [43]:
import pandas as pd
import re

In [56]:
df = pd.read_csv('scraped_data.csv')

In [58]:
## Data Cleaning
def remove_unwanted_text(df, unwanted_text):
   df['text'] = df['text'].apply(lambda x: x.replace(unwanted_text, '', 1) if x.starstswith(unwanted_text) else x)
   return df

def search_and_remove_patterns(df, pattern_list, verbose=False):
    """
    This function searches and removes patterns from the pattern list, 
    showing you which matches and where have they been found,
    also telling you how many matches have been replaced by ''.
    """
    words_removed = []
    for i, text in enumerate(df['text']):
        if verbose: print(f'\n\n**************Text {i}****************')
        matched_search = True
        while matched_search is True:
            matched_search = False
            for pattern in pattern_list:
                result = re.search(pattern, df.loc[i, 'text'], flags=0)
                if result!=None:
                    if verbose: print(f'match: {result[0]}')
                    matched_search = True
                    words_removed.append(result[0])
                    df.loc[i, 'text'] = df.loc[i, 'text'].replace(result[0], '').strip() # strip removes whitespaces accumulated at the start/end of the string
    print(f'{len(words_removed)} words were removed ({len(set(words_removed))} different words): \n{set(words_removed)}')
    return df


def preprocess_text(text):
    """
    With thisfunction we remove PDF files [r'\(PDF, \d+\.\d* KB\)', r'\(PDF, \d+\.\d* MB\)', r'\(PDF\)'], tags r'\[\d+\]', and excessive whitespace r'\s+'.
    """
    pdf_pattern_list = [r'\(PDF, \d+\.\d* KB\)', r'\(PDF, \d+\.\d* MB\)', r'\(PDF\)']
    # Remove PDF files
    for pattern in pdf_pattern_list:
        text = re.sub(pattern, '', text)
    # Remove reference tags
    text = re.sub(r'\[\d+\]', '', text)
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def split_into_sections(text, max_length=1600):
    # Split the text into paragraphs
    paragraphs = text.split('\n')
    sections = []
    section = ''
    for paragraph in paragraphs:
        # If adding the next paragraph doesn't exceed the maximum length,
        # add the paragraph to the current section
        if len(section) + len(paragraph) < max_length:
            section += paragraph
        else:
            # If it does, start a new section
            sections.append(section)
            section = paragraph
    # Don't forget to add the last section
    if section:
        sections.append(section)
    return sections

def post_process(df):
    # Clean up the text
    df['text'] = df['text'].apply(preprocess_text)
    # Split each article into sections
    df['sections'] = df['text'].apply(split_into_sections)
    # Prepend titles to each section's text
    df['sections'] = df.apply(lambda row: [f"{row['title']}\n{section}" for section in row['sections']], axis=1)
    # Unroll the sections into separate rows
    df = df.explode('sections')
    df['text'] = df['sections']
    df.drop(columns=['sections'], inplace=True)
    return df

df1 = post_process(df)

In [60]:
search_and_remove_patterns(df1, [r'\(PDF, \d+\.\d* KB\)', r'\(PDF, \d+\.\d* MB\)', r'\(PDF\)'], verbose=True)



**************Text 0****************


TypeError: expected string or bytes-like object, got 'Series'