In [17]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re
import inflect

In [74]:
def clean_text(input_file, output_file, columns):
    """
    Clean selected text columns of a CSV file and write the result to a new CSV.

    Processing steps:
      1. Keep only the rows whose ``language`` column equals ``'en'``.
      2. For every column in ``columns``, tokenize each value with NLTK's
         ``word_tokenize``, strip one leading and one trailing double quote
         from each token, and re-join the tokens with single spaces.
      3. Replace every missing value in the frame with an empty string.

    :param input_file: path to the input CSV file (must contain a ``language`` column)
    :param output_file: path to save the cleaned CSV file
    :param columns: list of columns to be cleaned
    """
    df = pd.read_csv(input_file)

    # .copy() gives an independent frame, so the column assignments below do not
    # write into a slice of the original (avoids SettingWithCopyWarning).
    df = df[df['language'] == 'en'].copy()

    def strip_token_quotes(text):
        """Tokenize one cell and drop a leading/trailing double quote from each token."""
        # Missing cells must stay empty: str(NaN) would store the literal word
        # "nan", which the fillna('') below could no longer detect.
        if pd.isna(text):
            return ''
        tokens = word_tokenize(str(text))
        # NOTE(review): NLTK's tokenizer may rewrite double quotes as `` and ''
        # tokens, which this pattern does not match - confirm that the quote
        # stripping has the intended effect on real data.
        return ' '.join(re.sub(r'^"|"$', '', token) for token in tokens)

    for column in columns:
        df[column] = df[column].apply(strip_token_quotes)

    # Blank out missing values in the columns that were not cleaned above.
    df = df.fillna('')

    # to_csv writes UTF-8 by default.
    df.to_csv(output_file, index=False)


In [76]:
# Each job is (input CSV, output CSV, text columns to clean).
cleaning_jobs = [
    ('raw_data/topics.csv', 'raw_data/cleaned_topics.csv', ['title', 'description']),
    ('raw_data/content.csv', 'raw_data/cleaned_content.csv', ['title', 'description', 'text']),
]
for source_path, target_path, text_columns in cleaning_jobs:
    clean_text(source_path, target_path, text_columns)

In [80]:
# Clean up the correlations to ensure referential integrity by removing the lists:
# each row must associate one topic id with a single content id, not a list of content ids

# Split the whitespace-separated ids into lists, emit one row per id,
# then give the column a singular name.
correl = (
    pd.read_csv('raw_data/correlations.csv')
    .assign(content_ids=lambda frame: frame['content_ids'].str.split())
    .explode('content_ids')
    .rename(columns={'content_ids': 'content_id'})
)
correl.to_csv('raw_data/cleaned_correlations.csv', index=False)



In [40]:
_INFLECT_ENGINE = None


def _get_inflect_engine():
    """Return a shared inflect engine, creating it on first use."""
    global _INFLECT_ENGINE
    if _INFLECT_ENGINE is None:
        _INFLECT_ENGINE = inflect.engine()
    return _INFLECT_ENGINE


def preprocess_text(text):
    """
    Normalize a single text value.

    Missing values (None, NaN, pd.NA) become an empty string. Anything else is
    converted to ``str``, runs of non-ASCII characters are replaced by a space,
    punctuation/symbols are dropped, digit runs are spelled out as English
    words, and the result is lower-cased.

    :param text: a string of text to be preprocessed (any scalar is accepted)
    :return: the normalized, lower-cased string ('' for missing input)
    """
    # str(NaN) is the literal "nan", so missing values must be caught before
    # the string conversion or they would be stored as the word "nan".
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    def number_to_words(match):
        # The engine is fetched only when a digit run is actually found, and
        # is shared across calls instead of being rebuilt for every value.
        return _get_inflect_engine().number_to_words(match.group(0))

    # Replace each run of non-ASCII characters with a single space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Drop everything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)

    # Spell out digit runs as words. This runs after the punctuation strip,
    # so any punctuation in the generated number words is kept.
    text = re.sub(r'\d+', number_to_words, text)

    return text.lower()

In [41]:
# Normalize the copyright_holder column of the cleaned content table
holder_df = pd.read_csv('raw_data/cleaned_content.csv')

# Number of distinct holder values before normalization
holders_before = holder_df['copyright_holder'].unique()
print(len(holders_before))

normalized_holders = holder_df['copyright_holder'].apply(preprocess_text)
holder_df['copyright_holder'] = normalized_holders
holder_df.to_csv('raw_data/cleaned_content_final.csv', index=False)

115


In [39]:
# Sanity check: count the distinct holder values currently in holder_df
len(pd.unique(holder_df['copyright_holder']))

114