In [74]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re

def clean_text(input_file, output_file, columns):
    """Clean the text columns of the English rows of a CSV and save the result.

    Reads ``input_file``, keeps only the rows whose ``language`` column equals
    ``'en'``, and for every column named in ``columns`` tokenizes each cell with
    NLTK's ``word_tokenize``, strips one leading and one trailing double quote
    from each token, and re-joins the tokens with single spaces. Missing values
    anywhere in the frame are then replaced by empty strings and the frame is
    written to ``output_file`` without the index.

    Args:
        input_file: Path (or anything ``pd.read_csv`` accepts) of the raw CSV.
            It must contain a ``language`` column and every column in ``columns``.
        output_file: Destination path for the cleaned CSV.
        columns: Iterable of column names whose text should be cleaned.

    Returns:
        None. The cleaned data is written to ``output_file``.

    Raises:
        KeyError: If ``language`` or one of ``columns`` is not a column of the CSV.
    """
    df = pd.read_csv(input_file)

    # Take an explicit copy of the English subset so the column assignments
    # below write to an independent frame instead of a slice of the full one.
    df = df[df['language'] == 'en'].copy()

    def _clean_cell(text):
        """Return the tokenized, quote-stripped form of one cell ('' if missing)."""
        # Missing cells must stay empty: str(NaN) would produce the literal
        # string "nan", which the fillna('') below could no longer remove.
        if pd.isna(text):
            return ''

        tokens = word_tokenize(str(text))

        # NOTE(review): NLTK's Treebank-style tokenizer may rewrite double quotes
        # as `` and '' tokens, in which case this pattern rarely matches --
        # verify against real data before relying on the quote stripping.
        return ' '.join(re.sub(r'^"|"$', '', token) for token in tokens)

    for column in columns:
        df[column] = df[column].apply(_clean_cell)

    # Blank out missing values in the columns that were not cleaned above.
    df = df.fillna('')

    # UTF-8 is pandas' default encoding; it is spelled out to make the intent explicit.
    df.to_csv(output_file, index=False, encoding='utf-8')


In [76]:
# Clean the topics table (title and description text).
clean_text(
    input_file='raw_data/topics.csv',
    output_file='raw_data/cleaned_topics.csv',
    columns=['title', 'description'],
)

# Clean the content table, which additionally carries a free-text `text` column.
clean_text(
    input_file='raw_data/content.csv',
    output_file='raw_data/cleaned_content.csv',
    columns=['title', 'description', 'text'],
)

In [80]:
# Clean up the correlations table to ensure referential integrity: each row
# should associate one topic id with a single content id, not with a
# whitespace-separated list of content ids.
correl = (
    pd.read_csv('raw_data/correlations.csv')
    # Turn the whitespace-separated id string into a list of ids ...
    .assign(content_ids=lambda frame: frame['content_ids'].str.split())
    # ... emit one row per list element ...
    .explode('content_ids')
    # ... and give the column a singular name to match its new contents.
    .rename(columns={'content_ids': 'content_id'})
)

correl.to_csv('raw_data/cleaned_correlations.csv', index=False)

