In [3]:
import os
import json
import re
import csv 
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import pandas as pd

# Make sure the NLTK data packages (tokenizer models, stop-word lists,
# WordNet, POS tagger) are available locally before any processing starts.
for _resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Only the leading letter of the tag matters: ``J`` selects the adjective
    constant, ``V`` the verb, ``N`` the noun and ``R`` the adverb.  Any other
    tag returns ``None`` so the caller can lemmatize without a POS hint.
    """
    # (tag prefix, attribute name on nltk.corpus.wordnet), checked in order.
    wordnet_attr_by_prefix = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in wordnet_attr_by_prefix:
        if treebank_tag.startswith(prefix):
            # Looked up lazily so WordNet is only touched when a prefix matches.
            return getattr(nltk.corpus.wordnet, attr_name)
    return None
        
def preprocess_text(text):
    """Clean, tokenize, filter and lemmatize a piece of article text.

    Pipeline:
      1. Strip HTML tags, then every character that is not an ASCII letter
         or whitespace, and collapse whitespace runs.
      2. Lower-case the text and tokenize it into words of >= 2 letters.
      3. Drop the token ``reuters``.
      4. Drop NLTK English stop words plus the custom preposition and
         custom stop-word lists.
      5. Drop tokens longer than 10 letters.
      6. POS-tag the remaining tokens and lemmatize each one, using the
         WordNet POS derived from its tag when there is one.
      7. Drop stop words again, together with the post-lemmatization removal
         list, so inflected forms that lemmatize to a stop word
         (e.g. "weeks" -> "week") are removed as well.
      8. Drop lemmas shorter than two letters.

    Args:
        text: Raw text (e.g. an article title or body) as a string.

    Returns:
        list[str]: The lemmatized tokens that survive all filters, in their
        original order.
    """
    # 1. Clean the text. Tags are replaced by a space (not '') so that words
    #    separated only by markup, e.g. "end</p><p>start", do not fuse into
    #    one token; DOTALL lets a tag spanning several lines match as well.
    text = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # 2. Tokenization: only keep words with >= 2 letters. Hyphens were already
    #    stripped in step 1, so the '-' in the character class never matches.
    tokenizer = RegexpTokenizer(r'\b[a-zA-Z-]{2,}\b')
    tokens = tokenizer.tokenize(text.lower())

    # 3. Remove 'reuters' tokens (the text is already lower-cased)
    tokens = [word for word in tokens if word != 'reuters']

    # 4. Remove custom stop words (prepositions) and standard stop words
    stop_words = set(stopwords.words('english'))
    custom_prepositions = {'above', 'across', 'after', 'against', 'along', 'among',
                           'around', 'at', 'before', 'behind', 'below', 'beneath',
                           'beside', 'between', 'beyond', 'by', 'despite', 'down',
                           'except', 'for', 'from', 'in', 'inside', 'into', 'near',
                           'of', 'off', 'on', 'onto', 'out', 'over', 'past',
                           'regarding', 'round', 'since', 'through', 'throughout',
                           'till', 'to', 'toward', 'under', 'underneath', 'until',
                           'unto', 'up', 'upon', 'with', 'within', 'without'}

    custom_stop_words = {
        'china', 'usa', 'us', 'america', 'american', 'americans', 'chinese', 'russia',
        'russian', 'putin', 'vladimir', 'trump', 'donald', 'biden', 'joe', 'ukraine',
        'ukrainian', 'ukrainians', 'ukraines', 'say', 'jan', 'feb', 'mar', 'apr', 'may',
        'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'mon', 'tue', 'wed', 'thu', 'fri',
        'sat', 'sun', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
        'sunday', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
        'september', 'october', 'november', 'december', 'today', 'yesterday', 'tomorrow',
        'week', 'month', 'year', 'time', 'day', 'weekend', 'morning', 'afternoon',
        'evening', 'night', 'news', 'new'
    }

    stop_words.update(custom_prepositions)
    stop_words.update(custom_stop_words)
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Remove words longer than 10 letters
    tokens = [word for word in tokens if len(word) <= 10]

    # 6. POS tagging and lemmatization. Tokens whose tag has no WordNet
    #    counterpart are lemmatized with the lemmatizer's default POS.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for word, tag in pos_tag(tokens):
        wn_tag = get_wordnet_pos(tag)
        if wn_tag:
            lemmatized_tokens.append(lemmatizer.lemmatize(word, wn_tag))
        else:
            lemmatized_tokens.append(lemmatizer.lemmatize(word))

    # 7. Remove specific words after lemmatization. The stop-word set is
    #    applied a second time here because step 4 only sees surface forms:
    #    "years" or "said" pass that filter and only become "year" / "say"
    #    once lemmatized.
    words_to_remove = {'licensing', 'right', 'thomson', 'trust', 'tabsuggested',
                       'principle', 'open', 'new', 'standard', 'say', 'co', 'ltd'}
    words_to_remove.update(['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
                            'nine', 'ten', 'hundred', 'thousand', 'million', 'billion', 'trillion'])
    post_lemma_blocklist = stop_words | words_to_remove
    lemmatized_tokens = [word for word in lemmatized_tokens
                         if word not in post_lemma_blocklist]

    # 8. Remove single-letter words (a lemma can be shorter than its token)
    lemmatized_tokens = [word for word in lemmatized_tokens if len(word) >= 2]

    return lemmatized_tokens
    
def process_articles(input_directory, output_directory):
    """Preprocess every JSON article under a directory and export them as CSV.

    Walks ``input_directory`` recursively, loads each ``*.json`` file and runs
    ``preprocess_text`` on its ``title`` and ``body``.  An article is skipped
    when it lacks a title or body, when either field is empty after
    preprocessing, or when its processed title repeats one already kept.
    The remaining articles are written to
    ``<output_directory>/processed_articles.csv`` with the columns ``title``,
    ``date`` and ``body``; token lists are stored as ``', '``-joined strings.
    When no article qualifies, a CSV containing only the header row is
    written.

    Args:
        input_directory: Root directory searched for ``.json`` article files.
        output_directory: Directory that receives the CSV; created if missing.

    Returns:
        None. The output path is printed once the file has been written.
    """
    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    columns = ['title', 'date', 'body']
    rows = []            # One dict per kept article, already in CSV shape
    seen_titles = set()  # Processed titles (as tuples) for O(1) duplicate checks

    for root, _dirs, files in os.walk(input_directory):
        for file in files:
            if not file.endswith('.json'):
                continue

            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                article = json.load(f)

            # Skip articles that have no title or no body
            if not article.get('title') or not article.get('body'):
                continue

            # Preprocess the title and body
            title_tokens = preprocess_text(article['title'])
            body_tokens = preprocess_text(article['body'])

            # Skip articles left empty by preprocessing
            if not title_tokens or not body_tokens:
                continue

            # Skip duplicates: token lists are unhashable, so compare as tuples
            title_key = tuple(title_tokens)
            if title_key in seen_titles:
                continue
            seen_titles.add(title_key)

            rows.append({
                # Token lists are flattened into comma-separated strings
                'title': ', '.join(title_tokens),
                # Use 'Unknown' if the date is missing
                'date': article.get('date', 'Unknown'),
                'body': ', '.join(body_tokens),
            })

    # Passing the columns explicitly keeps the frame well-formed (and the CSV
    # header present) even when no article was kept.
    df = pd.DataFrame(rows, columns=columns)

    # Build the output file path
    output_path = os.path.join(output_directory, "processed_articles.csv")

    # Save the processed data to a CSV file
    df.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL)

    print(f"Processed data saved to CSV file: {output_path}")
    
if __name__ == "__main__":
    # Script entry point: JSON articles are read from the first directory and
    # the resulting CSV is written into the second.
    input_directory, output_directory = "articles", "processed_articles"
    process_articles(
        input_directory=input_directory,
        output_directory=output_directory,
    )

[nltk_data] Downloading package punkt to /Users/maojialu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maojialu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/maojialu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/maojialu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Processed data saved to CSV file: processed_articles/processed_articles.csv
