In [None]:
import json
import re
import nltk
from nltk.tokenize import word_tokenize
import spacy
from tqdm import tqdm

# Download NLTK tokenizer resources.
# word_tokenize needs 'punkt' on older NLTK releases and 'punkt_tab' on newer
# ones; requesting both keeps a fresh kernel working on either. A resource that
# an installation does not know about is reported by nltk.download and skipped.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)


In [None]:

# Load spaCy model for better topic detection
try:
    nlp = spacy.load('en_core_web_md')
    print("Loaded spaCy model successfully")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Catching only that keeps Ctrl-C and unrelated failures visible.
    print("Downloading spaCy model (this may take a moment)...")
    import subprocess
    import sys
    # sys.executable is the interpreter running this kernel, so the model is
    # installed into the same environment that will load it. check=True makes
    # a failed download raise here instead of failing later in spacy.load.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_md"],
        check=True,
    )
    nlp = spacy.load('en_core_web_md')

# Define science/health/nutrition related terms.
# Each category maps to a list of lowercase keywords; every keyword appears
# once per list so the similarity fallback does no repeated work.
TOPIC_KEYWORDS = {
    'science': [
        'science', 'research', 'experiment', 'theory', 'scientific', 'scientist', 'study',
        'laboratory', 'evidence', 'hypothesis', 'analysis', 'discovery', 'biology', 'physics',
        'chemistry', 'astronomy', 'geology', 'technology', 'innovation', 'engineering'
    ],
    'health': [
        'health', 'medical', 'medicine', 'disease', 'doctor', 'patient', 'treatment', 'cure',
        'symptoms', 'diagnosis', 'therapy', 'wellness', 'illness', 'healthcare', 'hospital',
        'clinic', 'recovery', 'prevention', 'condition', 'immune', 'surgery', 'mental health',
        'physical', 'wellbeing', 'cancer', 'heart', 'brain', 'lungs'
    ],
    'nutrition': [
        'nutrition', 'diet', 'food', 'healthy eating', 'nutrient', 'vitamin', 'mineral',
        'protein', 'carbohydrate', 'fat', 'calorie', 'metabolism', 'digestion', 'meal',
        'vegetable', 'fruit', 'meat', 'dairy', 'weight', 'obesity', 'sugar', 'organic',
        'supplement', 'fiber', 'hydration', 'fasting', 'macronutrient'
    ]
}

# Flatten the keyword lists into one set for easier checking
ALL_KEYWORDS = set().union(*TOPIC_KEYWORDS.values())

In [None]:

def clean_text(text):
    """
    Clean and normalize text.

    Drops every character that is not a word character, whitespace, or one of
    the punctuation marks . , ! ? ; : ' " -, then collapses each run of
    whitespace (including newlines) into a single space and trims both ends.

    Args:
        text: The string to clean. Falsy values (None, "") are accepted.

    Returns:
        The cleaned string, or "" when `text` is falsy.
    """
    if not text:
        return ""
    # Remove special characters except punctuation. This runs before the
    # whitespace pass so that a removed character sitting between two spaces
    # ("a @ b") does not leave a double space behind.
    text = re.sub(r'[^\w\s.,!?;:\'"-]', '', text)
    # Replace newlines and multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def count_words(text):
    """Return how many tokens NLTK's word_tokenize finds in `text` (0 if falsy)."""
    return len(word_tokenize(text)) if text else 0

# Parsed keyword docs, filled lazily so each keyword goes through spaCy once
# instead of once per call to is_relevant_topic.
_KEYWORD_DOC_CACHE = {}


def _keyword_head_token(keyword):
    """Return the first spaCy token of `keyword`, parsing each keyword only once."""
    if keyword not in _KEYWORD_DOC_CACHE:
        _KEYWORD_DOC_CACHE[keyword] = nlp(keyword)
    return _KEYWORD_DOC_CACHE[keyword][0]


def is_relevant_topic(text, title):
    """
    Check if the text is relevant to science, health, or nutrition
    using both keyword matching and spaCy similarity.

    Args:
        text: Main body text (e.g. an abstract). Falsy values yield False.
        title: Title to consider alongside the text; None is treated as "".

    Returns:
        True if any keyword in ALL_KEYWORDS occurs in the lowercased
        text + title, or if any token with a vector has similarity above
        0.75 with the first token of any keyword in TOPIC_KEYWORDS;
        otherwise False.
    """
    if not text:
        return False

    # Lowercase the combined text; a missing title contributes nothing.
    combined_text = (text + " " + (title or "")).lower()

    # Direct keyword matching.
    # NOTE(review): this is a substring test, so short keywords also match
    # inside longer words ('fat' in 'father', 'cure' in 'secure'). Left as-is
    # because it also catches plurals such as 'vitamins'; confirm this is the
    # intended trade-off.
    if any(keyword.lower() in combined_text for keyword in ALL_KEYWORDS):
        return True

    # Use spaCy for semantic similarity; only tokens that have a vector can
    # be compared.
    doc = nlp(combined_text)
    doc_tokens = [token for token in doc if token.vector_norm]

    for keywords in TOPIC_KEYWORDS.values():
        for keyword in keywords:
            # Only the first token of a multi-word keyword is compared.
            keyword_token = _keyword_head_token(keyword)
            if not keyword_token.vector_norm:
                continue
            for token in doc_tokens:
                if token.similarity(keyword_token) > 0.75:  # Threshold for similarity
                    return True

    return False

def filter_arxiv_dataset(input_file, output_file, min_words=100, max_words=120, max_entries=1000):
    """
    Filter arXiv abstracts dataset to get entries related to science, health, and nutrition
    with word count between min_words and max_words, taking the first max_entries that match.

    The input is read as JSON Lines (one JSON object per line). Each kept entry
    is written to `output_file` as one JSON line as soon as it is found.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write; it is overwritten.
        min_words: Smallest abstract word count to keep (inclusive).
        max_words: Largest abstract word count to keep (inclusive).
        max_entries: Stop after this many entries have been kept.

    Returns:
        List of the kept entries, each a dict with the keys 'id', 'title',
        'abstract', 'authors', 'categories' and 'word_count'.
    """
    # First pass: count total lines for progress bar
    with open(input_file, 'r', encoding='utf-8') as f:
        line_count = sum(1 for _ in f)

    print(f"Processing {line_count} entries from {input_file}...")

    valid_entries = []
    malformed_lines = 0  # lines that could not be parsed as JSON

    # Second pass: process entries. The progress bar is a context manager so
    # it is closed cleanly even when the loop stops early.
    with open(input_file, 'r', encoding='utf-8') as f, \
            open(output_file, 'w', encoding='utf-8') as out_f, \
            tqdm(f, total=line_count, desc="Filtering entries") as progress:
        for line in progress:
            try:
                entry = json.loads(line)

                # Get abstract and title
                abstract = clean_text(entry.get('abstract', ''))
                title = clean_text(entry.get('title', ''))

                # The cheap word-count test runs first so the expensive topic
                # check is only done for abstracts of the right length.
                word_count = count_words(abstract)
                if not (min_words <= word_count <= max_words):
                    continue
                if not is_relevant_topic(abstract, title):
                    continue

                # Create a new entry with the fields we want to keep
                filtered_entry = {
                    'id': entry.get('id', ''),
                    'title': title,
                    'abstract': abstract,
                    'authors': entry.get('authors', ''),
                    'categories': entry.get('categories', []),
                    'word_count': word_count
                }

                # Write directly to output file
                out_f.write(json.dumps(filtered_entry) + '\n')
                valid_entries.append(filtered_entry)

                # Report progress periodically; tqdm.write prints without
                # breaking the progress bar rendering.
                if len(valid_entries) % 100 == 0:
                    tqdm.write(f"Found {len(valid_entries)} valid entries so far")

                # Stop if we've reached the desired number of entries
                if len(valid_entries) >= max_entries:
                    tqdm.write(f"Reached {max_entries} entries, stopping.")
                    break

            except json.JSONDecodeError:
                # Best effort: skip unparsable lines, but keep a count so the
                # loss is reported rather than hidden.
                malformed_lines += 1
                continue
            except Exception as e:
                tqdm.write(f"Error processing entry: {e}")
                continue

    print(f"Found and saved {len(valid_entries)} entries matching criteria to {output_file}")
    if malformed_lines:
        print(f"Skipped {malformed_lines} lines that were not valid JSON")

    # Print some statistics
    word_counts = [entry['word_count'] for entry in valid_entries]
    if word_counts:
        avg_word_count = sum(word_counts) / len(word_counts)
        print(f"Average word count: {avg_word_count:.2f}")
        print(f"Min word count: {min(word_counts)}")
        print(f"Max word count: {max(word_counts)}")

    return valid_entries

In [None]:
# Configuration parameters
# Paths: replace with your own input / desired output locations.
input_file = "arxiv-abstracts.jsonl"
output_file = "filtered_arxiv_dataset.jsonl"
# Inclusive word-count window for abstracts.
min_words = 100
max_words = 120
# Maximum number of entries to collect.
max_entries = 1000


def run_filter():
    """Call filter_arxiv_dataset with the module-level configuration values."""
    settings = {
        'input_file': input_file,
        'output_file': output_file,
        'min_words': min_words,
        'max_words': max_words,
        'max_entries': max_entries,
    }
    return filter_arxiv_dataset(**settings)


# To analyze word count distribution
def analyze_word_counts(jsonl_file):
    """
    Summarize and plot the 'word_count' values stored in a JSONL file.

    Prints the pandas describe() table, shows a 20-bin histogram, and returns
    a single-column DataFrame ('word_count') with one row per input line.
    """
    import matplotlib.pyplot as plt
    import pandas as pd

    # One JSON object per line; only its 'word_count' field is used.
    with open(jsonl_file, 'r', encoding='utf-8') as handle:
        counts = [json.loads(row)['word_count'] for row in handle]

    # Create DataFrame for analysis
    frame = pd.DataFrame({'word_count': counts})

    # Display statistics
    summary = frame.describe()
    print("Word Count Statistics:")
    print(summary)

    # Plot histogram
    plt.figure(figsize=(10, 6))
    plt.hist(counts, bins=20, alpha=0.7)
    plt.title('Word Count Distribution')
    plt.xlabel('Word Count')
    plt.ylabel('Frequency')
    plt.grid(alpha=0.3)
    plt.show()

    return frame

# Function to convert JSONL to CSV
def convert_to_csv(jsonl_file, csv_file):
    """
    Convert a JSONL file to CSV format, keeping only 'abstract' and 'word_count'.

    Args:
        jsonl_file: Path of the JSONL file to read (one JSON object per line).
        csv_file: Path of the CSV file to write; every field is quoted.

    Returns:
        The DataFrame that was written, with columns 'abstract' and
        'word_count'. An input without entries yields an empty DataFrame and
        a header-only CSV.

    Raises:
        KeyError: if the entries do not provide both kept columns.
    """
    import csv
    import pandas as pd

    kept_columns = ['abstract', 'word_count']

    # Read JSONL into a list of dictionaries; blank lines carry no entry.
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    if data:
        # Removing all columns except abstract and word_count
        df = pd.DataFrame(data)[kept_columns]
    else:
        # With no entries there are no columns to select from, so build the
        # empty frame with the expected columns directly.
        df = pd.DataFrame(columns=kept_columns)

    # Save to CSV
    df.to_csv(csv_file, index=False, quoting=csv.QUOTE_ALL)
    print(f"Converted {len(data)} entries to CSV format")
    print(f"Saved to {csv_file}")

    return df



In [None]:
# Example usage:
# Run the filter with the configuration values defined above.
# run_filter() returns whatever filter_arxiv_dataset returns (the list of kept
# entries); the same entries are also written to `output_file`.
filtered_data = run_filter()


In [None]:
# Analyze word counts (optional; uncomment to print statistics and show a
# histogram — needs matplotlib and pandas)
# word_count_data = analyze_word_counts(output_file)

# Convert to CSV: reads the filtered JSONL written by the run above and keeps
# only the 'abstract' and 'word_count' columns.
csv_df = convert_to_csv(output_file, "filtered_abstracts.csv")