In [11]:
import json
import re
import random
from nltk.tokenize import word_tokenize
import nltk
import spacy
from tqdm import tqdm

# Download NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utilizador\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Utilizador\AppData\Roaming\nltk_data...


[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [12]:
# Load spaCy model for better topic detection
try:
    nlp = spacy.load('en_core_web_md')
    print("Loaded spaCy model successfully")
except:
    print("Downloading spaCy model (this may take a moment)...")
    import subprocess
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_md"])
    nlp = spacy.load('en_core_web_md')

# Define science/health/nutrition related terms
TOPIC_KEYWORDS = {
    'science': [
        'science', 'research', 'experiment', 'theory', 'scientific', 'scientist', 'study', 
        'laboratory', 'evidence', 'hypothesis', 'analysis', 'discovery', 'biology', 'physics', 
        'chemistry', 'astronomy', 'geology', 'technology', 'innovation', 'engineering'
    ],
    'health': [
        'health', 'medical', 'medicine', 'disease', 'doctor', 'patient', 'treatment', 'cure', 
        'symptoms', 'diagnosis', 'therapy', 'wellness', 'illness', 'healthcare', 'hospital', 
        'clinic', 'recovery', 'prevention', 'condition', 'immune', 'surgery', 'mental health',
        'physical', 'wellbeing', 'disease', 'cancer', 'heart', 'brain', 'lungs'
    ],
    'nutrition': [
        'nutrition', 'diet', 'food', 'healthy eating', 'nutrient', 'vitamin', 'mineral', 
        'protein', 'carbohydrate', 'fat', 'calorie', 'metabolism', 'digestion', 'meal', 
        'vegetable', 'fruit', 'meat', 'dairy', 'weight', 'obesity', 'sugar', 'organic', 
        'supplement', 'fiber', 'hydration', 'fasting', 'macronutrient'
    ]
}

# Flatten the keywords list for easier checking
ALL_KEYWORDS = set()
for category_keywords in TOPIC_KEYWORDS.values():
    ALL_KEYWORDS.update(category_keywords)

Loaded spaCy model successfully


In [13]:
def clean_text(text):
    """Clean and normalize text"""
    if not text:
        return ""
    # Replace newlines and multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters except punctuation
    text = re.sub(r'[^\w\s.,!?;:\'"-]', '', text)
    return text.strip()

def count_words(text):
    """Count words in text"""
    if not text:
        return 0
    words = word_tokenize(text)
    return len(words)

def is_relevant_topic(text, question):
    """
    Check if the text is relevant to science, health, or nutrition
    using both keyword matching and spaCy similarity
    """
    if not text:
        return False
    
    # Clean and lowercase the combined text
    combined_text = (text + " " + question).lower()
    
    # Direct keyword matching
    for keyword in ALL_KEYWORDS:
        if keyword.lower() in combined_text:
            return True
    
    # Use spaCy for semantic similarity
    doc = nlp(combined_text)
    
    # Check similarity with topic keywords
    for category, keywords in TOPIC_KEYWORDS.items():
        for keyword in keywords:
            keyword_doc = nlp(keyword)
            # Check if any token in the document is similar to the keyword
            for token in doc:
                if token.vector_norm and keyword_doc[0].vector_norm:  # Check if vectors exist
                    similarity = token.similarity(keyword_doc[0])
                    if similarity > 0.75:  # Threshold for similarity
                        return True
    
    return False

def filter_dataset(input_file, output_file, min_words=100, max_words=120, max_entries=1000):
    """
    Filter JSONL dataset to get entries related to science, health, and nutrition
    with word count between min_words and max_words
    """
    # Read the input file and collect valid entries
    valid_entries = []
    line_count = 0
    
    # First pass: count total lines for progress bar
    with open(input_file, 'r', encoding='utf-8') as f:
        for _ in f:
            line_count += 1
    
    print(f"Processing {line_count} entries from {input_file}...")
    
    # Second pass: process entries
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, total=line_count, desc="Filtering entries"):
            try:
                entry = json.loads(line)
                
                # Clean the text
                answer = clean_text(entry.get('answer', ''))
                question = clean_text(entry.get('question', ''))
                
                # Check word count
                word_count = count_words(answer)
                
                # Check if the entry meets our criteria
                if (min_words <= word_count <= max_words and 
                    is_relevant_topic(answer, question)):
                    
                    # Add the entry along with topic and word count info
                    valid_entries.append({
                        'question': question,
                        'answer': answer,
                        'word_count': word_count
                    })
                    
                    # Print progress periodically
                    if len(valid_entries) % 100 == 0:
                        print(f"Found {len(valid_entries)} valid entries so far")
                        
            except json.JSONDecodeError:
                continue
            except Exception as e:
                print(f"Error processing entry: {e}")
                continue
    
    print(f"Found {len(valid_entries)} entries matching criteria")
    
    # If we have more entries than needed, randomly sample
    if len(valid_entries) > max_entries:
        print(f"Randomly sampling {max_entries} entries from {len(valid_entries)} valid entries")
        valid_entries = random.sample(valid_entries, max_entries)
    
    # Write the filtered entries to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in valid_entries:
            f.write(json.dumps(entry) + '\n')
    
    print(f"Saved {len(valid_entries)} entries to {output_file}")
    
    # Print some statistics
    word_counts = [entry['word_count'] for entry in valid_entries]
    if word_counts:
        avg_word_count = sum(word_counts) / len(word_counts)
        print(f"Average word count: {avg_word_count:.2f}")
        print(f"Min word count: {min(word_counts)}")
        print(f"Max word count: {max(word_counts)}")

In [16]:
# Function to convert JSONL to CSV
def convert_to_csv(jsonl_file, csv_file):
    """Convert a JSONL file to CSV format"""
    import csv
    import pandas as pd
    
    # Read JSONL into a list of dictionaries
    data = []
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    
    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Removing the question column
    df = df.drop(columns=['question'])
    
    # Save to CSV
    df.to_csv(csv_file, index=False, quoting=csv.QUOTE_ALL)
    print(f"Converted {len(data)} entries to CSV format")
    print(f"Saved to {csv_file}")
    
    return df

In [None]:
# Example usage in a Jupyter Notebook

# Configuration parameters
input_file = "Quora.jsonl"  # Replace with your input file path
output_file = "filtered_quora.jsonl"  # Replace with desired output file path
min_words = 100  # Minimum word count
max_words = 120  # Maximum word count
max_entries = 1000  # Maximum number of entries to collect

# Run the filtering process
filter_dataset(
    input_file=input_file,
    output_file=output_file,
    min_words=min_words,
    max_words=max_words,
    max_entries=max_entries
)


Processing 56402 entries from Quora.jsonl...


Filtering entries:   5%|▍         | 2619/56402 [00:34<08:13, 108.91it/s]

Found 100 valid entries so far


Filtering entries:   8%|▊         | 4561/56402 [01:02<09:28, 91.20it/s] 

Found 200 valid entries so far


Filtering entries:  12%|█▏        | 6842/56402 [01:24<07:35, 108.78it/s]

Found 300 valid entries so far


Filtering entries:  16%|█▌        | 9036/56402 [01:59<04:17, 184.23it/s]

Found 400 valid entries so far


Filtering entries:  19%|█▉        | 10889/56402 [02:22<10:18, 73.54it/s] 

Found 500 valid entries so far


Filtering entries:  23%|██▎       | 12887/56402 [02:57<07:54, 91.79it/s] 

Found 600 valid entries so far


Filtering entries:  26%|██▌       | 14630/56402 [03:30<16:20, 42.62it/s] 

Found 700 valid entries so far


Filtering entries:  29%|██▉       | 16570/56402 [04:04<09:08, 72.64it/s] 

Found 800 valid entries so far


Filtering entries:  33%|███▎      | 18729/56402 [04:44<09:45, 64.38it/s] 

Found 900 valid entries so far


Filtering entries:  37%|███▋      | 20741/56402 [05:13<08:50, 67.20it/s] 

Found 1000 valid entries so far


Filtering entries:  40%|████      | 22784/56402 [05:40<05:18, 105.57it/s]

Found 1100 valid entries so far


Filtering entries:  44%|████▍     | 24760/56402 [06:15<04:00, 131.66it/s]

Found 1200 valid entries so far


Filtering entries:  47%|████▋     | 26527/56402 [06:52<08:31, 58.41it/s] 

Found 1300 valid entries so far


Filtering entries:  50%|█████     | 28280/56402 [07:15<03:46, 124.00it/s]

Found 1400 valid entries so far


Filtering entries:  54%|█████▎    | 30259/56402 [07:43<02:59, 145.56it/s]

Found 1500 valid entries so far


Filtering entries:  57%|█████▋    | 32396/56402 [08:17<05:27, 73.32it/s] 

Found 1600 valid entries so far


Filtering entries:  61%|██████    | 34330/56402 [08:46<03:38, 101.08it/s]

Found 1700 valid entries so far


Filtering entries:  65%|██████▍   | 36439/56402 [09:19<02:11, 152.22it/s]

Found 1800 valid entries so far


Filtering entries:  68%|██████▊   | 38394/56402 [09:41<02:51, 104.91it/s]

Found 1900 valid entries so far


Filtering entries:  72%|███████▏  | 40418/56402 [10:13<05:46, 46.11it/s] 

Found 2000 valid entries so far


Filtering entries:  76%|███████▌  | 42834/56402 [10:52<05:17, 42.74it/s] 

Found 2100 valid entries so far


Filtering entries:  80%|███████▉  | 44972/56402 [11:23<02:36, 73.07it/s] 

Found 2200 valid entries so far


Filtering entries:  83%|████████▎ | 47005/56402 [12:00<02:30, 62.26it/s] 

Found 2300 valid entries so far


Filtering entries:  87%|████████▋ | 48911/56402 [12:27<00:38, 192.38it/s]

Found 2400 valid entries so far


Filtering entries:  90%|█████████ | 50934/56402 [12:57<01:06, 82.59it/s] 

Found 2500 valid entries so far


Filtering entries:  94%|█████████▎| 52738/56402 [13:25<00:20, 175.59it/s]

Found 2600 valid entries so far


Filtering entries:  98%|█████████▊| 55097/56402 [14:09<00:16, 79.16it/s] 

Found 2700 valid entries so far


Filtering entries: 100%|██████████| 56402/56402 [14:30<00:00, 64.78it/s] 

Found 2760 entries matching criteria
Randomly sampling 1000 entries from 2760 valid entries
Saved 1000 entries to filtered_quora.jsonl
Average word count: 109.92
Min word count: 100
Max word count: 120
Converted 1000 entries to CSV format
Saved to filtered_quora.csv





In [17]:
csv_df = convert_to_csv(output_file, "filtered_quora.csv")

Converted 1000 entries to CSV format
Saved to filtered_quora.csv
