In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import csv
import json
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK resources if needed
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

# Define science/health/nutrition related terms
TOPIC_KEYWORDS = {
    'science': [
        'science', 'research', 'experiment', 'theory', 'scientific', 'scientist', 'study', 
        'laboratory', 'evidence', 'hypothesis', 'analysis', 'discovery', 'biology', 'physics', 
        'chemistry', 'astronomy', 'geology'],
    'health': [
        'health', 'medical', 'medicine', 'disease', 'doctor', 'patient', 'treatment', 'cure', 
        'symptoms', 'diagnosis', 'therapy', 'wellness', 'illness', 'healthcare', 'hospital', 
        'clinic', 'recovery', 'prevention', 'condition', 'immune', 'surgery', 'mental health',
        'physical', 'wellbeing', 'disease', 'cancer', 'heart', 'brain', 'lungs'
    ],
    'nutrition': [
        'nutrition', 'diet', 'food', 'healthy eating', 'nutrient', 'vitamin', 'mineral', 
        'protein', 'carbohydrate', 'fat', 'calorie', 'metabolism', 'digestion', 'meal', 
        'vegetable', 'fruit', 'meat', 'dairy', 'weight', 'obesity', 'sugar', 'organic', 
        'supplement', 'fiber', 'hydration', 'fasting', 'macronutrient'
    ]
}

# Flatten the keywords list for easier checking
ALL_KEYWORDS = set()
for category_keywords in TOPIC_KEYWORDS.values():
    ALL_KEYWORDS.update(category_keywords)

def clean_text(text):
    """Clean and normalize text"""
    if not text:
        return ""
    # Replace newlines and multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters except punctuation
    text = re.sub(r'[^\w\s.,!?;:\'\"-]', '', text)
    return text.strip()

def count_words(text):
    """Count words in text"""
    if not text:
        return 0
    words = word_tokenize(text)
    return len(words)

def is_relevant_topic(text):
    """
    Check if the text is relevant to science, health, or nutrition
    using keyword matching
    """
    if not text:
        return False
    
    # Clean and lowercase the text
    clean_text_lower = text.lower()
    
    # Direct keyword matching
    for keyword in ALL_KEYWORDS:
        if keyword.lower() in clean_text_lower:
            return True
    
    return False

def process_csv_file(input_file, output_jsonl, output_csv, min_words=100, max_words=120, 
                   max_entries=2000, encoding='utf-8'):
    """
    Process the CSV file, filter entries based on criteria, and save output
    """
    print(f"Reading file: {input_file}")
    try:
        # Read the CSV file
        df = pd.read_csv(input_file, encoding=encoding)
        print(f"Successfully loaded {len(df)} entries from {input_file}")
        print(f"Column headers: {list(df.columns)}")
    except Exception as e:
        print(f"Error reading file {input_file}: {e}")
        return []
    
    # Filter and process entries
    valid_entries = []
    
    with open(output_jsonl, 'w', encoding='utf-8') as out_f:
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing entries"):
            try:
                # Get the text content and clean it
                content = str(row.get('content', ''))
                text = clean_text(content)
                
                # Skip if text is empty
                if not text:
                    continue
                
                # Count words (we'll calculate ourselves rather than using tokens)
                word_count = count_words(text)
                
                # Get the input query
                input_text = str(row.get('input', ''))
                input_text = clean_text(input_text)
                
                # Check if the text is relevant to our topics
                is_relevant = is_relevant_topic(text)
                
                # Check if the entry meets our criteria
                if min_words <= word_count <= max_words and is_relevant:
                    # Create a new entry with the fields we want to keep
                    filtered_entry = {
                        'id': row.get('uuid', f"entry_{idx}"),
                        'content': text,
                        'word_count': word_count,
                        'input': input_text,
                        'source': row.get('repo_name', '')
                    }
                    
                    # Write directly to output JSONL file
                    out_f.write(json.dumps(filtered_entry) + '\n')
                    
                    # Append to our valid entries list
                    valid_entries.append(filtered_entry)
                    
                    # Print progress periodically
                    if len(valid_entries) % 10 == 0:
                        print(f"Found {len(valid_entries)} valid entries so far")
                    
                    # Stop if we've reached the desired number of entries
                    if len(valid_entries) >= max_entries:
                        print(f"Reached {max_entries} entries, stopping.")
                        break
            except Exception as e:
                print(f"Error processing entry {idx}: {e}")
                continue
    
    print(f"Found and saved {len(valid_entries)} entries matching criteria to {output_jsonl}")
    
    # Convert to DataFrame and save as CSV
    if valid_entries:
        valid_df = pd.DataFrame(valid_entries)
        valid_df = valid_df[['content', 'word_count']]
        valid_df.to_csv(output_csv, index=False, quoting=csv.QUOTE_ALL)
        print(f"Converted entries saved to {output_csv}")
        
        # Print some statistics
        word_counts = [entry['word_count'] for entry in valid_entries]
        if word_counts:
            avg_word_count = sum(word_counts) / len(word_counts)
            print(f"Average word count: {avg_word_count:.2f}")
            print(f"Min word count: {min(word_counts)}")
            print(f"Max word count: {max(word_counts)}")
            
            # Word count statistics
            word_count_df = pd.DataFrame({'word_count': word_counts})
            stats = word_count_df.describe()
            print("Word Count Statistics:")
            print(stats)
            
            # Plot histogram
            plt.figure(figsize=(10, 6))
            plt.hist(word_counts, bins=20, alpha=0.7)
            plt.title('Word Count Distribution')
            plt.xlabel('Word Count')
            plt.ylabel('Frequency')
            plt.grid(alpha=0.3)
            plt.savefig('word_count_distribution.png')
            plt.close()
            print("Word count distribution plot saved to 'word_count_distribution.png'")
            
            # Source distribution if available
            if 'source' in valid_df.columns:
                source_counts = valid_df['source'].value_counts()
                print("\nSource Distribution:")
                print(source_counts)
                
                # Plot source distribution
                plt.figure(figsize=(10, 6))
                source_counts.plot(kind='bar')
                plt.title('Source Distribution')
                plt.xlabel('Source')
                plt.ylabel('Count')
                plt.grid(axis='y', alpha=0.3)
                plt.savefig('source_distribution.png')
                plt.close()
                print("Source distribution plot saved to 'source_distribution.png'")
    
    return valid_entries

if __name__ == "__main__":
    # Configuration
    input_file = "train.csv"  # CSV input file (change to your actual filename)
    output_jsonl = "filtered_train.jsonl"  # Output JSONL file
    output_csv = "filtered_train.csv"  # Output CSV file
    min_words = 100  # Minimum word count
    max_words = 120  # Maximum word count
    max_entries = 2000  # Maximum number of entries to collect
    
    # Run the processor
    filtered_data = process_csv_file(
        input_file=input_file,
        output_jsonl=output_jsonl,
        output_csv=output_csv,
        min_words=min_words,
        max_words=max_words,
        max_entries=max_entries
    )
    
    print(f"Processing complete. {len(filtered_data)} entries filtered and saved.")

Reading file: train.csv
Successfully loaded 110000 entries from train.csv
Column headers: ['uuid', 'content', 'content_tokens_len', 'input', 'prompt_tokens_len', 'reasoning_content', 'reasoning_content_tokens_len', 'repo_name', 'score']


Processing entries:   1%|          | 1233/110000 [00:03<04:22, 414.65it/s]

Found 10 valid entries so far


Processing entries:   2%|▏         | 2045/110000 [00:05<04:16, 421.31it/s]

Found 20 valid entries so far


Processing entries:   3%|▎         | 2946/110000 [00:07<04:10, 427.58it/s]

Found 30 valid entries so far


Processing entries:   4%|▍         | 4349/110000 [00:11<04:23, 401.24it/s]

Found 40 valid entries so far


Processing entries:   5%|▍         | 5059/110000 [00:13<04:47, 365.44it/s]

Found 50 valid entries so far


Processing entries:   5%|▌         | 5821/110000 [00:15<04:26, 391.55it/s]

Found 60 valid entries so far


Processing entries:   6%|▋         | 6951/110000 [00:18<03:51, 444.47it/s]

Found 70 valid entries so far


Processing entries:   7%|▋         | 7754/110000 [00:20<03:39, 465.64it/s]

Found 80 valid entries so far


Processing entries:   8%|▊         | 8637/110000 [00:22<04:02, 418.51it/s]

Found 90 valid entries so far


Processing entries:   9%|▉         | 9854/110000 [00:24<03:38, 459.29it/s]

Found 100 valid entries so far


Processing entries:  10%|▉         | 10877/110000 [00:27<03:44, 442.49it/s]

Found 110 valid entries so far


Processing entries:  11%|█         | 11804/110000 [00:29<03:39, 446.88it/s]

Found 120 valid entries so far


Processing entries:  12%|█▏        | 12889/110000 [00:31<03:35, 450.76it/s]

Found 130 valid entries so far


Processing entries:  12%|█▏        | 13738/110000 [00:33<03:15, 492.30it/s]

Found 140 valid entries so far


Processing entries:  13%|█▎        | 14745/110000 [00:36<03:55, 404.00it/s]

Found 150 valid entries so far


Processing entries:  14%|█▍        | 15886/110000 [00:39<04:17, 365.35it/s]

Found 160 valid entries so far


Processing entries:  16%|█▌        | 17203/110000 [00:42<03:43, 414.71it/s]

Found 170 valid entries so far


Processing entries:  17%|█▋        | 18420/110000 [00:45<03:16, 465.61it/s]

Found 180 valid entries so far


Processing entries:  17%|█▋        | 19039/110000 [00:46<03:20, 452.89it/s]

Found 190 valid entries so far


Processing entries:  18%|█▊        | 20109/110000 [00:49<03:23, 440.65it/s]

Found 200 valid entries so far


Processing entries:  19%|█▉        | 21119/110000 [00:51<03:30, 421.96it/s]

Found 210 valid entries so far


Processing entries:  20%|██        | 22384/110000 [00:54<03:17, 443.46it/s]

Found 220 valid entries so far


Processing entries:  21%|██        | 22987/110000 [00:55<03:26, 421.26it/s]

Found 230 valid entries so far


Processing entries:  21%|██▏       | 23588/110000 [00:57<03:17, 437.77it/s]

Found 240 valid entries so far


Processing entries:  23%|██▎       | 24978/110000 [01:00<02:42, 523.83it/s]

Found 250 valid entries so far


Processing entries:  24%|██▎       | 25898/110000 [01:02<02:51, 489.96it/s]

Found 260 valid entries so far


Processing entries:  25%|██▍       | 26974/110000 [01:04<03:10, 435.16it/s]

Found 270 valid entries so far


Processing entries:  25%|██▌       | 27651/110000 [01:06<03:01, 453.78it/s]

Found 280 valid entries so far


Processing entries:  26%|██▌       | 28182/110000 [01:07<03:12, 424.32it/s]

Found 290 valid entries so far


Processing entries:  27%|██▋       | 29404/110000 [01:10<02:52, 466.21it/s]

Found 300 valid entries so far


Processing entries:  27%|██▋       | 30242/110000 [01:12<02:43, 487.16it/s]

Found 310 valid entries so far


Processing entries:  28%|██▊       | 31195/110000 [01:14<03:03, 429.37it/s]

Found 320 valid entries so far


Processing entries:  29%|██▉       | 32149/110000 [01:16<02:38, 490.29it/s]

Found 330 valid entries so far


Processing entries:  30%|███       | 33307/110000 [01:19<02:46, 461.54it/s]

Found 340 valid entries so far


Processing entries:  31%|███       | 33827/110000 [01:20<02:56, 431.45it/s]

Found 350 valid entries so far


Processing entries:  32%|███▏      | 35072/110000 [01:23<03:06, 400.92it/s]

Found 360 valid entries so far


Processing entries:  33%|███▎      | 35779/110000 [01:25<02:46, 445.72it/s]

Found 370 valid entries so far


Processing entries:  33%|███▎      | 36654/110000 [01:26<02:26, 500.79it/s]

Found 380 valid entries so far


Processing entries:  34%|███▍      | 37717/110000 [01:29<02:30, 480.38it/s]

Found 390 valid entries so far


Processing entries:  35%|███▌      | 38933/110000 [01:31<02:31, 470.43it/s]

Found 400 valid entries so far


Processing entries:  37%|███▋      | 40363/110000 [01:34<02:17, 505.47it/s]

Found 410 valid entries so far


Processing entries:  38%|███▊      | 41467/110000 [01:37<02:34, 442.38it/s]

Found 420 valid entries so far


Processing entries:  39%|███▊      | 42526/110000 [01:39<02:21, 476.45it/s]

Found 430 valid entries so far


Processing entries:  40%|███▉      | 43652/110000 [01:41<02:15, 491.44it/s]

Found 440 valid entries so far


Processing entries:  40%|████      | 44184/110000 [01:43<02:27, 444.88it/s]

Found 450 valid entries so far


Processing entries:  41%|████      | 44775/110000 [01:44<02:29, 435.76it/s]

Found 460 valid entries so far


Processing entries:  41%|████▏     | 45635/110000 [01:46<02:46, 386.07it/s]

Found 470 valid entries so far


Processing entries:  43%|████▎     | 47016/110000 [01:49<02:05, 502.26it/s]

Found 480 valid entries so far


Processing entries:  44%|████▎     | 47876/110000 [01:51<02:16, 454.19it/s]

Found 490 valid entries so far


Processing entries:  44%|████▍     | 48707/110000 [01:53<02:06, 485.49it/s]

Found 500 valid entries so far


Processing entries:  45%|████▌     | 49650/110000 [01:55<02:16, 440.89it/s]

Found 510 valid entries so far


Processing entries:  46%|████▌     | 50181/110000 [01:56<02:05, 477.63it/s]

Found 520 valid entries so far


Processing entries:  47%|████▋     | 51431/110000 [01:59<02:17, 427.26it/s]

Found 530 valid entries so far


Processing entries:  48%|████▊     | 52267/110000 [02:00<01:51, 518.38it/s]

Found 540 valid entries so far


Processing entries:  49%|████▊     | 53396/110000 [02:03<02:05, 449.92it/s]

Found 550 valid entries so far


Processing entries:  49%|████▉     | 53927/110000 [02:04<01:54, 489.99it/s]

Found 560 valid entries so far


Processing entries:  50%|████▉     | 54941/110000 [02:06<02:04, 443.92it/s]

Found 570 valid entries so far


Processing entries:  50%|█████     | 55381/110000 [02:07<01:56, 468.22it/s]

Found 580 valid entries so far


Processing entries:  51%|█████     | 56238/110000 [02:09<01:50, 485.34it/s]

Found 590 valid entries so far


Processing entries:  52%|█████▏    | 57157/110000 [02:11<01:52, 469.45it/s]

Found 600 valid entries so far


Processing entries:  53%|█████▎    | 58172/110000 [02:13<02:02, 424.69it/s]

Found 610 valid entries so far


Processing entries:  54%|█████▍    | 59184/110000 [02:16<01:53, 448.32it/s]

Found 620 valid entries so far


Processing entries:  55%|█████▍    | 60391/110000 [02:18<01:33, 528.05it/s]

Found 630 valid entries so far


Processing entries:  56%|█████▌    | 61102/110000 [02:20<01:53, 431.22it/s]

Found 640 valid entries so far


Processing entries:  56%|█████▌    | 61868/110000 [02:21<01:37, 494.29it/s]

Found 650 valid entries so far


Processing entries:  57%|█████▋    | 62728/110000 [02:23<01:54, 413.92it/s]

Found 660 valid entries so far


Processing entries:  58%|█████▊    | 63867/110000 [02:26<01:41, 452.98it/s]

Found 670 valid entries so far


Processing entries:  59%|█████▊    | 64448/110000 [02:27<01:36, 472.99it/s]

Found 680 valid entries so far


Processing entries:  60%|█████▉    | 65507/110000 [02:29<01:29, 497.22it/s]

Found 690 valid entries so far


Processing entries:  60%|██████    | 66038/110000 [02:30<01:23, 528.66it/s]

Found 700 valid entries so far


Processing entries:  61%|██████    | 67009/110000 [02:32<01:28, 487.33it/s]

Found 710 valid entries so far


Processing entries:  62%|██████▏   | 67935/110000 [02:35<01:31, 460.50it/s]

Found 720 valid entries so far


Processing entries:  62%|██████▏   | 68516/110000 [02:36<01:26, 478.17it/s]

Found 730 valid entries so far


Processing entries:  63%|██████▎   | 69062/110000 [02:37<01:23, 489.50it/s]

Found 740 valid entries so far


Processing entries:  64%|██████▍   | 70258/110000 [02:39<01:22, 483.91it/s]

Found 750 valid entries so far


Processing entries:  65%|██████▍   | 71315/110000 [02:42<01:14, 517.06it/s]

Found 760 valid entries so far


Processing entries:  66%|██████▌   | 72335/110000 [02:44<01:20, 468.66it/s]

Found 770 valid entries so far


Processing entries:  67%|██████▋   | 73247/110000 [02:46<01:17, 472.73it/s]

Found 780 valid entries so far


Processing entries:  68%|██████▊   | 74394/110000 [02:49<01:06, 536.89it/s]

Found 790 valid entries so far


Processing entries:  69%|██████▊   | 75436/110000 [02:51<01:12, 474.50it/s]

Found 800 valid entries so far


Processing entries:  69%|██████▉   | 76399/110000 [02:53<01:12, 461.07it/s]

Found 810 valid entries so far


Processing entries:  70%|███████   | 77097/110000 [02:55<01:12, 454.98it/s]

Found 820 valid entries so far


Processing entries:  71%|███████   | 78115/110000 [02:57<01:04, 496.88it/s]

Found 830 valid entries so far


Processing entries:  72%|███████▏  | 79030/110000 [02:59<01:06, 464.33it/s]

Found 840 valid entries so far


Processing entries:  72%|███████▏  | 79631/110000 [03:00<01:05, 461.40it/s]

Found 850 valid entries so far


Processing entries:  73%|███████▎  | 80630/110000 [03:02<01:05, 447.45it/s]

Found 860 valid entries so far


Processing entries:  74%|███████▍  | 81876/110000 [03:05<01:10, 399.05it/s]

Found 870 valid entries so far


Processing entries:  75%|███████▌  | 82875/110000 [03:07<00:58, 464.39it/s]

Found 880 valid entries so far


Processing entries:  76%|███████▌  | 83551/110000 [03:09<01:00, 434.79it/s]

Found 890 valid entries so far


Processing entries:  77%|███████▋  | 84365/110000 [03:11<00:56, 449.87it/s]

Found 900 valid entries so far


Processing entries:  78%|███████▊  | 85745/110000 [03:14<00:50, 478.55it/s]

Found 910 valid entries so far


Processing entries:  79%|███████▊  | 86437/110000 [03:16<00:52, 447.96it/s]

Found 920 valid entries so far


Processing entries:  80%|███████▉  | 87595/110000 [03:18<00:54, 414.47it/s]

Found 930 valid entries so far


Processing entries:  80%|████████  | 88409/110000 [03:20<00:54, 397.66it/s]

Found 940 valid entries so far


Processing entries:  82%|████████▏ | 89839/110000 [03:24<00:41, 483.41it/s]

Found 950 valid entries so far


Processing entries:  82%|████████▏ | 90468/110000 [03:25<00:38, 513.13it/s]

Found 960 valid entries so far


Processing entries:  83%|████████▎ | 91077/110000 [03:26<00:38, 487.27it/s]

Found 970 valid entries so far


Processing entries:  84%|████████▎ | 91933/110000 [03:28<00:38, 463.53it/s]

Found 980 valid entries so far


Processing entries:  84%|████████▍ | 92677/110000 [03:30<00:38, 447.54it/s]

Found 990 valid entries so far


Processing entries:  85%|████████▌ | 93605/110000 [03:32<00:35, 455.51it/s]

Found 1000 valid entries so far


Processing entries:  86%|████████▌ | 94816/110000 [03:35<00:32, 461.92it/s]

Found 1010 valid entries so far


Processing entries:  87%|████████▋ | 95492/110000 [03:36<00:29, 489.28it/s]

Found 1020 valid entries so far


Processing entries:  87%|████████▋ | 96153/110000 [03:37<00:27, 495.51it/s]

Found 1030 valid entries so far


Processing entries:  88%|████████▊ | 96732/110000 [03:39<00:29, 448.71it/s]

Found 1040 valid entries so far


Processing entries:  89%|████████▊ | 97493/110000 [03:40<00:25, 500.20it/s]

Found 1050 valid entries so far


Processing entries:  89%|████████▉ | 98160/110000 [03:42<00:26, 446.14it/s]

Found 1060 valid entries so far


Processing entries:  90%|████████▉ | 98706/110000 [03:43<00:23, 490.57it/s]

Found 1070 valid entries so far


Processing entries:  91%|█████████ | 99633/110000 [03:45<00:21, 490.73it/s]

Found 1080 valid entries so far


Processing entries:  91%|█████████ | 100296/110000 [03:46<00:19, 493.72it/s]

Found 1090 valid entries so far


Processing entries:  92%|█████████▏| 100913/110000 [03:48<00:19, 463.66it/s]

Found 1100 valid entries so far


Processing entries:  93%|█████████▎| 101986/110000 [03:50<00:16, 494.36it/s]

Found 1110 valid entries so far


Processing entries:  93%|█████████▎| 102543/110000 [03:51<00:15, 486.81it/s]

Found 1120 valid entries so far


Processing entries:  94%|█████████▍| 103386/110000 [03:53<00:13, 485.90it/s]

Found 1130 valid entries so far


Processing entries:  95%|█████████▍| 104192/110000 [03:55<00:12, 467.07it/s]

Found 1140 valid entries so far


Processing entries:  95%|█████████▌| 104943/110000 [03:56<00:11, 454.50it/s]

Found 1150 valid entries so far


Processing entries:  96%|█████████▌| 105625/110000 [03:58<00:10, 432.87it/s]

Found 1160 valid entries so far


Processing entries:  97%|█████████▋| 106610/110000 [04:00<00:07, 478.27it/s]

Found 1170 valid entries so far


Processing entries:  98%|█████████▊| 107951/110000 [04:03<00:04, 470.02it/s]

Found 1180 valid entries so far


Processing entries:  99%|█████████▉| 108755/110000 [04:05<00:02, 479.64it/s]

Found 1190 valid entries so far


Processing entries: 100%|█████████▉| 109621/110000 [04:07<00:01, 314.58it/s]

Found 1200 valid entries so far


Processing entries: 100%|██████████| 110000/110000 [04:08<00:00, 441.83it/s]


Found and saved 1201 entries matching criteria to filtered_train.jsonl
Converted entries saved to filtered_train.csv
Average word count: 110.00
Min word count: 100
Max word count: 120
Word Count Statistics:
        word_count
count  1201.000000
mean    110.000833
std       5.971111
min     100.000000
25%     105.000000
50%     110.000000
75%     115.000000
max     120.000000
Word count distribution plot saved to 'word_count_distribution.png'
Processing complete. 1201 entries filtered and saved.
