# Filter Dataset

This notebook filters the dataset down to N = 1000 instances by randomly selecting 420 questions and 580 statements.

In [None]:
import pandas as pd
import re

In [None]:
# Filters a dataset of ATC (Air Traffic Control) communications by word count
# and flags each remaining clip as a potential question or potential statement.
def filter_atc_dataset(csv_path, output_csv_path, min_words=5):
    """Filter ATC transcripts by length and flag likely questions.

    Reads the CSV at ``csv_path`` (which must contain a ``text`` column),
    keeps the rows whose transcript has at least ``min_words``
    whitespace-separated words, adds the columns ``word_count``,
    ``potential_question`` and ``potential_statement``, writes the result to
    ``output_csv_path`` and returns it.

    The question flag is a regex heuristic and still requires manual review;
    ``potential_statement`` is simply its negation.

    Args:
        csv_path: Path of the input CSV file.
        output_csv_path: Path the filtered CSV is written to (without index).
        min_words: Minimum number of words a transcript needs to be kept.

    Returns:
        The filtered ``pandas.DataFrame`` (original row index preserved).

    Raises:
        KeyError: If the input CSV has no ``text`` column.
    """
    df = pd.read_csv(csv_path)

    print(f"Original dataset size: {len(df)} clips")

    # Add a column for word count. A missing transcript has zero words; without
    # the NaN check it would be stringified to "nan" and counted as one word.
    df['word_count'] = df['text'].apply(
        lambda x: len(str(x).split()) if pd.notna(x) else 0
    )

    # Filter by word count. The explicit copy makes the result an independent
    # frame, so the column assignments below do not write into a slice of `df`
    # (which is what triggers pandas' SettingWithCopyWarning).
    df_filtered = df[df['word_count'] >= min_words].copy()
    print(f"After word count filtering: {len(df_filtered)} clips")

    # Identify potential questions (still requires manual review).
    # NOTE: the bare question-mark pattern already matches every text that the
    # two '?'-terminated patterns below would match; they are kept as written.
    question_patterns = [
        r'\?',                    # Question mark
        r'\b(?:what|where|when|why|who|how|which)\b',  # WH-questions
        r'\b(?:do|does|did|is|are|was|were|have|has|had|can|could|will|would|should|may|might)\s+(?:\w+\s+)+\?',  # Yes/no questions
        r'\b(?:right|correct|copy|roger)\?',  # Common ATC confirmation questions
        r'say again',             # Common in ATC for clarification
        r'request',               # Often indicates a question in ATC context
        r'confirm',               # Confirmation requests
    ]

    # Combine the alternatives and compile once, outside the per-row lambda.
    question_regex = re.compile('|'.join(question_patterns))

    # Mark potential questions. Text is lower-cased because the patterns are
    # written in lower case. astype(bool) keeps the column boolean even when
    # the frame is empty, so the negation below is always a logical NOT.
    df_filtered['potential_question'] = df_filtered['text'].apply(
        lambda x: bool(question_regex.search(str(x).lower())) if pd.notna(x) else False
    ).astype(bool)

    # Mark potential statements: everything that was not flagged as a question.
    df_filtered['potential_statement'] = ~df_filtered['potential_question']

    # Save filtered dataset
    df_filtered.to_csv(output_csv_path, index=False)

    print(f"Potential questions: {df_filtered['potential_question'].sum()}")
    print(f"Potential statements: {df_filtered['potential_statement'].sum()}")

    return df_filtered

In [None]:
# Input and output locations -- point these at the real files before running.
csv_path = "train_data.csv"
output_path = "filtered_train_data.csv"

# Run the filter, keeping every clip that has at least two words.
filtered_df = filter_atc_dataset(csv_path=csv_path, output_csv_path=output_path, min_words=2)

In [None]:
# Target composition of the final dataset: 420 questions + 580 statements = 1000 rows.
N_QUESTIONS = 420
N_STATEMENTS = 580

df = pd.read_csv('filtered_train_data.csv')

questions = df[df['potential_question']]

statements = df[df['potential_statement']]

# Cap the questions at the target count. Sampling only happens when there are
# more flagged questions than needed, so a file with exactly N_QUESTIONS
# questions is passed through untouched.
if len(questions) > N_QUESTIONS:
    questions = questions.sample(n=N_QUESTIONS, random_state=23)

# DataFrame.sample raises ValueError when n exceeds the population (without
# replacement); fail early with a message that names the actual shortfall.
if len(statements) < N_STATEMENTS:
    raise ValueError(
        f"Need {N_STATEMENTS} statements to sample from, but only "
        f"{len(statements)} are available in 'filtered_train_data.csv'"
    )

# Randomly sample the statements
statements_sampled = statements.sample(n=N_STATEMENTS, random_state=23)  # Set random_state for reproducibility

# Concatenate the questions and the sampled statements
df_filtered = pd.concat([questions, statements_sampled])

# Shuffle the final dataframe so questions and statements are interleaved
df_filtered = df_filtered.sample(frac=1, random_state=42).reset_index(drop=True)

# add blank label column for now
df_filtered['label'] = None

print(df_filtered)

# The output file name promises 1000 rows; say so loudly if that is not the case
# (this happens when fewer than N_QUESTIONS questions were flagged).
if len(df_filtered) != N_QUESTIONS + N_STATEMENTS:
    print(
        f"WARNING: expected {N_QUESTIONS + N_STATEMENTS} rows but got {len(df_filtered)} "
        f"({len(questions)} questions, {len(statements_sampled)} statements)"
    )

# Save the final filtered dataset
df_filtered.to_csv('N_1000_filtered_train_data.csv', index=False)