# IMPORTING LIBRARIES

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer # Or PorterStemmer if you prefer
from nltk.tokenize import word_tokenize
import string
# import re # Not strictly needed with current punctuation removal method
import time
import os # To create output directory if needed


# Downloading necessary NLTK data

In [2]:

# Fetch the NLTK resources needed by the tokenizer, stop-word list and
# lemmatizer imported above. The calls are harmless to repeat: the captured
# output below shows NLTK reporting packages that are already up-to-date.
# NOTE(review): some NLTK releases may require additional resources
# (e.g. 'punkt_tab' for word_tokenize) — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# DATA PREPROCESSING

In [3]:
# --- Configuration ---
# !! IMPORTANT: Set these paths correctly before running !!
# Input CSV files; load_data() requires each to contain 'text' and 'title' columns.
TRAIN_FILE_PATH = '/kaggle/input/assignment2nlp/train.csv' # <--- CHANGE THIS
TEST_FILE_PATH = '/kaggle/input/assignment2nlp/test.csv'   # <--- CHANGE THIS

# Number of rows sampled out of the training file to form the validation set.
VALIDATION_SET_SIZE = 500
OUTPUT_DIR = 'processed_data_A' # Directory to save outputs for Part A
# Destination CSV paths (inside OUTPUT_DIR) for the processed train / validation / test splits.
PROCESSED_TRAIN_FILE = os.path.join(OUTPUT_DIR, 'train_processed.csv')
PROCESSED_VAL_FILE = os.path.join(OUTPUT_DIR, 'validation_processed.csv')
PROCESSED_TEST_FILE = os.path.join(OUTPUT_DIR, 'test_processed.csv')



def load_data(filepath):
    """Load a CSV file into a pandas DataFrame and normalise its text columns.

    Rows with a missing 'text' or 'title' value are dropped and both columns
    are cast to ``str``. The index is left as produced by reading and
    dropping rows (it is not reset).

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        pandas.DataFrame whose 'text' and 'title' columns contain no NaN
        values and hold strings.

    Raises:
        SystemExit: With exit status 1 when the file does not exist, cannot
            be read, or lacks the 'text' / 'title' columns. A message
            describing the problem is printed first.
    """
    print(f"Loading data from {filepath}...")
    if not os.path.exists(filepath):
        print(f"Error: File not found at {filepath}.")
        print("Please ensure the TRAIN_FILE_PATH and TEST_FILE_PATH variables are set correctly.")
        # Raise SystemExit explicitly instead of calling the interactive
        # `exit()` helper: that helper is injected by `site` (and replaced by
        # IPython/Jupyter with its own object), and a bare `exit()` reports
        # success (status 0) even though loading failed.
        raise SystemExit(1)
    try:
        df = pd.read_csv(filepath)
        # Ensure 'text' and 'title' columns exist and handle potential NaNs
        if 'text' not in df.columns or 'title' not in df.columns:
            raise ValueError("CSV file must contain 'text' and 'title' columns.")
        # Chained, non-mutating form: drop incomplete rows, then force both
        # columns to str so downstream string operations never see other types.
        df = df.dropna(subset=['text', 'title']).astype({'text': str, 'title': str})
        print(f"Loaded {len(df)} rows.")
        return df
    except Exception as e:
        print(f"Error loading data from {filepath}: {e}")
        # Keep the original failure attached as the cause for debugging.
        raise SystemExit(1) from e

def create_validation_set(df, val_size, random_seed=42):
    """Split a DataFrame into disjoint training and validation sets.

    The validation rows are drawn at random (without replacement) and the
    remaining rows form the training set. Both returned frames have a fresh
    0..n-1 index.

    Args:
        df: Source DataFrame to split.
        val_size: Number of rows to place in the validation set. If it is
            greater than or equal to ``len(df)`` it is reduced to
            ``len(df) - 1`` and a warning is printed.
        random_seed: Seed passed to ``DataFrame.sample`` so the split is
            reproducible.

    Returns:
        Tuple ``(train_df, val_df)``.

    Raises:
        ValueError: If ``val_size`` had to be reduced and the dataset is too
            small to leave at least one validation row.
    """
    print(f"Creating validation set of size {val_size}...")
    total_rows = len(df)
    if val_size >= total_rows:
        # The clamp below keeps a single row for training and moves every
        # other row to validation, so the message states exactly that.
        print(f"Warning: Validation set size ({val_size}) is >= dataset size ({total_rows}). Using all but 1 sample for validation.")
        val_size = total_rows - 1
        if val_size <= 0:
            raise ValueError("Cannot create validation set. Dataset is too small.")

    # Work on a positional (unique) index. `drop` removes rows by label, so
    # with a non-unique index it would discard every row sharing a label with
    # a sampled row and silently shrink the training set. Sampling depends
    # only on the row count and the seed, so the same rows are selected as
    # when sampling `df` directly.
    positional = df.reset_index(drop=True)
    val_df = positional.sample(n=val_size, random_state=random_seed)
    train_df = positional.drop(val_df.index).reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    print(f"New training set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    return train_df, val_df

# Build the lemmatizer and the English stop-word set once at module level so
# preprocess_text() can reuse them on every call instead of recreating them.
# The stop words are stored in a set, which keeps the per-token membership test cheap.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Custom, corpus-specific stop words can be added here if any are identified, e.g.:
# stop_words.update(['wikipedia', 'article', 'references', 'external', 'links'])

def preprocess_text(text):
    """Normalise one text string into a space-separated string of lemmas.

    Pipeline: lowercase -> drop non-ASCII characters -> strip punctuation ->
    tokenize -> discard unwanted tokens -> lemmatize the survivors -> join.

    Args:
        text: The raw string to clean. Any non-string value yields "".

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing survives).
    """
    # Anything that is not a string is treated as empty input.
    if not isinstance(text, str):
        return ""

    # Lowercase, then silently discard every character outside ASCII.
    ascii_only = text.lower().encode('ascii', 'ignore').decode('ascii', 'ignore')

    # Delete all characters listed in string.punctuation.
    without_punct = ascii_only.translate(str.maketrans('', '', string.punctuation))

    kept = []
    for token in word_tokenize(without_punct):
        # Skip single characters, tokens that are not purely alphabetic, and
        # stop words. Filtering is applied to the raw token, before lemmatizing.
        if len(token) <= 1 or not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)

def apply_preprocessing_to_df(df):
    """Run preprocess_text over the 'text' and 'title' columns of a DataFrame.

    The input frame is not modified; the processed values are collected in a
    new frame. Rows whose processed text or processed title is missing or
    blank are removed, and the number of removed rows is reported.

    Args:
        df: DataFrame containing 'text' and 'title' columns.

    Returns:
        A new DataFrame with exactly two columns, 'processed_text' and
        'processed_title', and a fresh 0..n-1 index.
    """
    # Only the optional tqdm import is guarded. Keeping the actual processing
    # outside the try block means an ImportError raised while processing is
    # not mistaken for "tqdm is missing" and silently retried.
    try:
        from tqdm.auto import tqdm
        tqdm.pandas()
        use_tqdm = True
    except ImportError:
        use_tqdm = False
    progress_note = " (using tqdm)" if use_tqdm else ""

    processed_columns = {}
    for column in ('text', 'title'):
        print(f"Applying preprocessing to '{column}' column{progress_note}...")
        series = df[column]
        runner = series.progress_apply if use_tqdm else series.apply
        processed_columns[f'processed_{column}'] = runner(preprocess_text)

    # Build a separate frame so the caller's DataFrame gains no columns and
    # loses no rows as a side effect of this call.
    result = pd.DataFrame(processed_columns)

    # Optional: Report how many rows became empty after processing
    empty_text_count = (result['processed_text'] == "").sum()
    empty_title_count = (result['processed_title'] == "").sum()
    if empty_text_count > 0:
        print(f"Warning: {empty_text_count} rows have empty 'processed_text' after preprocessing.")
    if empty_title_count > 0:
        print(f"Warning: {empty_title_count} rows have empty 'processed_title' after preprocessing.")

    # Keep only rows where both processed text and title are non-empty
    original_len = len(result)
    result = result.dropna(subset=['processed_text', 'processed_title'])
    has_text = result['processed_text'].str.strip() != ""
    has_title = result['processed_title'].str.strip() != ""
    result = result[has_text & has_title].reset_index(drop=True)

    if len(result) < original_len:
        print(f"Removed {original_len - len(result)} rows due to empty processed text or title.")

    # Fix the column order of the returned frame.
    return result[['processed_text', 'processed_title']]


def save_data(df, filepath):
    """Write a DataFrame to a UTF-8 CSV file, creating its directory if needed.

    Saving is best-effort: any failure is reported with a printed message
    rather than raised.

    Args:
        df: DataFrame to write (the index is not written).
        filepath: Destination path. It may be a bare filename, in which case
            the file is written to the current working directory.
    """
    print(f"Saving data to {filepath}...")
    try:
        # Create the output directory if it doesn't exist. A bare filename has
        # an empty dirname, and os.makedirs('') raises, so only create a
        # directory when the path actually contains one.
        target_dir = os.path.dirname(filepath)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        df.to_csv(filepath, index=False, encoding='utf-8')
        print(f"Successfully saved {len(df)} rows.")
    except Exception as e:
        print(f"Error saving data to {filepath}: {e}")
# MAIN EXECUTION

In [4]:
# --- Main Execution Block ---
# Runs the whole Part A pipeline: load -> split -> preprocess -> save -> summary.
# Each stage is timed separately and the timings are printed as the run progresses.
if __name__ == "__main__":
    total_start_time = time.time()
    print("--- Starting Assignment 2: Part A ---")

    # Make sure the user has set the paths (guards against a leftover
    # 'path/to/your/' placeholder in the configuration).
    if 'path/to/your/' in TRAIN_FILE_PATH or 'path/to/your/' in TEST_FILE_PATH:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print("!!! ERROR: Please update TRAIN_FILE_PATH and TEST_FILE_PATH !!!")
        print("!!!        variables in the script before running.        !!!")
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        # Raise SystemExit explicitly rather than calling the interactive
        # `exit()` helper, which is not meant for programs and would report
        # success (status 0) for this error.
        raise SystemExit(1)


    # 1. Load Data
    load_split_start_time = time.time()
    train_df_raw = load_data(TRAIN_FILE_PATH)
    test_df_raw = load_data(TEST_FILE_PATH)

    # 2. Create Validation Set
    # The validation rows are taken out of the training data, so
    # train_df_raw is rebound to the reduced training set here.
    train_df_raw, validation_df_raw = create_validation_set(train_df_raw, VALIDATION_SET_SIZE)
    load_split_end_time = time.time()
    load_split_duration = load_split_end_time - load_split_start_time
    print(f"\nTime taken for data loading and splitting: {load_split_duration:.2f} seconds")

    # 3. Apply Preprocessing
    # Copies are passed so the raw frames are never modified by preprocessing.
    preprocessing_start_time = time.time()
    print("\nApplying preprocessing to Training set...")
    train_df_processed = apply_preprocessing_to_df(train_df_raw.copy())

    print("\nApplying preprocessing to Validation set...")
    validation_df_processed = apply_preprocessing_to_df(validation_df_raw.copy())

    print("\nApplying preprocessing to Test set...")
    test_df_processed = apply_preprocessing_to_df(test_df_raw.copy())
    preprocessing_end_time = time.time()
    preprocessing_duration = preprocessing_end_time - preprocessing_start_time
    print(f"\nTime taken for preprocessing all sets: {preprocessing_duration:.2f} seconds")

    # 4. Save Processed Data
    saving_start_time = time.time()
    print("\nSaving processed data...")
    save_data(train_df_processed, PROCESSED_TRAIN_FILE)
    save_data(validation_df_processed, PROCESSED_VAL_FILE)
    save_data(test_df_processed, PROCESSED_TEST_FILE)
    saving_end_time = time.time()
    saving_duration = saving_end_time - saving_start_time
    print(f"\nTime taken for saving processed data: {saving_duration:.2f} seconds")

    # --- Summary ---
    # Sizes are reported after preprocessing, i.e. after rows with empty
    # processed text or title have been removed.
    total_end_time = time.time()
    total_duration = total_end_time - total_start_time
    print("\n--- Part A Summary ---")
    print(f"Final Train Set Size (processed): {len(train_df_processed)}")
    print(f"Final Validation Set Size (processed): {len(validation_df_processed)}")
    print(f"Final Test Set Size (processed): {len(test_df_processed)}")
    print(f"Processed files saved in directory: '{OUTPUT_DIR}'")
    print(f"\nTotal execution time for Part A: {total_duration:.2f} seconds ({total_duration/60:.2f} minutes)")
    print("--- Part A Complete ---")

--- Starting Assignment 2: Part A ---
Loading data from /kaggle/input/assignment2nlp/train.csv...
Loaded 13879 rows.
Loading data from /kaggle/input/assignment2nlp/test.csv...
Loaded 100 rows.
Creating validation set of size 500...
New training set size: 13379
Validation set size: 500

Time taken for data loading and splitting: 5.18 seconds

Applying preprocessing to Training set...
Applying preprocessing to 'text' column (using tqdm)...


  0%|          | 0/13379 [00:00<?, ?it/s]

Applying preprocessing to 'title' column (using tqdm)...


  0%|          | 0/13379 [00:00<?, ?it/s]

Removed 136 rows due to empty processed text or title.

Applying preprocessing to Validation set...
Applying preprocessing to 'text' column (using tqdm)...


  0%|          | 0/500 [00:00<?, ?it/s]

Applying preprocessing to 'title' column (using tqdm)...


  0%|          | 0/500 [00:00<?, ?it/s]

Removed 5 rows due to empty processed text or title.

Applying preprocessing to Test set...
Applying preprocessing to 'text' column (using tqdm)...


  0%|          | 0/100 [00:00<?, ?it/s]

Applying preprocessing to 'title' column (using tqdm)...


  0%|          | 0/100 [00:00<?, ?it/s]


Time taken for preprocessing all sets: 160.81 seconds

Saving processed data...
Saving data to processed_data_A/train_processed.csv...
Successfully saved 13243 rows.
Saving data to processed_data_A/validation_processed.csv...
Successfully saved 495 rows.
Saving data to processed_data_A/test_processed.csv...
Successfully saved 100 rows.

Time taken for saving processed data: 3.81 seconds

--- Part A Summary ---
Final Train Set Size (processed): 13243
Final Validation Set Size (processed): 495
Final Test Set Size (processed): 100
Processed files saved in directory: 'processed_data_A'

Total execution time for Part A: 169.80 seconds (2.83 minutes)
--- Part A Complete ---


Print statements such as "Loading data..." are used to monitor which part of the code is currently running.
