## Notebook 01: Data Preprocessing

This notebook uses the `load_and_preprocess_data` function from `src.preprocessing` to load raw data, apply cleaning and filtering, and save the processed output.

**Target Data:** Semantic Scholar data (combining 'title' and 'abstract').

In [1]:
# ## 1. Imports and Setup

import os
import sys
import pandas as pd
import logging

# --- Add src directory to Python path ---
# This allows importing modules from src. Adjust path if notebook is moved.
module_path = os.path.abspath(os.path.join('..')) 
if module_path not in sys.path:
    sys.path.append(module_path)
    print(f"Added {module_path} to sys.path")
else:
    print(f"{module_path} already in sys.path")

# --- Import the preprocessing function ---
try:
    from src.preprocessing import load_and_preprocess_data 
    print("Successfully imported 'load_and_preprocess_data' from src.preprocessing")
except ImportError as e:
    print(f"Error importing functions: {e}")
    print("Ensure the 'src' directory is in the Python path and preprocessing.py exists.")
except Exception as e:
    print(f"An unexpected error occurred during import: {e}")

# --- Configure Logging ---
# Basic logging setup for notebook visibility
# Use force=True to allow reconfiguring logging in Jupyter environment
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

Added g:\BERTopic_Modeling to sys.path
Successfully imported 'load_and_preprocess_data' from src.preprocessing


In [7]:
# ## 2. Load Configuration from YAML file

from src.utils import load_config

# --- Define paths ---
project_root_dir = os.path.abspath(os.path.join('..'))
config_path = os.path.join(project_root_dir, 'configs', 'reddit_config.yaml')

# --- Load configuration ---
try:
    config = load_config(config_path)
    logging.info(f"Successfully loaded configuration from {config_path}")
except FileNotFoundError as e:
    logging.error(e)
    config = None
except Exception as e:
    logging.error(f"An unexpected error occurred: {e}")
    config = None

if config:
    # --- Resolve relative paths to absolute paths ---
    # This makes the script runnable from any directory
    config['paths']['raw_data_file'] = os.path.join(project_root_dir, config['paths']['raw_data_file'])
    config['paths']['processed_data_output_file'] = os.path.join(project_root_dir, config['paths']['processed_data_output_file'])
    config['paths']['dropped_rows_output_file'] = os.path.join(project_root_dir, config['paths']['dropped_rows_output_file'])

    logging.info(f"Raw data file path: {config['paths']['raw_data_file']}")
    logging.info(f"Processed data output file path: {config['paths']['processed_data_output_file']}")
    logging.info(f"Dropped rows output file path: {config['paths']['dropped_rows_output_file']}")

2025-07-28 11:18:36,916 - INFO - Successfully loaded configuration from: g:\BERTopic_Modeling\configs\reddit_config.yaml
2025-07-28 11:18:36,917 - INFO - Successfully loaded configuration from g:\BERTopic_Modeling\configs\reddit_config.yaml
2025-07-28 11:18:36,917 - INFO - Raw data file path: g:\BERTopic_Modeling\data/raw/IFS_test.csv
2025-07-28 11:18:36,918 - INFO - Processed data output file path: g:\BERTopic_Modeling\data/processed/IFS_test_processed.csv
2025-07-28 11:18:36,918 - INFO - Dropped rows output file path: g:\BERTopic_Modeling\data/processed/IFS_test_dropped.csv


In [11]:
# ## 3. Run Preprocessing

if config:
    logging.info(f"Attempting to load and preprocess data from: {config['paths']['raw_data_file']}")
    
    processed_df = None
    
    try:
        processed_df = load_and_preprocess_data(
            file_path=config['paths']['raw_data_file'],
            text_source_columns=config['data_source']['text_source_columns'],
            unique_id_column=config['data_source']['unique_id_column'],
            required_columns_for_docs_creation=config['data_source']['required_columns_for_docs_creation'],
            dropped_rows_output_path=config['paths']['dropped_rows_output_file'],
            data_type_specific_df_processing=config['data_source']['data_type_specific_df_processing'],
            clean_apply_unescape=config['preprocessing']['cleaning']['apply_unescape'],
            clean_apply_url_removal=config['preprocessing']['cleaning']['apply_url_removal'],
            clean_apply_html_tag_removal=config['preprocessing']['cleaning']['apply_html_tag_removal'],
            clean_apply_quote_normalization=config['preprocessing']['cleaning']['apply_quote_normalization'],
            clean_apply_char_filtering=config['preprocessing']['cleaning']['apply_char_filtering'],
            clean_char_filter_regex=config['preprocessing']['cleaning']['char_filter_regex'],
            clean_apply_html_entity_removal=config['preprocessing']['cleaning']['apply_html_entity_removal'],
            clean_apply_lowercase=config['preprocessing']['cleaning']['apply_lowercase'],
            apply_length_filter=config['preprocessing']['filters']['apply_length_filter'],
            min_doc_length=config['preprocessing']['filters']['min_doc_length'],
            max_doc_length=config['preprocessing']['filters']['max_doc_length'],
            apply_duplicate_removal=config['preprocessing']['filters']['apply_duplicate_removal'],
            column_for_duplicate_checking=config['preprocessing']['filters']['column_for_duplicate_checking'],
            apply_score_filter=config['preprocessing']['filters']['apply_score_filter'],
            score_column_for_filtering=config['preprocessing']['filters']['score_column_for_filtering'],
            min_score_for_filtering=config['preprocessing']['filters']['min_score_for_filtering'],
            max_score_for_filtering=config['preprocessing']['filters']['max_score_for_filtering']
        )
    
        if processed_df is not None:
            logging.info(f"Preprocessing finished. Processed DataFrame shape: {processed_df.shape}")
            if not processed_df.empty:
                try:
                    output_dir = os.path.dirname(config['paths']['processed_data_output_file'])
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                        logging.info(f"Created output directory: {output_dir}")
                    processed_df.to_csv(config['paths']['processed_data_output_file'], index=False)
                    logging.info(f"Processed DataFrame saved to: {config['paths']['processed_data_output_file']}")
                except Exception as e:
                    logging.error(f"Error saving processed DataFrame: {e}")
            else:
                logging.info("Processed DataFrame is empty, not saving main output file.")
        else:
            logging.warning("Preprocessing did not return a DataFrame.")
    
    except FileNotFoundError as e:
        logging.error(f"Input file path error: {e}. Please ensure the path is correct in your config file.")
    except KeyError as e:
        logging.error(f"Configuration error: Missing key {e}. Please check your config file.")
    except Exception as e:
        logging.error(f"An unexpected error occurred during preprocessing: {e}", exc_info=True)
else:
    logging.error("Configuration not loaded. Cannot run preprocessing.")

2025-07-28 11:29:53,795 - INFO - Attempting to load and preprocess data from: g:\BERTopic_Modeling\data/raw/IFS_test.csv


Starting preprocessing for: g:\BERTopic_Modeling\data/raw/IFS_test.csv


2025-07-28 11:29:54,380 - INFO - Preprocessing finished. Processed DataFrame shape: (112156, 12)


Original dataset shape: (112156, 12)
Checking for missing values in required columns: ['selftext']
No rows dropped due to missing/empty values in required columns.
Error: Text source columns ['title', 'selftext'] not found in the (potentially filtered) DataFrame.


2025-07-28 11:29:55,059 - INFO - Processed DataFrame saved to: g:\BERTopic_Modeling\data/processed/IFS_test_processed.csv


In [9]:
# ## 4. Inspect Output

from IPython.display import display 

print("--- Main Processed DataFrame --- ")
if 'processed_df' in locals() and processed_df is not None and not processed_df.empty:
    # Use the path from the config dictionary
    output_file_path = config['paths']['processed_data_output_file']
    dropped_rows_path = config['paths']['dropped_rows_output_file']

    print(f"\nProcessed DataFrame Info ({output_file_path}):")
    processed_df.info()
    print("\nFirst 5 rows of processed data:")
    display(processed_df.head())
    if 'docs' in processed_df.columns:
        print("\nSample of 'docs' column (first 3 documents):")
        for i, doc in enumerate(processed_df['docs'].head(3)):
            print(f"Doc {i+1}: {doc[:200]}...") 
    print(f"\nOutput file should be at: {output_file_path}")
    print(f"Does output file exist? {os.path.exists(output_file_path)}")

elif 'processed_df' in locals() and processed_df is not None and processed_df.empty:
     print("\nPreprocessing resulted in an empty DataFrame. Check filters and source data.")
     # Use the path from the config dictionary
     output_file_path = config['paths']['processed_data_output_file']
     print(f"Output file path specified: {output_file_path}")
     print(f"Does (potentially empty) output file exist? {os.path.exists(output_file_path)}")
else:
    print("\nPreprocessing failed or DataFrame was not created/returned correctly.")
    # Use the path from the config dictionary
    if 'config' in locals() and config:
        print(f"Expected output file: {config['paths']['processed_data_output_file']}")


print("\n--- Dropped Rows (due to missing required columns) --- ")
# Use the path from the config dictionary
dropped_rows_path = config['paths']['dropped_rows_output_file']
if os.path.exists(dropped_rows_path):
    try:
        df_dropped_check = pd.read_csv(dropped_rows_path)
        print(f"Successfully loaded dropped rows file: {dropped_rows_path}")
        print(f"Number of rows dropped due to missing required columns: {len(df_dropped_check)}")
        if not df_dropped_check.empty:
            print("\nFirst 5 rows of dropped data:")
            display(df_dropped_check.head())
        else:
            print("The dropped rows file is empty.")
    except Exception as e:
        print(f"Error loading or inspecting dropped rows file {dropped_rows_path}: {e}")
else:
    print(f"Dropped rows file not found at: {dropped_rows_path}. This is expected if no rows were dropped.")

--- Main Processed DataFrame --- 

Processed DataFrame Info (g:\BERTopic_Modeling\data/processed/IFS_test_processed.csv):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112156 entries, 0 to 112155
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     112156 non-null  object 
 1   link_id                112156 non-null  object 
 2   author                 112156 non-null  object 
 3   created_utc            112156 non-null  float64
 4   subreddit              112156 non-null  object 
 5   body                   112156 non-null  object 
 6   score                  112156 non-null  int64  
 7   all_awardings          112155 non-null  object 
 8   gildings               112156 non-null  object 
 9   total_awards_received  112155 non-null  float64
 10  author_flair_text      0 non-null       float64
 11  author_flair_richtext  108899 non-null  object 
dtypes: float64(3), int64

Unnamed: 0,id,link_id,author,created_utc,subreddit,body,score,all_awardings,gildings,total_awards_received,author_flair_text,author_flair_richtext
0,efgpylf,t3_ah7kd8,hubblekeat,1548962000.0,InternalFamilySystems,"Hey there, I hope others can chime in and offe...",1,,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",,,[]
1,ej34hly,t3_b3yw8m,[deleted],1553218000.0,InternalFamilySystems,[removed],1,[],{},0.0,,
2,ejvolcw,t3_b7t3o6,NervousGuidance,1554133000.0,InternalFamilySystems,"I'd recommend ""Self-Therapy"" by Jay Earley. It...",2,[],{},0.0,,[]
3,ejvq37d,t3_b7t3o6,coquitam,1554134000.0,InternalFamilySystems,"Thank you, this looks good, just what I hoped ...",1,[],{},0.0,,[]
4,ejvqitv,t3_b7t3o6,NervousGuidance,1554135000.0,InternalFamilySystems,You're welcome!,1,[],{},0.0,,[]



Output file should be at: g:\BERTopic_Modeling\data/processed/IFS_test_processed.csv
Does output file exist? True

--- Dropped Rows (due to missing required columns) --- 
Dropped rows file not found at: g:\BERTopic_Modeling\data/processed/IFS_test_dropped.csv. This is expected if no rows were dropped.
