Notebook 01: Data Preprocessing

This notebook tests the `load_and_preprocess_data` function from `src.preprocessing`.
It loads raw data, applies cleaning/unitization based on a configuration,
and saves the processed output.

In [3]:
import nltk
import ssl

# --- Download NLTK data, relaxing SSL verification only as a last resort ---
# (Certificate errors sometimes occur on corporate networks or specific setups.)
def _download_nltk_package(package_id):
    """Download one NLTK data package and report whether it succeeded.

    The download is first attempted with normal HTTPS certificate
    verification. Only if that attempt fails is it retried with verification
    disabled, and the default HTTPS context factory is always restored
    afterwards so the bypass does not leak into other HTTPS traffic made
    later by this kernel.

    Args:
        package_id: Identifier of the NLTK package (e.g. 'wordnet').

    Returns:
        True if either attempt succeeded, False otherwise.
    """
    if nltk.download(package_id):
        return True

    # The first attempt may have failed for a reason unrelated to SSL; the
    # retry below is harmless in that case and simply fails again.
    unverified_context_factory = getattr(ssl, '_create_unverified_context', None)
    if unverified_context_factory is None:
        # This Python build does not expose the helper; nothing more to try.
        return False

    print(f"Retrying '{package_id}' download without SSL certificate verification...")
    original_context_factory = ssl._create_default_https_context
    ssl._create_default_https_context = unverified_context_factory
    try:
        return bool(nltk.download(package_id))
    finally:
        # Re-enable certificate verification for everything that runs afterwards.
        ssl._create_default_https_context = original_context_factory
# --- End SSL handling ---

print("Downloading NLTK 'wordnet' data package...")
wordnet_downloaded = _download_nltk_package('wordnet')
print("Downloading NLTK 'omw-1.4' data package (needed for wordnet)...")
omw_downloaded = _download_nltk_package('omw-1.4') # Often required by wordnet
print("NLTK downloads attempted.")
if not (wordnet_downloaded and omw_downloaded):
    # Surface failures here instead of letting a later step fail on missing data.
    print("WARNING: at least one NLTK package could not be downloaded; see messages above.")

# Optional: Verify download path (for debugging)
print("\nNLTK data paths searched:")
print(nltk.data.path)

Downloading NLTK 'wordnet' data package...
Downloading NLTK 'omw-1.4' data package (needed for wordnet)...
NLTK downloads attempted.

NLTK data paths searched:
['C:\\Users\\snake/nltk_data', 'g:\\BERTopic_Modeling\\.venv\\nltk_data', 'g:\\BERTopic_Modeling\\.venv\\share\\nltk_data', 'g:\\BERTopic_Modeling\\.venv\\lib\\nltk_data', 'C:\\Users\\snake\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\snake\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\snake\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# ## 1. Imports and Setup

import os
import sys
import pandas as pd
import logging

# --- Make the parent directory importable ---
# The notebook is assumed to live in 'notebooks/', so the parent of the current
# working directory is where the 'src' package is found. Adjust if the notebook moves.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# --- Pull in the preprocessing entry point (and the optional YAML config loader) ---
# An import failure is reported but does not stop the cell.
try:
    from src.preprocessing import load_and_preprocess_data
    from src.utils import load_config # Optional: if using YAML config file
except ImportError as import_error:
    print(f"Error importing functions: {import_error}")
    print("Ensure the 'src' directory is in the Python path and files exist.")
else:
    print("Successfully imported preprocessing functions.")

# --- Logging ---
# INFO-level messages with timestamps so progress is visible in the notebook.
# force=True replaces any handlers already installed, so re-running this cell
# in the same kernel takes effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

Successfully imported preprocessing functions.


In [5]:
# ## 2. Define Configuration

# Option A: Define configuration directly as a dictionary
# -----------------------------------------------------
# !!! CONFIGURATION FOR A COMMENTS CSV FILE !!!

# File locations are absolute paths built from the project root. The root
# defaults to the original development location and can be overridden per
# machine through the BERTOPIC_PROJECT_ROOT environment variable, so the
# notebook does not have to be edited to run somewhere else.
# (`or` also covers the variable being set to an empty string.)
project_root = (os.environ.get('BERTOPIC_PROJECT_ROOT') or 'G:/BERTopic_Modeling').rstrip('/\\')
raw_data_file = f'{project_root}/data/raw/IFS_test.csv' # Your comments file
processed_data_output_file = f'{project_root}/data/processed/IFS_comments_processed_docs.csv' # Output file

# Define parameters specifically for the comments file structure
preprocessing_params = {
    'column_mapping': {
        'input_id_col': 'id',           # ID column in comments CSV
        'input_text_cols': ['body'],    # *** Use 'body' column for text ***
        'output_id_col': 'doc_id',      # Standard name in output
        'output_text_col': 'text'       # Standard name for text after processing
    },
    # Adjust metadata to keep based on comments CSV columns
    'metadata_cols': ['created_utc', 'score', 'subreddit', 'link_id', 'author', 'author_flair_text'],
    'skip_missing_essential': True,     # Skips rows if 'id' or 'body' is missing/empty
    'cleaning_options': {               # Keep cleaning options consistent for now
        'html_unescape': True,
        'remove_urls': True,
        'remove_emails': True,
        'lowercase': True,
        'boilerplate_remove': [r'\[deleted\]', r'\[removed\]'],
        'custom_regex_remove': [],
        'remove_punctuation': False,
        'remove_stop_words': False,
        'lemmatize': False
    },
    'granularity': 'document',          # Process whole comments for this example
    'filtering_options': {
        'min_char_length': 20,          # Allow potentially shorter comments
        'metadata_filters': [
            {'column': 'score', 'condition': '> 0'} # Filter comments with score > 0 (example)
        ],
        'deduplicate_exact': True       # Remove exact duplicate comments
    },
    'force_recompute': False            # Set to True to ignore cache and rerun
}

# --- (Optional) Print the config to verify ---
# import json
# print("Using preprocessing parameters:")
# print(json.dumps(preprocessing_params, indent=2))


# Option B: Load configuration from a YAML file (Recommended for complex runs)
# -------------------------------------------------------------------------
# config_yaml_path = '../configs/preprocessing_config.yaml' # Path relative to notebook
# # Make sure the path below is relative to project root if utils.py uses relative paths
# config_yaml_path_for_load = 'configs/preprocessing_config.yaml'
# try:
#     preprocessing_params = load_config(config_yaml_path_for_load)
#     # Add/override file paths if they are not in the YAML
#     # raw_data_file = preprocessing_params.get('input_file', 'data/raw/default.csv')
#     # processed_data_output_file = preprocessing_params.get('output_file', 'data/processed/default_processed.csv')
#     logging.info(f"Loaded config from {config_yaml_path_for_load}")
# except FileNotFoundError:
#     logging.error(f"Config file not found: {config_yaml_path_for_load}. Using dictionary definition (Option A).")
#     # Fallback to Option A parameters defined above if file not found
# except NameError:
#      logging.error(f"load_config function not imported correctly. Using dictionary definition (Option A).")

In [6]:
# ## 3. Run Preprocessing

# Fail fast with a clear message, before anything below touches the
# configuration, if the configuration cell has not been run in this kernel.
_required_names = ('raw_data_file', 'processed_data_output_file', 'preprocessing_params')
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(f"{', '.join(_missing_names)} not defined. Please run Cell 2 first.")

logging.info(f"Input file: {raw_data_file}")
logging.info(f"Output file: {processed_data_output_file}")

# Initialize variable to store result; it stays None if preprocessing fails,
# which lets a later cell tell failure apart from an (empty) DataFrame result.
processed_df = None

try:
    # Call the main preprocessing function.
    # NOTE(review): the configured paths are passed through unchanged; if they
    # are ever made relative, how they resolve depends on the function and the
    # kernel's working directory - confirm before relying on that.
    processed_df = load_and_preprocess_data(
        file_path=raw_data_file,
        output_path=processed_data_output_file,
        **preprocessing_params # Unpack the dictionary as keyword arguments
        # Or pass config_path=config_yaml_path_for_load if using Option B from Cell 2
    )

    logging.info("Preprocessing finished.")

# Errors are logged rather than re-raised so the notebook keeps running;
# processed_df is left as None in every failure branch.
except FileNotFoundError as e:
    logging.error(f"Input/Output file path error: {e}")
    logging.error("Please ensure the paths are correct relative to the project root directory.")
except ValueError as e:
    logging.error(f"Configuration or data error: {e}")
except NameError as e:
    # Typically raised when the import cell failed and the function name is unbound.
    logging.error(f"Import error - required function not loaded: {e}")
except Exception as e:
    logging.error(f"An unexpected error occurred during preprocessing: {e}", exc_info=True) # Show traceback

2025-05-04 17:50:24,800 - INFO - Input file: G:/BERTopic_Modeling/data/raw/IFS_test.csv
2025-05-04 17:50:24,800 - INFO - Output file: G:/BERTopic_Modeling/data/processed/IFS_comments_processed_docs.csv
2025-05-04 17:50:24,801 - INFO - Starting preprocessing for: G:/BERTopic_Modeling/data/raw/IFS_test.csv
2025-05-04 17:50:24,801 - INFO - Loading raw data from: G:/BERTopic_Modeling/data/raw/IFS_test.csv
2025-05-04 17:50:25,354 - INFO - Standardizing column names...
2025-05-04 17:50:25,369 - INFO - Combining text columns...
2025-05-04 17:50:25,385 - INFO - Handling missing essential data...
2025-05-04 17:50:25,418 - INFO - Applying text cleaning...
2025-05-04 17:50:28,262 - INFO - Text cleaning applied.
2025-05-04 17:50:28,262 - INFO - Unitizing text based on granularity...
2025-05-04 17:50:28,262 - INFO - Applying granularity: document
2025-05-04 17:50:28,277 - INFO - Applying filtering...
2025-05-04 17:50:28,327 - INFO - Identified 7499 rows to remove based on: char_len<20
2025-05-04 17

In [7]:
# ## 4. Inspect Output

# Report on the outcome of the preprocessing run. The failure case is handled
# first, then an empty result, and finally a populated DataFrame.
if 'processed_df' not in locals() or not isinstance(processed_df, pd.DataFrame):
    print("\nPreprocessing failed or DataFrame not created/returned correctly.")
    # The output file may still be on disk even though no DataFrame came back
    # (e.g., if caching worked but function failed later)
    if not os.path.exists(processed_data_output_file):
        print(f"Output file {processed_data_output_file} does not exist.")
    else:
        print(f"Output file exists at {processed_data_output_file}, but DataFrame wasn't returned to notebook.")
elif processed_df.empty:
    print("\nPreprocessing resulted in an empty DataFrame.")
    print(f"\nCheck if empty output file exists: {os.path.exists(processed_data_output_file)}")
else:
    print(f"\nProcessed DataFrame Info ({processed_data_output_file}):")
    # display() renders tables more nicely than print() in Jupyter/VSCode notebooks
    from IPython.display import display

    processed_df.info()

    print("\nFirst 5 rows:")
    display(processed_df.head())

    print("\nLast 5 rows:")
    display(processed_df.tail())

    print(f"\nCheck if output file exists: {os.path.exists(processed_data_output_file)}")




Processed DataFrame Info (G:/BERTopic_Modeling/data/processed/IFS_comments_processed_docs.csv):
<class 'pandas.core.frame.DataFrame'>
Index: 102896 entries, 0 to 112155
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   doc_id             102896 non-null  object 
 1   unit_id            102896 non-null  object 
 2   text_unit          102896 non-null  object 
 3   link_id            102896 non-null  object 
 4   author             102896 non-null  object 
 5   created_utc        102896 non-null  float64
 6   subreddit          102896 non-null  object 
 7   score              102896 non-null  int64  
 8   author_flair_text  0 non-null       float64
dtypes: float64(2), int64(1), object(6)
memory usage: 7.9+ MB

First 5 rows:


Unnamed: 0,doc_id,unit_id,text_unit,link_id,author,created_utc,subreddit,score,author_flair_text
0,efgpylf,efgpylf,"hey there, i hope others can chime in and offe...",t3_ah7kd8,hubblekeat,1548962000.0,InternalFamilySystems,1,
2,ejvolcw,ejvolcw,"i'd recommend ""self-therapy"" by jay earley. it...",t3_b7t3o6,NervousGuidance,1554133000.0,InternalFamilySystems,2,
3,ejvq37d,ejvq37d,"thank you, this looks good, just what i hoped ...",t3_b7t3o6,coquitam,1554134000.0,InternalFamilySystems,1,
5,el9kmpr,el9kmpr,hi i am currently in therapy with an ifs thera...,t3_ah0w3y,sparkerson,1555681000.0,InternalFamilySystems,2,
6,emxhvez,emxhvez,here's one: he's got some good youtube videos ...,t3_blratl,NervousGuidance,1557418000.0,InternalFamilySystems,2,



Last 5 rows:


Unnamed: 0,doc_id,unit_id,text_unit,link_id,author,created_utc,subreddit,score,author_flair_text
112151,m4rn4i9,m4rn4i9,this 💯 over and over. it's a tool and a map bu...,t3_1hqoub5,leaninletgo,1735686000.0,InternalFamilySystems,23,
112152,m4rnfq9,m4rnfq9,“i agree with you—it’s not for everyone. howev...,t3_1hqoub5,SoteEmpathHealer,1735686000.0,InternalFamilySystems,18,
112153,m4rnxjn,m4rnxjn,i had a hard time remembering things from chil...,t3_1hj8ylf,iwillmeetyou,1735687000.0,InternalFamilySystems,2,
112154,m4rr6vv,m4rr6vv,somatics is a very personal thing. you need to...,t3_1hqmbf0,Blissful524,1735688000.0,InternalFamilySystems,8,
112155,m4rvkep,m4rvkep,i found it really helpful to read the book som...,t3_1hqmbf0,Miserable_News975,1735689000.0,InternalFamilySystems,5,



Check if output file exists: True
