In [5]:
# Importing necessary libraries
import os
import json
import pandas as pd
from datetime import datetime
import logging
from dotenv import load_dotenv

# Load environment variables (if needed for your credentials)
load_dotenv('../.env')  # Adjust the path to your .env file

# Setup logging to monitor the scraping process
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("scraping_log.txt"),
        logging.StreamHandler()
    ]
)

# Define the raw metadata file path
RAW_METADATA_FILE = "../raw_metadata.json"
CLEANED_METADATA_FILE = "../cleaned_metadata.json"

def load_raw_metadata():
    """ Load raw metadata from JSON file """
    if os.path.exists(RAW_METADATA_FILE):
        with open(RAW_METADATA_FILE, "r") as f:
            return json.load(f)
    else:
        logging.error("Raw metadata file does not exist.")
        return []
def clean_data(data):
    """ Perform data cleaning on raw metadata """
    # Convert raw data into a pandas DataFrame for easier manipulation
    df = pd.DataFrame(data)
    
    # 1. Remove duplicates based on 'message_id'
    df = df.drop_duplicates(subset=["message_id"], keep="first")
    
    # 2. Handle missing values: Fill NaN in 'file_path' with 'missing' and 'date' with current date
    df['file_path'].fillna('missing', inplace=True)
    
    # Ensure 'date' column is filled in case of NaN values
    df['date'].fillna(datetime.now().isoformat(), inplace=True)

    # 3. Standardize formats
    # Ensure 'date' is in ISO format (handle invalid dates gracefully)
    df['date'] = pd.to_datetime(df['date'], errors='coerce')  # Coerce invalid dates to NaT
    df['date'].fillna(datetime.now(), inplace=True)  # Fill NaT values with current datetime
    
    # Convert the 'date' to ISO format using apply (to handle timezone)
    df['date'] = df['date'].apply(lambda x: x.isoformat())
    
    # 4. Data Validation: Ensure 'message_id' is valid (positive integer)
    df = df[df['message_id'].apply(lambda x: isinstance(x, int) and x > 0)]
    
    return df


def save_cleaned_metadata(data):
    """ Save cleaned metadata to JSON file """
    with open(CLEANED_METADATA_FILE, "w") as f:
        json.dump(data, f, indent=4)

# Main process to clean and save data
def main():
    logging.info("Starting data cleaning process...")
    
    # Loading raw metadata
    raw_metadata = load_raw_metadata()

    if raw_metadata:
        # Clean the data
        cleaned_df = clean_data(raw_metadata)

        # Save the cleaned data
        save_cleaned_metadata(cleaned_df.to_dict(orient="records"))
        logging.info("Data cleaning completed and saved to cleaned_metadata.json.")
    else:
        logging.error("No raw data to clean.")

# Run the main function
if __name__ == "__main__":
    main()


2025-01-31 14:57:22,915 - INFO - Starting data cleaning process...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['file_path'].fillna('missing', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['date'].fillna(datetime.now().isoformat(), inplace=True)
The behavior will change in pandas 3.0. This inplace meth