In [None]:

import datetime
import os
import pandas as pd
from typing import List
from scraper.Comment import Comment
from scraper.JsonLoader import JsonLoader
from scraper.RedditJsonLoader import RedditJsonLoader
from scraper.RawCommentParser import RawCommentParser
from scraper.RedditCommentLoader import RedditCommentLoader
from scraper.RedditCommentSerializer import RedditCommentSerializer

In [23]:
def print_comments(comments, level=0):
    """
    Print a tree of comments, one body per line, indented by nesting depth.

    Args:
        comments: Iterable of objects exposing a ``body`` string and a
            ``children`` iterable holding the same kind of object.
        level (int): Depth of ``comments`` within the tree; every level adds
            two spaces of indentation in front of the body.
    """
    for node in comments:
        indent = '  ' * level
        print(indent + node.body)
        # Replies are printed directly under their parent, one level deeper.
        print_comments(node.children, level=level + 1)

In [24]:
def validate_mode_and_path(mode, local_data_path):
    """
    Validate the mode and local data path to ensure they are compatible.

    A human-readable explanation is printed for every failure; nothing is
    raised.

    Args:
        mode (str): The mode to use ('RAW_JSON', 'JSON', 'URL', 'CSV')
        local_data_path (str | None): Path to the local data file. Ignored in
            'URL' mode; a missing value (None or '') is rejected for every
            other mode.

    Returns:
        bool: True if valid, False otherwise
    """
    # Define supported modes and, per file extension, the modes that may read it
    supported_modes = ['RAW_JSON', 'JSON', 'URL', 'CSV']
    file_extensions = {
        'json': ['JSON', 'RAW_JSON'],
        'csv': ['CSV']
    }

    # Validate the mode
    if mode not in supported_modes:
        print(f'Invalid mode: Select one of {", ".join(supported_modes)}')
        return False

    # URL mode needs no local file, so there is nothing further to check
    if mode == 'URL':
        return True

    # Guard against an unset path first: os.path.exists(None) raises TypeError
    # instead of returning False, which would hide the message below.
    if not local_data_path or not os.path.exists(local_data_path):
        print('Local data file not found. LOCAL_DATA_PATH must be set for any mode other than URL')
        return False

    # Check if file extension matches the mode (case-insensitive, without the dot)
    file_ext = os.path.splitext(local_data_path)[1].lower().replace('.', '')
    valid_modes = file_extensions.get(file_ext, [])

    if mode not in valid_modes:
        print(f'Mode {mode} is not compatible with file extension .{file_ext}')
        if valid_modes:
            print(f'For .{file_ext} files, use one of: {", ".join(valid_modes)}')
        else:
            # Unknown extension: no mode can read it, so list what is accepted
            # rather than printing an empty "use one of:" hint.
            known = ", ".join(f'.{ext}' for ext in file_extensions)
            print(f'Supported file extensions: {known}')
        return False

    return True

In [None]:
# Read the run configuration from the environment.
#   MODE            - one of RAW_JSON, JSON, URL, CSV (default: URL)
#   BASE_PATH       - directory the export cells write into
#   LOCAL_DATA_PATH - input file for every mode except URL
mode = os.getenv('MODE', 'URL')
output_path = os.getenv('BASE_PATH', 'Dataset/output')

# Pick the mode-specific default sample file; URL mode reads no local file.
if mode == 'URL':
    local_data_path = None
elif mode == 'CSV':
    local_data_path = os.getenv('LOCAL_DATA_PATH', 'Dataset/sample/comments.csv')
else:  # JSON or RAW_JSON modes
    local_data_path = os.getenv('LOCAL_DATA_PATH', 'Dataset/sample/reddit.json')

# Raise rather than call exit(): exit() is a helper meant for the interactive
# shell, whereas an exception stops this cell (and a "Run All") with a visible
# error and behaves the same if the notebook is exported as a script.
# validate_mode_and_path has already printed the detailed reason.
if not validate_mode_and_path(mode, local_data_path):
    raise ValueError(
        f'Invalid configuration: MODE={mode!r}, LOCAL_DATA_PATH={local_data_path!r}'
    )

data = None
comments: List[Comment] = []
if mode == 'RAW_JSON':
    # Raw JSON is loaded first and then parsed into Comment objects.
    json_loader = JsonLoader()
    data = json_loader.load_raw(local_data_path)
    comments = RawCommentParser().parse_comments(data)
elif mode == 'JSON':
    json_loader = RedditJsonLoader()
    comments = json_loader.load_comments(local_data_path)
elif mode == 'URL':
    example_url =\
        'https://www.reddit.com/r/diabetes_t1/comments/1h9k636/type_1s_who_have_taken_ozempic_what_was_your/'
    data = RedditCommentLoader().get_comments(example_url)
    comments = RawCommentParser().parse_comments(data)
elif mode == 'CSV':
    # CSV passes validation but has no loader yet; fail loudly so later cells
    # never run against an empty comment list.
    raise NotImplementedError('CSV mode not implemented')
else:
    # Defensive: validation above already rejects unknown modes.
    raise ValueError('Invalid mode')

# Preview only the first top-level comment together with its reply tree.
print_comments(comments[:1])

In [None]:
# Convert the loaded comments to a tabular form using the project serializer.
# `serializer` is reused by the CSV and JSON export cells further down.
serializer = RedditCommentSerializer()
df = serializer.to_dataframe(comments)

# NOTE(review): presumably a pandas DataFrame, in which case info() prints a
# column/dtype summary — confirm against RedditCommentSerializer.to_dataframe.
df.info()

In [27]:
# One timestamp is captured here so the CSV and JSON exports built in the
# following cells share the same filename prefix.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
# exist_ok=True makes directory creation idempotent on re-runs and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Build the timestamped CSV path inside output_path and pass the comments and
# that path to the serializer's write_to_csv. csv_filename is read again by the
# final cell, which loads the file back with pandas.
csv_filename = os.path.join(output_path, f'{timestamp}_comments.csv')
serializer.write_to_csv(comments, csv_filename)

print(f"Comments saved to {csv_filename}")

In [None]:
# Build the timestamped JSON path inside output_path (same timestamp prefix as
# the CSV export) and pass the comments and that path to write_to_json.
json_filename = os.path.join(output_path, f'{timestamp}_comments.json')
serializer.write_to_json(comments, json_filename)

print(f"Comments saved to {json_filename}")

In [None]:
# Load the CSV written by the export cell back with pandas and show the first
# rows — presumably a sanity check that the export is readable.
# NOTE(review): this rebinds `df`, replacing the frame returned earlier by
# serializer.to_dataframe; re-running earlier cells after this one sees the new value.
df = pd.read_csv(csv_filename)
df.head()