In [22]:

import datetime
import os
import pandas as pd
from typing import List
from scraper.Comment import Comment
from scraper.JsonLoader import JsonLoader
from scraper.RedditJsonLoader import RedditJsonLoader
from scraper.RawCommentParser import RawCommentParser
from scraper.RedditCommentLoader import RedditCommentLoader
from scraper.RedditCommentSerializer import RedditCommentSerializer

In [23]:
def print_comments(comments, level=0):
    for comment in comments:
        print('  ' * level + comment.body)
        print_comments(comment.children, level + 1)

In [24]:
def validate_mode_and_path(mode, local_data_path):
    """
    Validate the mode and local data path to ensure they are compatible.
    
    Args:
        mode (str): The mode to use ('RAW_JSON', 'JSON', 'URL', 'CSV')
        local_data_path (str): Path to the local data file
        
    Returns:
        bool: True if valid, False otherwise
    """
    # Define supported modes and file extensions
    supported_modes = ['RAW_JSON', 'JSON', 'URL', 'CSV']
    file_extensions = {
        'json': ['JSON', 'RAW_JSON'],
        'csv': ['CSV']
    }

    # Validate the mode
    if mode not in supported_modes:
        print(f'Invalid mode: Select one of {", ".join(supported_modes)}')
        return False

    # For non-URL modes, validate that the file exists and has matching extension
    if mode != 'URL':
        if not os.path.exists(local_data_path):
            print('Local data file not found. LOCAL_DATA_PATH must be set for any mode other than URL')
            return False
        
        # Check if file extension matches the mode
        file_ext = os.path.splitext(local_data_path)[1].lower().replace('.', '')
        valid_modes = file_extensions.get(file_ext, [])
        
        if mode not in valid_modes:
            print(f'Mode {mode} is not compatible with file extension .{file_ext}')
            print(f'For .{file_ext} files, use one of: {", ".join(valid_modes)}')
            return False
            
    return True

In [25]:
# Get mode and local data path
mode = os.getenv('MODE', 'URL')
output_path = os.getenv('BASE_PATH', 'output')
local_data_path = None
if mode == 'URL':
    local_data_path = None  # Not needed for URL mode
elif mode == 'CSV':
    local_data_path = os.getenv('LOCAL_DATA_PATH', 'sample/comments.csv')
else:  # JSON or RAW_JSON modes
    local_data_path = os.getenv('LOCAL_DATA_PATH', 'sample/reddit.json')

if not validate_mode_and_path(mode, local_data_path):
    exit(1)

data = None
comments = None
if mode == 'RAW_JSON':
    json_loader = JsonLoader()
    data = json_loader.load_raw(local_data_path)
    comments: List[Comment] = RawCommentParser().parse_comments(data)
elif mode == 'JSON':
    json_loader = RedditJsonLoader()
    comments: List[Comment] = json_loader.load_comments(local_data_path)
elif mode == 'URL':
    example_url = 'https://www.reddit.com/r/diabetes_t1/comments/1h9k636/type_1s_who_have_taken_ozempic_what_was_your/'
    data = RedditCommentLoader().get_comments(example_url)
    comments: List[Comment] = RawCommentParser().parse_comments(data)
elif mode == 'CSV':
    print('CSV mode not implemented')
    exit(1)
else:
    print('Invalid mode')
    exit(1)

print_comments(comments[:1])

T1 for close to 40 years. I’m on a Tandem TSlim pump with a Dexcom G6. I have been on Mounjaro for close to 18 months. My daily insulin usage almost immediately went from 110-120 units a day to 40-50. My A1C went from a 7.6 to a 5.6. It was a 6.0 4 months after starting Mounjaro. I’ve also lost 85-90 pounds. These drugs are a game changer for T1 management. I haven’t had any serious side effects and I’ve never increased my dosage past 7.5.
  Very similar results for me, too
  My experience is almost identical to this as well. The only thing that has ever improved my control/health as much as GLP-1 drugs is when I switched to a CGM that integrated with my pump (also on tandem + dexcom combo).
  Pretty much my results except my A1c is lower and I only lost sixty pounds. I think everyone should take mounjaro if possible. It’s a miracle drug.
  T1 40+ years. Went from TTD around 45-55 units a day to 25-40. Lost 45 pounds. Ozempic also reduces inflammation so it takes away that round face t

In [26]:
serializer = RedditCommentSerializer()
df = serializer.to_dataframe(comments)

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              107 non-null    object 
 1   author          107 non-null    object 
 2   body            107 non-null    object 
 3   created_utc     107 non-null    float64
 4   score           107 non-null    int64  
 5   parent_id       107 non-null    object 
 6   depth           107 non-null    int64  
 7   ups             107 non-null    int64  
 8   downs           107 non-null    int64  
 9   num_reports     0 non-null      object 
 10  report_reasons  0 non-null      object 
dtypes: float64(1), int64(4), object(6)
memory usage: 9.3+ KB


In [27]:
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [28]:
csv_filename = os.path.join(output_path, f'{timestamp}_comments.csv')
serializer.write_to_csv(comments, csv_filename)

print(f"Comments saved to {csv_filename}")

Comments saved to output/2025-03-16_23_51_37_comments.csv


In [29]:
json_filename = os.path.join(output_path, f'{timestamp}_comments.json')
serializer.write_to_json(comments, json_filename)

print(f"Comments saved to {json_filename}")

Comments saved to output/2025-03-16_23_51_37_comments.json


In [30]:
df = pd.read_csv(csv_filename)
df.head()

Unnamed: 0,id,author,body,created_utc,score,parent_id,depth,ups,downs,num_reports,report_reasons
0,m11jg92,HoboMinion,T1 for close to 40 years. I’m on a Tandem TSli...,1733672000.0,76,t3_1h9k636,0,76,0,,
1,m11wwzr,[deleted],"Very similar results for me, too",1733677000.0,9,t1_m11jg92,1,9,0,,
2,m125nhw,BjergerPresident,My experience is almost identical to this as w...,1733679000.0,6,t1_m11jg92,1,6,0,,
3,m139z5u,MaggieNFredders,Pretty much my results except my A1c is lower ...,1733692000.0,3,t1_m11jg92,1,3,0,,
4,m17vfp7,OranjellosBroLemonj,T1 40+ years. Went from TTD around 45-55 units...,1733764000.0,3,t1_m11jg92,1,3,0,,
