In [2]:
import csv
import datetime
import json
import os
from dataclasses import dataclass
from typing import Dict, List
from urllib.parse import urlsplit, urlunsplit

import pandas as pd
import requests

In [3]:
@dataclass
class Comment:
    """ Dataclass for a Reddit comment """
    _id: str
    author: str
    body: str
    created_utc: int
    score: int
    parent_id: str | None
    depth: int
    ups: int
    downs: int
    num_reports: int | None
    report_reasons: str | None
    children: List['Comment']

    def to_dict(self) -> Dict:
        return {
            'id': self._id,
            'author': self.author,
            'body': self.body,
            'created_utc': self.created_utc,
            'score': self.score,
            'parent_id': self.parent_id,
            'depth': self.depth,
            'ups': self.ups,
            'downs': self.downs,
            'num_reports': self.num_reports,
            'report_reasons': self.report_reasons,
            # 'children': [c.to_dict() for c in self.children] # Omitted because the serializers handle this recursively
        }

    @classmethod
    def from_dict(cls, dict: Dict) -> 'Comment':
        return cls(
            _id=dict['id'],
            author=dict['author'],
            body=dict['body'],
            created_utc=dict['created_utc'],
            score=dict['score'],
            parent_id=dict['parent_id'],
            depth=dict['depth'],
            ups=dict['ups'],
            downs=dict['downs'],
            num_reports=dict['num_reports'],
            report_reasons=dict['report_reasons'],
            children=[cls.from_dict(c) for c in dict.get('children', [])]
        )

class RedditCommentLoader:
    def __init__(self):
        pass

    def _clean_url(self, url: str) -> str:
        # Add .json to the URL if not already present
        if not url.endswith('.json'):
            if url.endswith('/'):
                url = url + '.json'
            else:
                url = url + '/.json'
        return url

    def get_comments(self, url) -> List[Dict]:
        url = self._clean_url(url)
        # Set a user agent to avoid being blocked
        headers = {
            'User-Agent': 'python:reddit-comment-loader:v1.0 (by /u/yourname)'
        }
        
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.json()
            else:
                print(f"Error fetching comments: HTTP {response.status_code}")
                return None
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
            return None
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            return None

In [4]:
class JsonLoader:
    def __init__(self):
        pass

    def load_raw(self, path) -> List[Dict]:
        if not os.path.exists(path):
            return None
        with open(path, 'r') as file:
            return json.load(file)

class RedditJsonLoader(JsonLoader):
    """JSON loader that turns each stored record into a ``Comment``."""

    def __init__(self):
        super().__init__()

    def load_comments(self, path) -> List[Comment]:
        """Load ``path`` and convert every record with ``Comment.from_dict``.

        Returns ``None`` when ``load_raw`` returns ``None`` (the file does not
        exist).
        """
        records = self.load_raw(path)
        if records is not None:
            return [Comment.from_dict(record) for record in records]
        return None

In [5]:
class RedditCommentSerializer:
    """Writes a comment tree out as CSV, JSON, or a pandas DataFrame.

    Every output format is flat: one record per comment, produced by
    ``_flatten_comments``.
    """

    def __init__(self):
        pass

    def write_to_csv(self, comments: List[Comment], path: str) -> None:
        """Write one CSV row per comment (replies included) to ``path``."""
        rows = self._flatten_comments(comments)
        # Header follows the dataclass field order, with ``_id`` shown as
        # ``id`` and the nested ``children`` field left out.
        header = [
            'id' if name == '_id' else name
            for name in Comment.__dataclass_fields__
            if name != 'children'
        ]
        with open(path, 'w', newline='', encoding='utf-8') as handle:
            out = csv.DictWriter(handle, fieldnames=header)
            out.writeheader()
            out.writerows(rows)

    def write_to_json(self, comments: List[Comment], path: str) -> None:
        """Write the flattened comments to ``path`` as an indented JSON array."""
        rows = self._flatten_comments(comments)
        with open(path, 'w', encoding='utf-8') as handle:
            json.dump(rows, handle, indent=4)

    def _flatten_comments(self, comments: List[Comment], parent_id=None, depth=0) -> List[Dict]:
        """Return the tree as a flat list of dictionaries.

        Each comment's ``to_dict()`` is emitted immediately before the
        dictionaries of its replies. ``parent_id`` and ``depth`` are forwarded
        through the recursion but do not influence the result.
        """
        rows = []
        for node in comments:
            rows.append(node.to_dict())
            if node.children:
                rows += self._flatten_comments(
                    node.children,
                    parent_id=node._id,
                    depth=depth + 1,
                )
        return rows

    def to_dataframe(self, comments: List[Comment]):
        """Return a pandas DataFrame with one row per flattened comment."""
        return pd.DataFrame(self._flatten_comments(comments))


In [6]:
class RawCommentParser:
    """Builds a tree of ``Comment`` objects from a raw Reddit thread response.

    Only entries whose ``kind`` is ``'t1'`` (comments) are converted; entries
    of any other kind are skipped.
    """

    def __init__(self):
        # Top-level comments produced by the most recent parse_comments() call.
        self.comments = []

    def parse_comments(self, data: List[Dict] | None) -> List[Comment]:
        """Parse comments from Reddit API response

        Args:
            data: The decoded response: a sequence of two listings, the post
                followed by the comments.

        Returns:
            The top-level comments, each with its replies attached under
            ``children``. An empty list is returned when ``data`` is ``None``
            or does not have the expected shape, so a failed fetch or load
            does not crash the parser.
        """
        self.comments = []

        # Reddit API returns two listings: post and comments
        if not isinstance(data, (list, tuple)) or len(data) < 2:
            return self.comments

        listing = data[1]
        if not isinstance(listing, dict) or not isinstance(listing.get('data'), dict):
            return self.comments

        self.comments = self._parse_listing(listing['data'].get('children') or [])
        return self.comments

    def _parse_replies(self, replies_data, parent_comment) -> None:
        """Recursively parse nested replies and attach them to ``parent_comment``."""
        parent_comment.children.extend(self._parse_listing(replies_data))

    def _parse_listing(self, children) -> List[Comment]:
        """Convert the ``'t1'`` entries of one listing, including their replies."""
        parsed = []
        for child in children:
            if child.get('kind') != 't1':  # t1 is comment type
                continue
            payload = child['data']
            comment = self._create_comment_from_json(payload)
            self._parse_replies(self._reply_children(payload), comment)
            parsed.append(comment)
        return parsed

    @staticmethod
    def _reply_children(comment_data: dict) -> list:
        """Return the raw reply entries of a comment, or ``[]`` if it has none."""
        replies = comment_data.get('replies')
        # Replies are only usable when present as a listing dict; any other
        # value (missing, empty, non-dict) means there is nothing to descend into.
        if isinstance(replies, dict) and isinstance(replies.get('data'), dict):
            return replies['data'].get('children') or []
        return []

    def _create_comment_from_json(self, data: dict) -> Comment:
        """Build a childless ``Comment`` from one raw comment payload.

        Missing keys fall back to ``''`` for text fields, ``0`` for numeric
        fields, and ``None`` for ``parent_id``, ``num_reports`` and
        ``report_reasons``.
        """
        return Comment(
            _id=data.get('id', ''),
            author=data.get('author', ''),
            body=data.get('body', ''),
            created_utc=data.get('created_utc', 0),
            score=data.get('score', 0),
            parent_id=data.get('parent_id'),
            depth=data.get('depth', 0),
            ups=data.get('ups', 0),
            downs=data.get('downs', 0),
            num_reports=data.get('num_reports'),
            report_reasons=data.get('report_reasons'),
            children=[]
        )
    
def print_comments(comments, level=0):
    """Print each comment body, indenting replies two spaces per nesting level.

    Args:
        comments: Iterable of objects with ``body`` and ``children`` attributes.
        level: Nesting level of ``comments``; controls the indentation.
    """
    for node in comments:
        prefix = '  ' * level
        print(prefix + node.body)
        # Replies are printed directly under their parent, one level deeper.
        print_comments(node.children, level + 1)

In [7]:
def validate_mode_and_path(mode, local_data_path):
    """
    Validate the mode and local data path to ensure they are compatible.

    Problems are reported with ``print``; nothing is raised.

    Args:
        mode (str): The mode to use ('RAW_JSON', 'JSON', 'URL', 'CSV')
        local_data_path (str | None): Path to the local data file. Ignored in
            'URL' mode; required (and must exist) in every other mode.

    Returns:
        bool: True if valid, False otherwise
    """
    # Define supported modes and file extensions
    supported_modes = ['RAW_JSON', 'JSON', 'URL', 'CSV']
    file_extensions = {
        'json': ['JSON', 'RAW_JSON'],
        'csv': ['CSV']
    }

    # Validate the mode
    if mode not in supported_modes:
        print(f'Invalid mode: Select one of {", ".join(supported_modes)}')
        return False

    # URL mode reads nothing from disk, so there is no path to check.
    if mode == 'URL':
        return True

    # os.path.exists(None) raises TypeError, so an unset path is handled
    # explicitly and reported the same way as a missing file.
    if not local_data_path or not os.path.exists(local_data_path):
        print('Local data file not found. LOCAL_DATA_PATH must be set for any mode other than URL')
        return False

    # Check if file extension matches the mode (case-insensitive, no dot)
    file_ext = os.path.splitext(local_data_path)[1].lower().lstrip('.')
    valid_modes = file_extensions.get(file_ext, [])

    if mode not in valid_modes:
        print(f'Mode {mode} is not compatible with file extension .{file_ext}')
        if valid_modes:
            print(f'For .{file_ext} files, use one of: {", ".join(valid_modes)}')
        else:
            # Unknown extension: list what is accepted instead of printing an
            # empty "use one of" hint.
            supported_exts = ", ".join(f'.{ext}' for ext in file_extensions)
            print(f'Supported file extensions: {supported_exts}')
        return False

    return True

In [8]:
# Get mode and local data path
mode = os.getenv('MODE', 'URL')
output_path = os.getenv('BASE_PATH', 'output')
if mode == 'URL':
    local_data_path = None  # Not needed for URL mode
elif mode == 'CSV':
    local_data_path = os.getenv('LOCAL_DATA_PATH', 'sample/comments.csv')
else:  # JSON or RAW_JSON modes
    local_data_path = os.getenv('LOCAL_DATA_PATH', 'sample/reddit.json')

# SystemExit is raised directly instead of calling exit(): exit() is a helper
# for the interactive shell and is not meant to be used in program code.
if not validate_mode_and_path(mode, local_data_path):
    raise SystemExit(1)

# Step 1: obtain either the raw API payload (`data`) or ready-made comments.
data = None
comments = None
if mode == 'RAW_JSON':
    json_loader = JsonLoader()
    data = json_loader.load_raw(local_data_path)
elif mode == 'JSON':
    json_loader = RedditJsonLoader()
    comments = json_loader.load_comments(local_data_path)
elif mode == 'URL':
    example_url = 'https://www.reddit.com/r/diabetes_t1/comments/1h9k636/type_1s_who_have_taken_ozempic_what_was_your/'
    data = RedditCommentLoader().get_comments(example_url)
elif mode == 'CSV':
    print('CSV mode not implemented')
    raise SystemExit(1)
else:
    print('Invalid mode')
    raise SystemExit(1)

# Step 2: raw payloads still need parsing. The loaders return None on failure,
# so stop here with a clear message rather than failing inside the parser.
if mode in ('RAW_JSON', 'URL'):
    if data is None:
        print(f'No data could be loaded in {mode} mode; nothing to parse')
        raise SystemExit(1)
    comments = RawCommentParser().parse_comments(data)

if comments is None:
    print(f'No comments could be loaded in {mode} mode')
    raise SystemExit(1)

# Preview: the first top-level comment and its replies.
print_comments(comments[:1])

T1 for close to 40 years. I’m on a Tandem TSlim pump with a Dexcom G6. I have been on Mounjaro for close to 18 months. My daily insulin usage almost immediately went from 110-120 units a day to 40-50. My A1C went from a 7.6 to a 5.6. It was a 6.0 4 months after starting Mounjaro. I’ve also lost 85-90 pounds. These drugs are a game changer for T1 management. I haven’t had any serious side effects and I’ve never increased my dosage past 7.5.
  Very similar results for me, too
  My experience is almost identical to this as well. The only thing that has ever improved my control/health as much as GLP-1 drugs is when I switched to a CGM that integrated with my pump (also on tandem + dexcom combo).
  Pretty much my results except my A1c is lower and I only lost sixty pounds. I think everyone should take mounjaro if possible. It’s a miracle drug.
  T1 40+ years. Went from TTD around 45-55 units a day to 25-40. Lost 45 pounds. Ozempic also reduces inflammation so it takes away that round face t

In [9]:
# The serializer is created once here and reused by the export cells below.
serializer = RedditCommentSerializer()
# to_dataframe() flattens the nested comment tree into one row per comment.
df = serializer.to_dataframe(comments)

# Summarise the columns, non-null counts and dtypes of the flattened frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              107 non-null    object 
 1   author          107 non-null    object 
 2   body            107 non-null    object 
 3   created_utc     107 non-null    float64
 4   score           107 non-null    int64  
 5   parent_id       107 non-null    object 
 6   depth           107 non-null    int64  
 7   ups             107 non-null    int64  
 8   downs           107 non-null    int64  
 9   num_reports     0 non-null      object 
 10  report_reasons  0 non-null      object 
dtypes: float64(1), int64(4), object(6)
memory usage: 9.3+ KB


In [10]:
# One timestamp is computed here and shared by every export file name.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
# exist_ok=True creates the directory only when needed, without a separate
# existence check that could race with another process creating it.
os.makedirs(output_path, exist_ok=True)

In [11]:
# File name is "<timestamp>_comments.csv" inside the output directory.
csv_filename = os.path.join(output_path, f'{timestamp}_comments.csv')

# Writes one row per comment; nested replies are flattened by the serializer.
serializer.write_to_csv(comments, csv_filename)

print(f"Comments saved to {csv_filename}")

Comments saved to output/2025-03-16_23_22_47_comments.csv


In [12]:
# File name is "<timestamp>_comments.json" inside the output directory.
json_filename = os.path.join(output_path, f'{timestamp}_comments.json')

# Writes the flattened comments as a JSON array (one object per comment).
serializer.write_to_json(comments, json_filename)

print(f"Comments saved to {json_filename}")

Comments saved to output/2025-03-16_23_22_47_comments.json


In [13]:
# Read the exported CSV back in to inspect what was written.
# Note: this rebinds `df`, replacing the frame built from the in-memory comments.
df = pd.read_csv(csv_filename)
df.head()

Unnamed: 0,id,author,body,created_utc,score,parent_id,depth,ups,downs,num_reports,report_reasons
0,m11jg92,HoboMinion,T1 for close to 40 years. I’m on a Tandem TSli...,1733672000.0,75,t3_1h9k636,0,75,0,,
1,m11wwzr,[deleted],"Very similar results for me, too",1733677000.0,9,t1_m11jg92,1,9,0,,
2,m125nhw,BjergerPresident,My experience is almost identical to this as w...,1733679000.0,6,t1_m11jg92,1,6,0,,
3,m139z5u,MaggieNFredders,Pretty much my results except my A1c is lower ...,1733692000.0,3,t1_m11jg92,1,3,0,,
4,m17vfp7,OranjellosBroLemonj,T1 40+ years. Went from TTD around 45-55 units...,1733764000.0,3,t1_m11jg92,1,3,0,,
