In [26]:
import os 
import pandas as pd
import time
import requests 
import json
import neo4j
from neo4j import GraphDatabase

from dotenv import load_dotenv

load_dotenv()

True

In [8]:
import requests
import time

def fetch_categories(base_url):
    """Return the category records published at ``{base_url}/categories.json``.

    Args:
        base_url: Root URL of the forum, without a trailing slash.

    Returns:
        The list found under ``category_list -> categories`` in the response
        body. An empty list is returned (after printing a short message) when
        the server answers with a status other than 200 or when the body
        cannot be decoded as JSON.
    """
    response = requests.get(f"{base_url}/categories.json")

    # Bail out early on any HTTP failure.
    if response.status_code != 200:
        print(f"Failed to fetch categories: {response.status_code}")
        return []

    try:
        payload = response.json()
        category_list = payload.get('category_list', {})
        return category_list.get('categories', [])
    except ValueError:
        # Raised by response.json() when the body is not valid JSON.
        print("Failed to decode JSON.")
        return []

def fetch_topics(base_url, category_slug):
    """Collect every topic listed under one category, following pagination.

    Requests ``{base_url}/c/{category_slug}.json?page=N`` for N = 0, 1, 2, ...
    and stops at the first page that contains no topics, answers with a
    non-200 status, or carries a body that is not valid JSON.

    Args:
        base_url: Root URL of the forum, without a trailing slash.
        category_slug: Slug of the category whose topics are listed.

    Returns:
        A list of the topic dicts found under ``topic_list -> topics``, in
        page order. Topics gathered before a failing page are still returned.
    """
    topics = []
    page = 0
    while True:
        url = f"{base_url}/c/{category_slug}.json?page={page}"
        response = requests.get(url)
        if response.status_code != 200:
            # Say why the crawl stopped; a silent break makes a truncated
            # category indistinguishable from a complete one.
            print(
                f"Failed to fetch topics for '{category_slug}' "
                f"(page {page}): {response.status_code}"
            )
            break
        try:
            data = response.json()
        except ValueError:
            # Same handling as fetch_categories: one malformed page must not
            # raise and throw away the pages already collected.
            print("Failed to decode JSON.")
            break
        fetched_topics = data.get("topic_list", {}).get("topics", [])
        if not fetched_topics:
            # An empty page marks the end of the listing.
            break
        topics.extend(fetched_topics)
        page += 1
    return topics

def fetch_topic_details(base_url, topic_id):
    """Download the JSON document for a single topic.

    Args:
        base_url: Root URL of the forum, without a trailing slash.
        topic_id: Identifier of the topic; requested as
            ``{base_url}/t/{topic_id}.json``.

    Returns:
        The decoded JSON body, or ``None`` when the server answers with a
        status other than 200.

    NOTE(review): the endpoint may return only part of a long topic's posts
    in one response -- confirm that long threads are captured in full.
    """
    topic_url = f"{base_url}/t/{topic_id}.json"
    response = requests.get(topic_url)

    # Pause half a second after every request, successful or not.
    time.sleep(0.5)

    return response.json() if response.status_code == 200 else None


def organize_post_data(base_url, topic_details, category_name):
    """Flatten one topic's JSON into a list of per-post records.

    Args:
        base_url: Root URL of the forum; used to build each post's URL.
        topic_details: Decoded topic JSON. Posts are read from
            ``post_stream -> posts``; ``id``, ``title`` and ``tags`` are read
            from the top level.
        category_name: Name of the category, copied onto every record.

    Returns:
        One dict per post, in the order the posts appear, with the keys
        ``text``, ``title``, ``author``, ``tags``, ``category``, ``topic_id``,
        ``url`` and ``responses``. ``responses`` lists ``{"text", "author"}``
        for every post in the same topic whose ``reply_to_post_number`` equals
        this post's ``post_number``. A post without a ``post_number`` gets an
        empty ``responses`` list.
    """
    posts = topic_details.get("post_stream", {}).get("posts", [])

    # Index replies by the post number they answer in a single pass, rather
    # than rescanning every post for each post. Posts that reply to nothing
    # (None) are skipped so a post lacking "post_number" is never matched
    # against them through None == None.
    replies_by_parent = {}
    for reply in posts:
        parent_number = reply.get("reply_to_post_number")
        if parent_number is None:
            continue
        replies_by_parent.setdefault(parent_number, []).append({
            "text": reply.get("cooked", ""),
            "author": reply.get("username", "")
        })

    posts_data = []
    for post in posts:
        post_number = post.get("post_number")
        posts_data.append({
            "text": post.get("cooked", ""),
            "title": topic_details.get("title", ""),
            "author": post.get("username", ""),
            "tags": topic_details.get("tags", []),
            "category": category_name,  # Include category name
            "topic_id": topic_details.get("id"),  # Include topic ID
            "url": f"{base_url}/t/{topic_details.get('id')}/{post_number}",
            "responses": list(replies_by_parent.get(post_number, [])),
        })
    return posts_data


In [9]:
def ingest_data(base_url):
    """Crawl the whole forum and return one flat list of post records.

    Walks every category returned by ``fetch_categories``, every topic that
    ``fetch_topics`` lists for it, and fetches each topic's details with
    ``fetch_topic_details``. Topics whose details come back empty or ``None``
    are skipped.

    Args:
        base_url: Root URL of the forum, without a trailing slash.

    Returns:
        The concatenation, in crawl order, of the records produced by
        ``organize_post_data`` for every topic that could be fetched.
    """
    collected = []
    for category in fetch_categories(base_url):
        slug = category.get('slug')
        name = category.get('name')
        for topic in fetch_topics(base_url, slug):
            details = fetch_topic_details(base_url, topic.get('id'))
            if not details:
                # Nothing usable came back for this topic; move on.
                continue
            collected.extend(organize_post_data(base_url, details, name))
    return collected


In [10]:
# Root URL of the forum to crawl (no trailing slash: the fetch helpers append "/...").
arb_base_url = "https://forum.arbitrum.foundation"

In [11]:
# Full crawl: every category -> every topic -> every post. Network-bound and
# slow, since fetch_topic_details pauses 0.5 s after each topic request.
data = ingest_data(arb_base_url)

In [34]:
# Spot-check one collected record (index 420 looks like an arbitrary sample).
print(data[420])

{'text': '<p>R&amp;D is important for Arbitrum DAO and will have many benefits in the long term, but it is important to look at the ROI on the investment made and priority. The ARB proposed are at a relatively higher side and also it may bring lot of negative energy and power in the hands of few. I would love to see the clear framework with accountabilities to decide on.</p>', 'title': "Proposal [Non-Constitutional]: Establish the 'Arbitrum Research & Development Collective'", 'author': 'bubli.eth', 'tags': [], 'category': 'Proposals', 'topic_id': 19899, 'url': 'https://forum.arbitrum.foundation/t/19899/7', 'responses': [{'text': '<p>Hey <a class="mention" href="/u/bubli.eth">@bubli.eth</a> ,</p>\n<p>The proposal contains several legally enforceable checks &amp; balances together with several powers attributed to the DAOAdvocate (the only member seat that has legally enforceable powers), re. the regulation of the ARDC (Refer to sections on checks and balances, provisions, operational p

In [33]:
# Total number of post records collected by the crawl.
print(len(data))

6125


In [32]:
# Wrap every collected post record under a single top-level "posts" key.
posts_json = {"posts": data}
posts_json_string = json.dumps(posts_json, indent=4)

# Write the already-serialized text verbatim. Passing this string back through
# json.dump() would encode it a second time, leaving the file with one large
# quoted, escaped JSON string instead of a JSON object.
os.makedirs('data', exist_ok=True)  # open() does not create missing directories
with open('data/all_arb_forum_posts.json', 'w', encoding='utf-8') as file:
    file.write(posts_json_string)

