In [5]:
import pandas as pd
import json
import networkx as nx
import matplotlib.pyplot as plt

# File path
file_path = '/Users/buddy/Desktop/School/effective-adventure/effective-adventure/data/last_month_posts_comments.json'

# Load JSON data
with open(file_path, 'r') as f:
    data = json.load(f)

# Display data structure
print("Data Loaded Successfully")
print(f"Total Posts: {len(data)}")

Data Loaded Successfully
Total Posts: 1930


In [6]:
# Flatten posts data into a pandas DataFrame
posts = []
for post_id, post_data in data.items():
    posts.append({
        "post_id": post_id,
        "title": post_data.get("title", ""),
        "subreddit": post_data.get("subreddit", ""),
        "score": post_data.get("score", 0),
        "author": post_data.get("author", ""),
        "created_utc": post_data.get("created_utc", 0),
        "num_comments": len(post_data.get("comments", []))
    })

posts_df = pd.DataFrame(posts)

# Explore the data
print(posts_df.info())
print(posts_df.describe())
print(posts_df.head())

# Sort and display top 5 posts by number of comments
top_posts = posts_df.sort_values(by="num_comments", ascending=False).head(5)
print("Top 5 Posts by Number of Comments:")
print(top_posts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1930 entries, 0 to 1929
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   post_id       1930 non-null   object 
 1   title         1930 non-null   object 
 2   subreddit     1930 non-null   object 
 3   score         1930 non-null   int64  
 4   author        1924 non-null   object 
 5   created_utc   1930 non-null   float64
 6   num_comments  1930 non-null   int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 105.7+ KB
None
             score   created_utc  num_comments
count  1930.000000  1.930000e+03   1930.000000
mean     17.266321  1.732006e+09      8.867876
std      58.344631  7.072252e+05     17.592108
min       0.000000  1.730784e+09      0.000000
25%       1.000000  1.731427e+09      1.000000
50%       2.000000  1.731986e+09      4.000000
75%       8.000000  1.732591e+09     10.000000
max    1072.000000  1.733341e+09    360.000000
   post_id     

In [7]:
# Flatten comments data into a pandas DataFrame
comments = []
for post_id, post_data in data.items():
    for comment in post_data.get("comments", []):
        comments.append({
            "comment_id": comment["comment_id"],
            "post_id": post_id,
            "parent_id": comment["parent_id"],
            "author": comment.get("author", ""),
            "content": comment.get("content", ""),
            "upvotes": comment.get("upvotes", 0),
            "created_utc": comment.get("created_utc", 0)
        })

comments_df = pd.DataFrame(comments)

# Explore the data
print(comments_df.info())
print(comments_df.describe())
print(comments_df.head())

# Count comments grouped by posts
comments_count = comments_df.groupby("post_id").size().reset_index(name="num_comments")
print("Top 5 Posts by Comment Count from Comments DataFrame:")
print(comments_count.sort_values(by="num_comments", ascending=False).head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17115 entries, 0 to 17114
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   comment_id   17115 non-null  object 
 1   post_id      17115 non-null  object 
 2   parent_id    17115 non-null  object 
 3   author       17017 non-null  object 
 4   content      17115 non-null  object 
 5   upvotes      17115 non-null  int64  
 6   created_utc  17115 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 936.1+ KB
None
            upvotes   created_utc
count  17115.000000  1.711500e+04
mean       3.010283  1.731997e+09
std        8.496458  6.565785e+05
min     -134.000000  1.730804e+09
25%        1.000000  1.731469e+09
50%        2.000000  1.731965e+09
75%        3.000000  1.732558e+09
max      428.000000  1.733196e+09
  comment_id  post_id   parent_id                author  \
0    lvxc0hc  1glv7mp  t3_1glv7mp        pharmacreation   
1    lvxfns2  1gluaus

In [8]:
# Get the top post by number of comments
top_post_id = top_posts.iloc[0]["post_id"]
top_post_comments = comments_df[comments_df["post_id"] == top_post_id]

# Build a graph to represent the comment tree
comment_graph = nx.DiGraph()

# Add nodes and edges
for _, comment in top_post_comments.iterrows():
    comment_id = comment["comment_id"]
    parent_id = comment["parent_id"]
    comment_graph.add_node(comment_id)  # Add comment as a node
    if parent_id.startswith("t1_"):  # Parent is another comment
        comment_graph.add_edge(parent_id[3:], comment_id)
    elif parent_id.startswith("t3_"):  # Parent is the original post
        comment_graph.add_edge(top_post_id, comment_id)

# Draw the graph
plt.figure(figsize=(12, 8))
pos = nx.nx_agraph.graphviz_layout(comment_graph, prog="dot")
nx.draw(comment_graph, pos, with_labels=False, node_size=50, arrows=False)
plt.title(f"Comment Tree for Post: {top_posts.iloc[0]['title'][:50]}...", fontsize=14)
plt.show()

ImportError: requires pygraphviz http://pygraphviz.github.io/

<Figure size 1200x800 with 0 Axes>

In [9]:
pip install pygraphviz

Collecting pygraphviz
  Downloading pygraphviz-1.14.tar.gz (106 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: pygraphviz
  Building wheel for pygraphviz (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for pygraphviz [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[60 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [0m running build_py
  [31m   [0m creating build/lib.macosx-14.0-arm64-cpython-312/pygraphviz
  [31m   [0m copying pygraphviz/scraper.py -> build/lib.macosx-14.0-arm64-cpython-312/pygraphviz
  [31m   [0m copying pygraphviz/graphviz.py -> build/lib.macosx-14.0-arm64-cpython-312/pygraphviz
  [31m