In [2]:
import pandas as pd
from pathlib import Path

In [3]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 5)
pd.set_option("display.max_colwidth", None)

In [4]:
DATA_DIR = "../../../data/reddit"
SUBREDDIT = "plantclinic"
SIZE = 1000000000
COMMENTS = Path(DATA_DIR) / f"{SUBREDDIT}_comments"
SUBMISSIONS = Path(DATA_DIR) / f"{SUBREDDIT}_submissions"
REDUCED_COMMENTS = COMMENTS.with_suffix(f".l{SIZE}")
REDUCED_SUBMISSIONS = SUBMISSIONS.with_suffix(f".l{SIZE}")

In [5]:
assert COMMENTS.exists()
assert SUBMISSIONS.exists()

In [None]:
! head -{SIZE} {SUBMISSIONS} > {REDUCED_SUBMISSIONS}
! head -{SIZE} {COMMENTS} > {REDUCED_COMMENTS}

In [6]:
df_comments = pd.read_json(REDUCED_COMMENTS, orient="records", lines=True)
df_submissions = pd.read_json(REDUCED_SUBMISSIONS, orient="records", lines=True)

In [7]:
df_s = df_submissions.copy()
df_s = df_s[
    (df_s["num_comments"] > 5)
    & (df_s["title"].str.endswith("?"))
    & (df_s["title"].str.len() > 100)
    & (df_s["title"].str.len() < 1000)
].sort_values(by=["score"], ascending=False)
df_s = df_s[["id", "title"]]
df_s["id"] = "t3_" + df_s["id"]
df_s.shape

(8864, 2)

In [8]:
link_ids = set(df_comments[df_comments["link_id"].isin(df_s["id"])]["link_id"])

In [10]:
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm

df_c = df_comments.copy()

def process_link_id(l_id):
    filtered = df_c[
        (df_c["link_id"] == l_id) & (df_c["parent_id"] == l_id)
    ].sort_values(by=["score"], ascending=False)
    if filtered.shape[0] > 0:
        most_upvoted_comment = filtered.iloc[0]["body"]
        return l_id, most_upvoted_comment
    return None

# Use a ProcessPoolExecutor to parallelize the loop
with ProcessPoolExecutor(max_workers=30) as executor:
    results = list(executor.map(process_link_id, link_ids))

# Filter out None values from the results
results = [result for result in results if result is not None]


In [11]:
for l_id, most_upvoted_comment in results:
    df_s.loc[df_s.id == l_id, "comment"] = most_upvoted_comment

  df_s.loc[df_s.id == l_id, "comment"] = most_upvoted_comment


In [13]:
df_s_final = df_s[df_s["comment"].str.len() > 0]

In [16]:
df_s_final

Unnamed: 0,id,title,comment
229291,t3_o4ltt2,"I forgot I was sprouting mango pits but my eldritch horrors - I mean mangos - sprouted. Enthusiastically. Um. Do I stick just the tentacles, er, roots, in the soil, with them being this spouted? Or do I bury the whole thing (in an unmarked grave and never speak of this incident again)?","I did this before and um... they looked almost that bad. I would stick them in some dirt, they'll eventually figure out they're plants."
110370,t3_i6jspj,"Plant Theives! :( We've now had 4 very sentimental plants stolen off of our patio. Today, my partners prized money tree is gone. Is there a way to make the leaf left behind root and regrow?",upvoted for visibility :(
...,...,...,...
330777,t3_vdtubd,"Two days ago I noticed these black spots on a few of my Graptosedum's leaves. This is a new plant that was repotted a week ago. The roots looked good and there are no spots on the stem. I tried wiping it off, but it doesn't come off. Any ideas as to what this could be?","I had a cactus do this. I think it’s rotting from the inside out, likely over watering. Check this link and see if it matches. \n\nhttps://www.sublimesucculents.com/root-rot-succulents/"
358036,t3_xm7zb2,"Following up after checking roots! From what I see, they’re not rotten but they’re not…abundant. Should I propagate with the offshoot or can I report in fresh soil and try to save her?","Where is the original post? Is this it: [https://www.reddit.com/r/plantclinic/comments/xgzq5x/constantly\_brown\_and\_yellow\_ive\_watered\_more\_and/](https://www.reddit.com/r/plantclinic/comments/xgzq5x/constantly_brown_and_yellow_ive_watered_more_and/)?\n\nWhile you didn't find any root rot, I see broken off roots lying in the soil to the left. The roots likely completely disintegrated.\n\nWhen you repot the plant in this post, use a VERY SMALL unglazed terracotta pot - only large enough for what is left of the roots. They need a fast draining soil, with excellent drainage to thrive. Root rot will take over if planted in too large a pot.\n\nIf that is a dracaena, they have very small and compact root systems, even when 100% healthy. This is an example of a healthy root system: [https://www.nature-and-garden.com/wp-content/uploads/2018/11/dracaena-marginata-roots.jpg](https://www.nature-and-garden.com/wp-content/uploads/2018/11/dracaena-marginata-roots.jpg)"


In [None]:
df_s_final.to_csv("PlantQA_dataset_8864.csv")