In [1]:
import json
import re
from pathlib import Path
import pandas as pd

import re

In [2]:
def load_jsonl_to_df(path):
    """Read a JSON Lines file into a DataFrame, one row per JSON document.

    Parameters
    ----------
    path : str or os.PathLike
        Location of a UTF-8 encoded file with one JSON value per line.

    Returns
    -------
    pandas.DataFrame
        Frame built from the decoded records, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank / whitespace-only lines (e.g. a trailing newline at the end
            # of the file) are not JSON documents; json.loads would raise on
            # them, so skip them instead of aborting the whole load.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

In [3]:


# Raw scrape to clean (read line by line as JSONL) and the file the cleaned
# records are written to. Both are relative paths; a later cell (In [6])
# rebinds the same two names to a second pair of files.
INPUT = Path("./../att_3/seems-good/reviews_newest_1800ish.jsonl")
OUTPUT = Path("./man_cleaned/reviews_newest_clean.jsonl")

# Long-form date such as "December 21, 2025": full English month name,
# 1-2 digit day, a comma, then a 4-digit year. Case-sensitive.
date_pattern = re.compile(
    r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}'
)

def parse_entry(entry):
    """Split one scraped record into username, date and review text.

    The record's ``"text"`` value is split on newlines; blank lines are
    dropped and the remaining lines are stripped. The first line is taken as
    the username. The first line that *starts with* a date (per
    ``date_pattern``) marks the end of the header: its date becomes ``date``
    and everything after it is joined with single spaces into the review
    text. A trailing ``"Like Comment"`` and then a trailing 4-digit ``20xx``
    year are removed from the end of the review text (presumably scrape
    artifacts -- confirm against the raw data).

    Parameters
    ----------
    entry : dict
        Decoded JSON record; only the ``"text"`` key is read.

    Returns
    -------
    dict or None
        ``{"username": str, "date": str | None, "review_text": str}``, or
        ``None`` when the record has no usable text (missing, null,
        non-string, or whitespace only). When no line starts with a date,
        ``date`` is ``None`` and ``review_text`` contains every line
        (including the first), so callers can flag the row.
    """
    # `entry.get("text", "")` returned None for an explicit JSON null and then
    # crashed on `.split`; treat any missing / non-string text as "no data".
    raw = entry.get("text") if isinstance(entry, dict) else None
    if not isinstance(raw, str):
        return None

    lines = [stripped for stripped in (part.strip() for part in raw.split("\n")) if stripped]
    if not lines:
        return None

    username = lines[0]

    # Take the date and the header boundary from the SAME line. Previously the
    # date came from a search over the whole text while the boundary required
    # a line starting with a date, so a date merely mentioned elsewhere could
    # be reported as the review date (or leave the username in review_text
    # while still yielding a non-null date).
    date = None
    review_start = 0
    for i, line in enumerate(lines):
        header_date = date_pattern.match(line)
        if header_date:
            date = header_date.group(0)
            review_start = i + 1
            break

    review_text = " ".join(lines[review_start:])

    # Order matters: drop the trailing "Like Comment" first so that a year
    # sitting just before it becomes the new tail and is removed as well.
    review_text = re.sub(r'\s*Like Comment\s*$', '', review_text)
    review_text = re.sub(r'\s*\b20\d{2}\b\s*$', '', review_text)
    review_text = review_text.strip()

    return {
        "username": username,
        "date": date,
        "review_text": review_text
    }

# First pass: parse every record of INPUT with parse_entry and write one JSON
# object per line to OUTPUT, counting rows that look unusable. Problem rows
# are still written so that the following cell can inspect and drop them.

# Opening OUTPUT for writing fails if its folder does not exist yet.
OUTPUT.parent.mkdir(parents=True, exist_ok=True)

bad = 0
# Explicit UTF-8 on both sides: records are dumped with ensure_ascii=False, so
# relying on the platform default encoding can fail or garble non-ASCII text.
with INPUT.open(encoding="utf-8") as fin, OUTPUT.open("w", encoding="utf-8") as fout:
    for line in fin:
        # A blank line (e.g. trailing newline) is not a JSON document.
        if not line.strip():
            continue

        entry = json.loads(line)
        cleaned = parse_entry(entry)

        # parse_entry returns None for records without text. Write an
        # all-null record instead of a bare `null` line so every output line
        # has the same keys and the row is picked up as bad downstream.
        if cleaned is None:
            cleaned = {"username": None, "date": None, "review_text": None}

        # Count only true failures
        if (
            cleaned["username"] is None
            or cleaned["date"] is None
            or cleaned["review_text"] is None
            or cleaned["review_text"].strip() == ""
        ):
            bad += 1

        fout.write(json.dumps(cleaned, ensure_ascii=False) + "\n")

print("Done. Problem rows:", bad)





Done. Problem rows: 1


In [4]:
# Second pass: load the first-pass file, split off unusable rows, and rewrite
# OUTPUT with only the good ones. read_json parses the "date" column into
# datetimes (see the NaT in the printed bad row).
df = pd.read_json(OUTPUT, lines=True)

# Identify truly bad rows: no parsable date, or missing / blank review text.
bad_mask = (
    df.date.isna() |
    df.review_text.isna() |
    (df.review_text.str.strip() == "")
)

print('DF preview: ')
print(df.head())
print()

print('bad row(s): ')
print(df[bad_mask])
print()

# Save bad rows separately (optional)
# NOTE(review): at this point OUTPUT is the "newest" file, yet the rejects go
# to old_bad_rows.jsonl (and the "oldest" run uses new_bad_rows.jsonl) --
# confirm the two names are not swapped.
# date_format="iso": the default for this orient serialises datetimes as epoch
# milliseconds, which turns the date column into opaque integers on disk.
df[bad_mask].to_json(
    "./man_cleaned/old_bad_rows.jsonl",
    orient="records",
    lines=True,
    date_format="iso"
)

# Keep only good rows
df_clean = df[~bad_mask].reset_index(drop=True)

# Save final cleaned dataset (overwrites the first-pass file in place)
df_clean.to_json(
    OUTPUT,
    orient="records",
    lines=True,
    date_format="iso"
)

# Read the file back to verify what actually landed on disk.
df = pd.read_json(OUTPUT, lines=True)
assert len(df) == len(df_clean), "row count changed while saving the cleaned file"
print('Final DF Check: ')
print(df.head())
print()
print()

print("Final rows:", len(df_clean))
print("Deleted rows:", bad_mask.sum())
print()


DF preview: 
      username       date                                        review_text
0        3rian 2025-12-21  I was still in a sci-fi mood after my last rea...
1       katy ♡ 2025-12-21  i loved this. murderbot is just so cutie and f...
2        namey 2025-12-21  felt like a draft, boring plot, somewhat fun a...
3        Keila 2025-12-21  My only peeve is that this book isn't any long...
4  Lynda Kelly 2025-12-20  In my journey to find sci-fi I don’t hate, thi...

bad row(s): 
              username date                                       review_text
30  What do you think?  NaT  What do you think? Rate this book Write a Review

Final DF Check: 
      username       date                                        review_text
0        3rian 2025-12-21  I was still in a sci-fi mood after my last rea...
1       katy ♡ 2025-12-21  i loved this. murderbot is just so cutie and f...
2        namey 2025-12-21  felt like a draft, boring plot, somewhat fun a...
3        Keila 2025-12-21  My

In [5]:
# Show the frame that was re-read from the cleaned OUTPUT file in the previous cell.
df

Unnamed: 0,username,date,review_text
0,3rian,2025-12-21,I was still in a sci-fi mood after my last rea...
1,katy ♡,2025-12-21,i loved this. murderbot is just so cutie and f...
2,namey,2025-12-21,"felt like a draft, boring plot, somewhat fun a..."
3,Keila,2025-12-21,My only peeve is that this book isn't any long...
4,Lynda Kelly,2025-12-20,"In my journey to find sci-fi I don’t hate, thi..."
...,...,...,...
1825,devin,2025-09-03,murderbot is one of the single most relatable ...
1826,Amy Danner,2025-09-03,The show on the other hand is very amusing. Wo...
1827,Lazu,2025-09-03,"Fun quick read, I enjoyed it!"
1828,Connor Vera,2025-09-03,Really fun! I now need to get my hands on the ...


In [6]:
# Same first pass as above, now for the "oldest" scrape: rebind the paths,
# parse every record with parse_entry, and write one JSON object per line.
INPUT = Path("./../att_3/seems-good/reviews_oldest_2000ish.jsonl")
OUTPUT = Path("./man_cleaned/reviews_oldest_clean.jsonl")

# Opening OUTPUT for writing fails if its folder does not exist yet.
OUTPUT.parent.mkdir(parents=True, exist_ok=True)

bad = 0
# Explicit UTF-8 on both sides: records are dumped with ensure_ascii=False, so
# relying on the platform default encoding can fail or garble non-ASCII text.
with INPUT.open(encoding="utf-8") as fin, OUTPUT.open("w", encoding="utf-8") as fout:
    for line in fin:
        # A blank line (e.g. trailing newline) is not a JSON document.
        if not line.strip():
            continue

        entry = json.loads(line)
        cleaned = parse_entry(entry)

        # parse_entry returns None for records without text. Write an
        # all-null record instead of a bare `null` line so every output line
        # has the same keys and the row is picked up as bad downstream.
        if cleaned is None:
            cleaned = {"username": None, "date": None, "review_text": None}

        # Count only true failures
        if (
            cleaned["username"] is None
            or cleaned["date"] is None
            or cleaned["review_text"] is None
            or cleaned["review_text"].strip() == ""
        ):
            bad += 1

        fout.write(json.dumps(cleaned, ensure_ascii=False) + "\n")

print("Done. Problem rows:", bad)

Done. Problem rows: 1


In [7]:
# Second pass for the current OUTPUT: load the first-pass file, split off
# unusable rows, and rewrite OUTPUT with only the good ones. read_json parses
# the "date" column into datetimes (see the NaT in the printed bad row).
df = pd.read_json(OUTPUT, lines=True)

# Identify truly bad rows: no parsable date, or missing / blank review text.
bad_mask = (
    df.date.isna() |
    df.review_text.isna() |
    (df.review_text.str.strip() == "")
)

print('DF preview: ')
print(df.head())
print()

print('bad row(s): ')
print(df[bad_mask])
print()

# Save bad rows separately (optional)
# NOTE(review): at this point OUTPUT is the "oldest" file, yet the rejects go
# to new_bad_rows.jsonl (and the "newest" run uses old_bad_rows.jsonl) --
# confirm the two names are not swapped.
# date_format="iso": the default for this orient serialises datetimes as epoch
# milliseconds, which turns the date column into opaque integers on disk.
df[bad_mask].to_json(
    "./man_cleaned/new_bad_rows.jsonl",
    orient="records",
    lines=True,
    date_format="iso"
)

# Keep only good rows
df_clean = df[~bad_mask].reset_index(drop=True)

# Save final cleaned dataset (overwrites the first-pass file in place)
df_clean.to_json(
    OUTPUT,
    orient="records",
    lines=True,
    date_format="iso"
)

# Read the file back to verify what actually landed on disk.
df = pd.read_json(OUTPUT, lines=True)
assert len(df) == len(df_clean), "row count changed while saving the cleaned file"
print('Final DF Check: ')
print(df.head())
print()
print()

print("Final rows:", len(df_clean))
print("Deleted rows:", bad_mask.sum())
print()

DF preview: 
          username       date  \
0          Jessica 2024-02-22   
1            Silea 2017-05-03   
2         Esteefee 2017-05-03   
3           Ginger 2018-09-02   
4  Garrison Nelson 2017-05-03   

                                         review_text  
0  A great read -- an interesting concept well-ex...  
1  Fast, fun, snarky. Eagerly awaiting Murderbot ...  
2  Murderbot is so incredibly endearing -- I didn...  
3  very satisfying but not nearly long enough! I ...  
4  Murderbot aint too shabby! I truly enjoyed thi...  

bad row(s): 
              username date                                       review_text
30  What do you think?  NaT  What do you think? Rate this book Write a Review

Final DF Check: 
          username       date  \
0          Jessica 2024-02-22   
1            Silea 2017-05-03   
2         Esteefee 2017-05-03   
3           Ginger 2018-09-02   
4  Garrison Nelson 2017-05-03   

                                         review_text  
0  A great read 