# 01 - Data Cleaning and Parsing

In [6]:
import os
import json
import pandas as pd
from datetime import datetime
from dateutil import parser
import re
import numpy as np

## Load JSON dream files from disk

In [7]:
from google.colab import drive
drive.mount('/content/drive')
json_dir = "/content/drive/MyDrive/dreams" # Adjust this path as needed

dreams = []
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
                dreamer = data.get("dreamer", "unknown")
                description = data.get("description", "")
                for entry in data.get("dreams", []):
                    dreams.append({
                        "dream_id": f"{dreamer}_{entry.get('number')}",
                        "dreamer": dreamer,
                        "number": entry.get("number"),
                        "description": description,
                        "date_raw": entry.get("head"),
                        "content": entry.get("content")
                    })
            except Exception as e:
                print(f"Error parsing {filename}: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Convert to DataFrame and clean the dream text

In [8]:
df = pd.DataFrame(dreams)

def clean_text(text):
    return re.sub(r"\s+", " ", text).strip()

df["text_clean"] = df["content"].fillna("").apply(clean_text)


In [9]:
def force_1900s(date_str):
    # If two-digit year, assume 1900s
    if isinstance(date_str, str):
        match = re.search(r"\b(\d{1,2})[/-](\d{1,2})[/-](\d{2})\b", date_str)
        if match:
            day, month, year = match.groups()
            if int(year) >= 20:  # 20–99 → 1920–1999
                date_str = re.sub(r"(\d{1,2}[/-]\d{1,2}[/-])(\d{2})\b", r"\g<1>19\2", date_str)
            else:  # 00–19 → 2000–2019
                date_str = re.sub(r"(\d{1,2}[/-]\d{1,2}[/-])(\d{2})\b", r"\g<1>20\2", date_str)
    return date_str

def parse_date_flexibly(date_str):
    if pd.isna(date_str) or not str(date_str).strip():
        return pd.NaT
    try:
        adjusted = force_1900s(date_str)
        return parser.parse(adjusted, fuzzy=True, default=pd.Timestamp("1900-01-01"))
    except Exception:
        return pd.NaT

def extract_year_if_possible(date_str):
    match = re.search(r"(19|20)\d{2}", str(date_str))
    if match:
        return int(match.group(0))
    return np.nan



df["date_parsed"] = df["date_raw"].apply(parse_date_flexibly)
df["year"] = df["date_raw"].apply(extract_year_if_possible)


## Save processed data

In [10]:
os.makedirs("data/processed", exist_ok=True)
df.to_csv("data/processed/cleaned_dreams.csv", index=False)

df.head()

Unnamed: 0,dream_id,dreamer,number,description,date_raw,content,text_clean,date_parsed,year
0,b_0000,b,0,Barb Sanders,05/03/60,"I had the neatest dream about Blake, me, Reta ...","I had the neatest dream about Blake, me, Reta ...",1960-05-03 00:00:00,
1,b_0001,b,1,Barb Sanders,05/04/60,I had another neat dream about Blake.,I had another neat dream about Blake.,1960-05-04 00:00:00,
2,b_0002,b,2,Barb Sanders,07/16/60,I had a dream that Nate came back and I felt j...,I had a dream that Nate came back and I felt j...,1960-07-16 00:00:00,
3,b_0003,b,3,Barb Sanders,08/04/60,"For the second night in a row, I dreamed of Jo...","For the second night in a row, I dreamed of Jo...",1960-08-04 00:00:00,
4,b_0004,b,4,Barb Sanders,12/02/60,I Didn't dream as last night before I woke up!...,I Didn't dream as last night before I woke up!...,1960-12-02 00:00:00,
