# Set Up for Project Imports


In [None]:
#import sys
#from pathlib import Path

In [None]:
#source_directory = Path.cwd()
#ROOT = source_directory.parent
#if str(ROOT) not in sys.path:
#    sys.path.insert(0, str(ROOT))

In [None]:
## Auto-reload code changes
#%load_ext autoreload
#%autoreload 2

# Imports

In [None]:
from warnings import catch_warnings

from data.api import UcIrvineAPI, UcIrvineDatasetIDs, BureauEconomicAnalysisAPI
import pandas
import json

In [None]:
# Notebook-wide pandas settings.
pandas.options.display.max_colwidth = None  # never truncate the text shown in a cell
#pandas.set_option("display.max_rows", 100_000)
pandas.set_option("mode.copy_on_write", True)

# UcIrvine Data

In [None]:
# Fetch the dataset identified by UcIrvineDatasetIDs.Apartment_For_Rent_Classified
# through the project's UcIrvineAPI wrapper.
uci = UcIrvineAPI.fetch_dataset(repo_id=UcIrvineDatasetIDs.Apartment_For_Rent_Classified.value)

In [None]:
# Take the dataset's original table and turn its index into a regular column.
# NOTE(review): presumably this is what exposes the 'id' column read by the
# cleaning cell below — confirm against the fetched data.
uci_df:pandas.DataFrame = uci.data.original.reset_index()

In [None]:
# TODO: move these helpers back to data/wrangling_utils.py
import re


class BadDataException(ValueError, TypeError):
    """Signal that a raw value could not be cleaned.

    Subclasses both ValueError and TypeError, so a handler written for either
    built-in also catches it.

    Args:
        value: The offending raw value; kept on ``self.value`` so handlers
            can log it.
        message: Optional human-readable explanation. When omitted, a message
            naming the value is generated.
    """

    def __init__(self, value, message=None):
        # Handlers read this back to record what was rejected.
        self.value = value
        if message is None:
            # Without a default, str(exc) would be the literal text "None".
            message = f"bad data value: {value!r}"
        super().__init__(message)


# Per-column log of rejected raw values: one independent, initially empty
# list per dataset column, keyed by column name.
BAD_DATA = {
    column: []
    for column in (
        "id",
        "category",
        "title",
        "body",
        "amenities",
        "bathrooms",
        "bedrooms",
        "currency",
        "fee",
        "has_photo",
        "pets_allowed",
        "price",
        "price_display",
        "price_type",
        "square_feet",
        "address",
        "cityname",
        "state",
        "latitude",
        "longitude",
        "source",
        "time",
    )
}

# Two-letter state/territory codes; a "city" consisting of just one of these
# is treated as invalid.
US_STATE_ABBR = set(
    "AL AK AZ AR CA CO CT DE FL GA HI ID IL IN IA KS KY LA ME MD MA MI "
    "MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT "
    "VT VA WA WV WI WY DC PR VI GU AS MP".split()
)

# Regexes describing content that is not acceptable in a city field.
# URL scheme, "www." prefix, or a common top-level domain suffix (case-insensitive).
URL_PAT = re.compile(
    r"(?:https?://|www\.|\.(?:com|net|org|edu|gov|io|co|us)\b)",
    re.IGNORECASE,
)
# Two numbers separated by a comma or a space: "40.7, -73.9" or "40.7 -73.9".
COORD_PAIR = re.compile(r"^\s*-?\d+(?:\.\d+)?\s*[, ]\s*-?\d+(?:\.\d+)?\s*$")
# A single optionally-signed, optionally-decimal number.
NUM_ONLY = re.compile(r"^\s*-?\d+(?:\.\d+)?\s*$")
# Letters, spaces, periods, apostrophes and hyphens only.
ALLOWED_CHARS = re.compile(r"^[A-Za-z .'\-]+$")

# Leading city-name abbreviations (with and without a trailing period) and
# the full word each expands to.
ABBREV_MAP = {
    abbreviation + suffix: full_word
    for abbreviation, full_word in (("st", "saint"), ("ft", "fort"), ("mt", "mount"))
    for suffix in ("", ".")
}

def _expand_leading_abbrev(s: str) -> str:
    """Expand an abbreviation at the start of ``s`` (e.g. "St Louis" -> "saint Louis").

    Only the first whitespace-separated token is looked up in ``ABBREV_MAP``,
    after lower-casing it and trimming "." and "," from its ends. The tokens
    are re-joined with single spaces. A string with no tokens is returned
    unchanged.
    """
    words = s.split()
    if words:
        lookup_key = words[0].lower().strip(".,")
        words[0] = ABBREV_MAP.get(lookup_key, words[0])
        return " ".join(words)
    return s

def clean_id(x):
    """Convert ``x`` to an int.

    Returns:
        The integer value, or None when ``int(x)`` fails; in that case the
        raw value is appended to ``BAD_DATA["id"]``.
    """
    try:
        identifier = int(x)
    except Exception:
        BAD_DATA["id"].append(x)
        return None
    return identifier

def clean_category(x):
    """Split a '/'-separated category path into lower-cased segments.

    A segment that starts with "ousing" or "ing" has that prefix replaced by
    "housing". If any segment equals "2" the whole value is rejected.

    Returns:
        The list of segments, or None when the value is rejected; rejected
        values are appended to ``BAD_DATA["category"]``.
    """
    try:
        x = str(x).lower().strip()

        segments = []
        for piece in x.split('/'):
            segments.append(re.sub(r'^(ousing|ing)', 'housing', piece.strip()))

        if '2' in segments:
            raise BadDataException(segments)
        return segments

    except BadDataException as e:
        BAD_DATA["category"].append(e.value)
        return None
    except Exception:
        BAD_DATA["category"].append(x)
        return None

def clean_title(x):
    """Return ``x`` converted to text with ``str``.

    If the conversion raises, the value is appended to ``BAD_DATA["title"]``
    and None is returned.
    """
    try:
        text = str(x)
    except BadDataException as e:
        BAD_DATA["title"].append(e.value)
        return None
    except Exception:
        BAD_DATA["title"].append(x)
        return None
    else:
        return text


def clean_body(x):
    """Return ``x`` converted to text with ``str``.

    If the conversion raises, the value is appended to ``BAD_DATA["body"]``
    and None is returned.
    """
    try:
        return str(x)
    except BadDataException as e:
        BAD_DATA["body"].append(e.value)
        return None
    except Exception:
        # Same fallback as the other cleaners: log the raw value rather than
        # letting one odd row abort the whole column.
        BAD_DATA["body"].append(x)
        return None


def clean_amenities(x):
    """Split an amenities string into a list of lower-cased names.

    The text is split on both '/' and ','; each piece is trimmed and empty
    pieces are dropped. A value containing a piece equal to "nan" is rejected.

    Returns:
        The list of amenity names (possibly empty), or None when the value is
        rejected; rejected values are appended to ``BAD_DATA["amenities"]``.
    """
    try:
        x = str(x).lower().strip()

        trimmed = [
            piece.strip()
            for chunk in x.split('/')
            for piece in chunk.split(',')
        ]
        names = [name for name in trimmed if name]

        if 'nan' in names:
            raise BadDataException(names, 'nan')
        return names

    except BadDataException as e:
        BAD_DATA["amenities"].append(e.value)
        return None
    except Exception:
        BAD_DATA["amenities"].append(x)
        return None

def clean_bathrooms(x):
    """Convert a bathroom count to an int.

    The values "nan", "no" and "thumbnail" (any case) are rejected outright.
    Everything else goes through ``int(float(...))``, so "2.0" becomes 2.

    Returns:
        The integer count, or None when the value is rejected or cannot be
        converted; such values are appended to ``BAD_DATA["bathrooms"]``.
    """
    try:
        text = str(x).strip().lower()
        if text in ("nan", "no", "thumbnail"):
            raise BadDataException(x)
        # NOTE(review): int() truncates, so "1.5" becomes 1 — confirm intended.
        count = int(float(text))
    except BadDataException as e:
        BAD_DATA["bathrooms"].append(e.value)
        return None
    except Exception:
        BAD_DATA["bathrooms"].append(x)
        return None
    else:
        return count

def clean_bedrooms(x):
    """Convert a bedroom count to an int.

    The values "nan", "no", "thumbnail" and "cats,dogs" (any case) are
    rejected outright. Everything else goes through ``int(float(...))``, so
    "2.0" becomes 2.

    Returns:
        The integer count, or None when the value is rejected or cannot be
        converted; such values are appended to ``BAD_DATA["bedrooms"]``.
    """
    try:
        text = str(x).strip().lower()
        if text in ("nan", "no", "thumbnail", "cats,dogs"):
            raise BadDataException(x)
        # NOTE(review): int() truncates any fractional part — confirm intended.
        count = int(float(text))
    except BadDataException as e:
        BAD_DATA["bedrooms"].append(e.value)
        return None
    except Exception:
        BAD_DATA["bedrooms"].append(x)
        return None
    else:
        return count

def clean_currency(x):
    """Normalise a currency code to upper case and check that it is accepted.

    Only "USD" is accepted.

    Returns:
        The upper-cased code, or None when it is not accepted; rejected
        values are appended to ``BAD_DATA["currency"]``.
    """
    try:
        code = str(x).strip().upper()
        accepted = ("USD",)
        if code in accepted:
            return code
        raise BadDataException(x)

    except BadDataException as e:
        BAD_DATA["currency"].append(e.value)
        return None
    except Exception:
        BAD_DATA["currency"].append(x)
        return None

def clean_fee(x):
    """Map a yes/no fee flag to a bool.

    Returns:
        True for "yes", False for "no" (case-insensitive, surrounding
        whitespace ignored), or None for anything else; rejected values are
        appended to ``BAD_DATA["fee"]``.
    """
    try:
        answer = str(x).strip().lower()
        flags = {"yes": True, "no": False}
        if answer not in flags:
            raise BadDataException(x)
        return flags[answer]

    except BadDataException as e:
        BAD_DATA["fee"].append(e.value)
        return None

    except Exception:
        BAD_DATA["fee"].append(x)
        return None


def clean_has_photo(x):
    """Map the photo indicator to a bool.

    Returns:
        True for "yes" or "thumbnail", False for "no" (case-insensitive,
        surrounding whitespace ignored), or None for anything else; rejected
        values are appended to ``BAD_DATA["has_photo"]``.
    """
    try:
        answer = str(x).strip().lower()
        flags = {"yes": True, "thumbnail": True, "no": False}
        if answer not in flags:
            raise BadDataException(x)
        return flags[answer]

    except BadDataException as e:
        BAD_DATA["has_photo"].append(e.value)
        return None
    except Exception:
        BAD_DATA["has_photo"].append(x)
        return None

def clean_pets_allowed(x):
    """Collapse a pets-allowed value into one of four labels.

    The text is lower-cased and split on ',' and '/'. The result depends on
    which of the tokens "cats", "dogs" and "none" are present.

    Returns:
        "Cats&Dogs", "Cats", "Dogs", or "X" (the label for "none"); None when
        the value is "nan", purely numeric, or contains none of the
        recognised tokens. Every rejected value is appended to
        ``BAD_DATA["pets_allowed"]``.
    """
    try:
        val = str(x).strip().lower()

        # Missing ("nan") or purely numeric values are bad data.
        if val == "nan" or val.isnumeric():
            raise BadDataException(x)

        tokens = {t.strip() for t in re.split(r"[,/]", val)}

        if "cats" in tokens and "dogs" in tokens:
            return "Cats&Dogs"
        if "cats" in tokens:
            return "Cats"
        if "dogs" in tokens:
            return "Dogs"
        if "none" in tokens:
            # Sentinel string for "no pets"; deliberately not Python None,
            # which this function returns only for bad data.
            return "X"

        # Anything unrecognised used to fall off the end and return None
        # without being logged; record it like every other rejection.
        raise BadDataException(x)

    except BadDataException as e:
        BAD_DATA["pets_allowed"].append(e.value)
        return None
    except Exception:
        BAD_DATA["pets_allowed"].append(x)
        return None

def clean_price(x):
    """Convert a price to a float.

    Returns:
        The float value, or None when ``float(x)`` fails; in that case the
        raw value is appended to ``BAD_DATA["price"]``.
    """
    try:
        amount = float(x)
    except BadDataException as e:
        BAD_DATA["price"].append(e.value)
        return None
    except Exception:
        BAD_DATA["price"].append(x)
        return None
    else:
        return amount

def clean_price_display(x):
    """Extract a numeric price from display text such as "$1,200 Monthly".

    Dollar signs and commas are removed, then every run of digits/periods is
    converted to a float. One number is returned as-is; several numbers
    (e.g. a range) are averaged.

    Returns:
        The price as a float, or None when ``x`` is None, contains no digits,
        or a run cannot be converted; such values are appended to
        ``BAD_DATA["price_display"]``.
    """
    try:
        if x is None:
            print(f"found None {x}")
            raise BadDataException(x)

        stripped = str(x).strip().replace("$", "").replace(",", "").strip()
        amounts = [float(token) for token in re.findall(r"[\d.]+", stripped)]

        if not amounts:
            raise BadDataException(x)
        # With a single amount this is just that amount.
        return sum(amounts) / len(amounts)

    except BadDataException as e:
        BAD_DATA["price_display"].append(e.value)
        return None
    except Exception:
        BAD_DATA["price_display"].append(x)
        return None

def clean_price_type(x):
    """Classify the billing period as "monthly" or "weekly".

    The check is a case-insensitive substring test, with "monthly" taking
    precedence when both words appear.

    Returns:
        "monthly" or "weekly", or None when neither word is present;
        rejected values are appended to ``BAD_DATA["price_type"]``.
    """
    try:
        text = str(x).strip().lower()

        for period in ("monthly", "weekly"):
            if period in text:
                return period

        raise BadDataException(x, " i cant intepret")

    except BadDataException as e:
        BAD_DATA["price_type"].append(e.value)
        return None
    except Exception:
        BAD_DATA["price_type"].append(x)
        return None

def clean_square_feet(x):
    """Convert a square-footage value to a float.

    Only text made of digits with an optional decimal part is accepted
    (no sign, no units, no thousands separators).

    Returns:
        The float value, or None when ``x`` is None or does not match that
        shape; rejected values are appended to ``BAD_DATA["square_feet"]``.
    """
    try:
        if x is None:
            raise BadDataException(x)

        text = str(x).strip()
        if re.fullmatch(r"\d+(?:\.\d+)?", text) is None:
            raise BadDataException(x)
        return float(text)

    except BadDataException as e:
        BAD_DATA["square_feet"].append(e.value)
        return None
    except Exception:
        BAD_DATA["square_feet"].append(x)
        return None

def clean_address(x):
    """Validate a street address and tidy its whitespace and edge punctuation.

    A value is rejected when it is None, blank, "none"/"nan", looks like a
    coordinate pair, is made only of digits/periods/commas/spaces, contains
    "square" or "sq", or lacks either a digit or a letter.

    Returns:
        The address with whitespace runs collapsed to single spaces and
        leading/trailing " ,.;-" removed, or None when rejected; rejected
        values are appended to ``BAD_DATA["address"]``.
    """
    try:
        #TODO
        if x is None:
            raise BadDataException(x)

        val = str(x).strip()
        lowered = val.lower()

        # Blank or placeholder text.
        if lowered in {"", "none", "nan"}:
            raise BadDataException(val)

        # Coordinate-like text such as "40.2659 -77.4948".
        if re.fullmatch(r"^-?\d+(\.\d+)?\s*[,\s]\s*-?\d+(\.\d+)?$", val):
            raise BadDataException(val)

        # Numbers only, or something that mentions square footage.
        # NOTE(review): the plain "sq" substring test also rejects any address
        # that merely contains those two letters — confirm intended.
        numbers_only = re.fullmatch(r"[\d., ]+$", val)
        if numbers_only or "square" in lowered or "sq" in lowered:
            raise BadDataException(val)

        # An address needs at least one digit and one letter.
        if re.search(r"\d", val) is None or re.search(r"[A-Za-z]", val) is None:
            raise BadDataException(val)

        return re.sub(r"\s+", " ", val).strip(" ,.;-")

    except BadDataException as e:
        BAD_DATA["address"].append(e.value)
        return None
    except Exception:
        BAD_DATA["address"].append(x)
        return None

def clean_city_name(x):
    """Validate and normalise a city name.

    A value is rejected when it is None, blank, a placeholder ("nan", "none",
    "null", "n/a"), matches ``URL_PAT``, ``COORD_PAIR`` or ``NUM_ONLY``, is
    just a code from ``US_STATE_ABBR``, contains a digit, or contains a
    character outside ``ALLOWED_CHARS``.

    Accepted values have whitespace collapsed, edge punctuation removed, a
    leading abbreviation expanded via ``_expand_leading_abbrev``, and each
    word capitalised (all-caps words of up to three letters are left alone).

    Returns:
        The normalised name with single spaces between words, or None when
        rejected; rejected values are appended to ``BAD_DATA["cityname"]``.
    """
    try:
        # Missing value
        if x is None:
            raise BadDataException(None)

        raw = str(x)
        s = raw.strip()
        if s == "" or s.lower() in {"nan", "none", "null", "n/a"}:
            raise BadDataException(raw)

        # Hard rejections: urls/domains, coordinates, pure numbers, pure state codes
        if URL_PAT.search(s):
            raise BadDataException(raw)
        if COORD_PAIR.match(s) or NUM_ONLY.match(s):
            raise BadDataException(raw)
        if s.upper() in US_STATE_ABBR:
            raise BadDataException(raw)

        # Reject if it contains digits (city names should not contain numerals)
        if any(ch.isdigit() for ch in s):
            raise BadDataException(raw)

        # Keep only plausible characters
        if not ALLOWED_CHARS.match(s):
            raise BadDataException(raw)

        # Normalize internal whitespace & punctuation spacing
        s = re.sub(r"\s+", " ", s).strip(" ,.;-")

        # Expand leading abbreviations like St./Ft./Mt.
        s = _expand_leading_abbrev(s)

        # Capitalise word by word. Splitting on plain whitespace (not on a
        # capturing pattern) matters: re.split(r"(\s+)", ...) also returns
        # the separators, and joining those with " " produced three spaces
        # between words ("Saint   Louis").
        # NOTE(review): str.capitalize lower-cases the rest of each word, so
        # "O'Fallon" becomes "O'fallon" — confirm whether that is acceptable.
        s = " ".join(
            part if part.isupper() and len(part) <= 3  # keep short all-caps (e.g., "DC") as-is
            else part.capitalize()
            for part in s.split()
        )

        # Final sanity: must contain letters and at least 2 characters
        if not re.search(r"[A-Za-z]", s) or len(s) < 2:
            raise BadDataException(raw)

        return s

    except BadDataException as e:
        BAD_DATA["cityname"].append(e.value)
        return None
    except Exception:
        BAD_DATA["cityname"].append(x)
        return None

def clean_state(x):
    """Return the state value converted to text with ``str``.

    If the conversion raises, the value is appended to ``BAD_DATA["state"]``
    and None is returned.
    """
    # TODO: no validation yet beyond the str() conversion.
    # The per-row debug print was removed; it emitted one line per record.
    try:
        return str(x)
    except BadDataException as e:
        BAD_DATA["state"].append(e.value)
        return None
    except Exception:
        # Same fallback as the other cleaners.
        BAD_DATA["state"].append(x)
        return None


def clean_latitude(x):
    """Convert a latitude to a float.

    Returns:
        The float value, or None when ``float(x)`` fails; in that case the
        raw value is appended to ``BAD_DATA["latitude"]``.
    """
    # TODO: no validation yet beyond the float() conversion.
    # The per-row debug print was removed; it emitted one line per record.
    try:
        return float(x)
    except BadDataException as e:
        BAD_DATA["latitude"].append(e.value)
        return None
    except Exception:
        # float() raises ValueError/TypeError on bad input; previously those
        # escaped and aborted the whole column instead of being logged.
        BAD_DATA["latitude"].append(x)
        return None


def clean_longitude(x):
    """Convert a longitude to a float.

    Returns:
        The float value, or None when ``float(x)`` fails; in that case the
        raw value is appended to ``BAD_DATA["longitude"]``.
    """
    # TODO: no validation yet beyond the float() conversion.
    # The per-row debug print was removed; it emitted one line per record.
    try:
        return float(x)
    except BadDataException as e:
        BAD_DATA["longitude"].append(e.value)
        return None
    except Exception:
        # float() raises ValueError/TypeError on bad input; previously those
        # escaped and aborted the whole column instead of being logged.
        BAD_DATA["longitude"].append(x)
        return None


def clean_source(x):
    """Return the source value converted to text with ``str``.

    If the conversion raises, the value is appended to ``BAD_DATA["source"]``
    and None is returned.
    """
    # TODO: no validation yet beyond the str() conversion.
    # The per-row debug print was removed; it emitted one line per record.
    try:
        return str(x)
    except BadDataException as e:
        BAD_DATA["source"].append(e.value)
        return None
    except Exception:
        # Same fallback as the other cleaners.
        BAD_DATA["source"].append(x)
        return None


def clean_time(x):
    """Return the time value converted to text with ``str``.

    If the conversion raises, the value is appended to ``BAD_DATA["time"]``
    and None is returned.
    """
    # TODO: no validation or parsing yet beyond the str() conversion.
    # The per-row debug print was removed; it emitted one line per record.
    try:
        return str(x)
    except BadDataException as e:
        BAD_DATA["time"].append(e.value)
        return None
    except Exception:
        # Same fallback as the other cleaners.
        BAD_DATA["time"].append(x)
        return None


# Output columns in order, each paired with the cleaner applied to the raw
# column of the same name. A cleaner of None copies the raw column through.
# Commented-out entries are columns that are not produced yet.
_column_cleaners = [
    ("id", clean_id),
    ("category", clean_category),
    ("title", None),  # clean_title not applied
    ("body", None),  # clean_body not applied
    ("amenities", clean_amenities),
    ("bathrooms", clean_bathrooms),
    ("bedrooms", clean_bedrooms),
    ("currency", clean_currency),
    ("fee", clean_fee),
    ("has_photo", clean_has_photo),
    ("pets_allowed", clean_pets_allowed),
    ("price", clean_price),
    ("price_display", clean_price_display),
    ("price_type", clean_price_type),
    ("square_feet", clean_square_feet),
    # ("address", clean_address),
    ("cityname", clean_city_name),
    # ("state", clean_state),
    # ("latitude", clean_latitude),
    # ("longitude", clean_longitude),
    # ("source", clean_source),
    # ("time", clean_time),
]

cleaned_uci_df = pandas.DataFrame()
for _column, _cleaner in _column_cleaners:
    _raw_column = uci_df[_column]
    cleaned_uci_df[_column] = _raw_column if _cleaner is None else _raw_column.apply(_cleaner)
#cleaned_uci_df

In [None]:
# Inspect how often each cleaned value occurs in one column, counting
# missing values too (dropna=False).
pandas.set_option("display.max_rows", 100_000) # toggle: comment this out to stop raising the row limit
#pandas.reset_option("display.max_rows") # toggle: uncomment to restore the default row limit
cleaned_uci_df['cityname'].value_counts(dropna=False)# change the column name to inspect another column

In [None]:
#DONT DELETE MIGHT NEED
#s = cleaned_uci_df['bathrooms'].explode()
#global_counts = s.value_counts().to_dict()
#global_counts

In [None]:
# Dump the raw values rejected for one column; change the key to inspect
# another column. The f-string uses double quotes outside so the
# single-quoted key inside it also parses on Python versions before 3.12.
print(f"BAD_DATA: {json.dumps(BAD_DATA['cityname'], indent=2)}")

In [None]:
from data.wrangling_utils import STATE_MAP

# Look up each upper-cased state value in STATE_MAP; values that are not
# keys of the map become NaN.
# NOTE(review): STATE_MAP presumably maps abbreviations to full state names
# (the result is later joined on "GeoName") — confirm in data.wrangling_utils.
uci_df["state_full"] = uci_df["state"].str.upper().map(STATE_MAP)
print(uci_df.shape)  # shape before dropping unmapped rows
# Drop, in place, every row whose state had no entry in STATE_MAP.
uci_df.dropna(subset=["state_full"], inplace=True)
uci_df.shape  # shape after the drop (shown as the cell output)

# Bureau of Economic Analysis Data

In [None]:
# Request table 'SARPP', line code '1', for GeoFips='STATE' and year 2019
# from the BEA 'Regional' dataset via the project's API wrapper.
# NOTE(review): SARPP / LineCode=1 presumably selects state-level regional
# price parities — confirm against the BEA API documentation.
bea_df = BureauEconomicAnalysisAPI.fetch_dataset('Regional', GeoFips='STATE', TableName='SARPP', Year='2019',
                                                 LineCode='1')

In [None]:
# Merge Data

In [None]:
# Inner-join the listings with the BEA rows: a listing's "state_full" must
# equal a BEA row's "GeoName" for the pair to be kept.
merged = uci_df.merge(
    bea_df,
    how="inner",
    left_on="state_full",
    right_on="GeoName",
)
merged.shape