# Set Up for Project Imports


In [None]:
#import sys
#from pathlib import Path

In [None]:
#source_directory = Path.cwd()
#ROOT = source_directory.parent
#if str(ROOT) not in sys.path:
#    sys.path.insert(0, str(ROOT))

In [None]:
## Auto-reload code changes
#%load_ext autoreload
#%autoreload 2

# Imports

In [None]:
from warnings import catch_warnings

from data.api import UcIrvineAPI, UcIrvineDatasetIDs, BureauEconomicAnalysisAPI
import pandas
import json

In [None]:
# Notebook-wide pandas settings.
pandas.options.display.max_colwidth = None  # never truncate the text shown in a cell
#pandas.set_option("display.max_rows", 100_000)
pandas.set_option('mode.copy_on_write', True)
# Show floats with two fixed decimals instead of scientific notation.
pandas.options.display.float_format = lambda value: '%.2f' % value

# UcIrvine Data

In [None]:
# Fetch the dataset identified by UcIrvineDatasetIDs.Apartment_For_Rent_Classified
# through the project's UcIrvineAPI helper (imported from data.api).
uci = UcIrvineAPI.fetch_dataset(repo_id=UcIrvineDatasetIDs.Apartment_For_Rent_Classified.value)

In [None]:
# Work on the dataset's `original` frame with a fresh default index.
# NOTE(review): presumably reset_index() is what exposes `id` as a regular
# column (it is read as uci_df['id'] later) — confirm against UcIrvineAPI.
uci_df:pandas.DataFrame = uci.data.original.reset_index()

In [None]:
#TODO MOVE BACK TO WRANGLING_UTILS.PY
import re
from data.wrangling_utils import STATE_MAP

class BadDataException(ValueError, TypeError):
    """Signal that a raw cell value could not be cleaned.

    The class derives from both ``ValueError`` and ``TypeError``, so a handler
    written for either built-in also catches it.

    Attributes:
        value: The offending raw value, kept so handlers can record it.
    """

    def __init__(self, value, message=None):
        """Store *value* and build the exception message.

        Args:
            value: The raw value that failed validation.
            message: Optional explanation. When omitted, a default message
                naming the value is used, so ``str(exc)`` is never the
                literal string ``"None"``.
        """
        self.value = value
        if message is None:
            message = f"bad data: {value!r}"
        super().__init__(message)


# Rejected raw values, grouped by the dataset column they came from.
# Every column starts with its own empty list.
BAD_DATA = {
    column: []
    for column in (
        "id",
        "category",
        "title",
        "body",
        "amenities",
        "bathrooms",
        "bedrooms",
        "currency",
        "fee",
        "has_photo",
        "pets_allowed",
        "price",
        "price_display",
        "price_type",
        "square_feet",
        "address",
        "cityname",
        "state",
        "latitude",
        "longitude",
        "source",
        "time",
    )
}


def clean_id(x):
    """Coerce a raw listing id to ``int``.

    Args:
        x: Raw value from the ``id`` column.

    Returns:
        ``int(x)``, or ``None`` when the conversion raises. In that case *x*
        is appended to ``BAD_DATA["id"]``.
    """
    try:
        listing_id = int(x)
    except Exception:
        # Any conversion failure marks the value as bad data.
        BAD_DATA["id"].append(x)
        return None
    return listing_id


def clean_category(x):
    """Split a raw category path into normalised lower-case segments.

    The value is stringified, lower-cased, stripped and split on ``/``. A
    segment that starts with ``ousing`` or ``ing`` has that prefix replaced
    by ``housing``.

    Args:
        x: Raw value from the ``category`` column.

    Returns:
        The list of cleaned segments, or ``None`` when any segment equals
        ``'2'``. In that case the segment list is appended to
        ``BAD_DATA["category"]``.
    """
    try:
        segments = []
        for raw_segment in str(x).lower().strip().split('/'):
            segments.append(re.sub(r'^(ousing|ing)', 'housing', raw_segment.strip()))

        if '2' in segments:
            raise BadDataException(segments, 'dont know what 2 is')
    except BadDataException as bad:
        BAD_DATA["category"].append(bad.value)
        return None
    return segments


def clean_title(x):
    """Return the raw title as a string.

    No validation is performed, so nothing is ever recorded in
    ``BAD_DATA["title"]``. The former ``except BadDataException`` handler was
    unreachable, because ``str(x)`` never raises that exception.

    Args:
        x: Raw value from the ``title`` column.

    Returns:
        ``str(x)``.
    """
    return str(x)


def clean_body(x):
    """Return the raw listing body as a string.

    No validation is performed, so nothing is ever recorded in
    ``BAD_DATA["body"]``. The former ``except BadDataException`` handler was
    unreachable, because ``str(x)`` never raises that exception.

    Args:
        x: Raw value from the ``body`` column.

    Returns:
        ``str(x)``.
    """
    return str(x)


def clean_amenities(x):
    """Turn a raw amenities string into a list of lower-case amenity names.

    The value is stringified, lower-cased and split on both ``/`` and ``,``.
    Empty pieces are dropped.

    Args:
        x: Raw value from the ``amenities`` column.

    Returns:
        The list of amenity names, or ``None`` when one of them is the string
        ``'nan'``. In that case the list is appended to
        ``BAD_DATA["amenities"]``.
    """
    try:
        text = str(x).lower().strip()

        # '/' and ',' are both separators, so unify them before splitting.
        pieces = (piece.strip() for piece in text.replace('/', ',').split(','))
        amenities = [piece for piece in pieces if piece]

        if 'nan' in amenities:
            raise BadDataException(amenities, 'nan')
    except BadDataException as bad:
        BAD_DATA["amenities"].append(bad.value)
        return None
    return amenities


def clean_bathrooms(x):
    """Convert a raw bathroom count to ``int``.

    The value is stringified, stripped and lower-cased, then parsed with
    ``int(float(...))``, so ``'2.0'`` becomes ``2`` and any fractional part is
    truncated (``'1.5'`` becomes ``1``).
    NOTE(review): truncation drops half-baths — confirm this is intended.

    Args:
        x: Raw value from the ``bathrooms`` column.

    Returns:
        The integer count, or ``None`` when the value is one of the known
        junk markers or cannot be parsed as a number. Rejected values are
        appended to ``BAD_DATA["bathrooms"]``.
    """
    try:
        val = str(x).strip().lower()

        # Known junk markers are rejected up front.
        if val in {"nan", "no", "thumbnail"}:
            raise BadDataException(x)

        return int(float(val))

    # float() raises a plain ValueError for non-numeric text, and int()
    # raises ValueError/OverflowError for nan/inf. Catching these keeps one
    # bad row from aborting the whole apply().
    except (BadDataException, ValueError, OverflowError):
        BAD_DATA["bathrooms"].append(x)
        return None


def clean_bedrooms(x):
    """Convert a raw bedroom count to ``int``.

    The value is stringified, stripped and lower-cased, then parsed with
    ``int(float(...))``, so ``'2.0'`` becomes ``2`` and any fractional part is
    truncated.

    Args:
        x: Raw value from the ``bedrooms`` column.

    Returns:
        The integer count, or ``None`` when the value is one of the known
        junk markers or cannot be parsed as a number. Rejected values are
        appended to ``BAD_DATA["bedrooms"]``.
    """
    try:
        val = str(x).strip().lower()

        # Known junk markers are rejected up front.
        if val in {"nan", "no", "thumbnail", "cats,dogs"}:
            raise BadDataException(x)

        return int(float(val))

    # float() raises a plain ValueError for non-numeric text, and int()
    # raises ValueError/OverflowError for nan/inf. Catching these keeps one
    # bad row from aborting the whole apply().
    except (BadDataException, ValueError, OverflowError):
        BAD_DATA["bedrooms"].append(x)
        return None


def clean_currency(x):
    """Normalise a raw currency code and accept only ``USD``.

    Args:
        x: Raw value from the ``currency`` column.

    Returns:
        The stripped, upper-cased code when it is ``"USD"``. Otherwise
        ``None``, and *x* is appended to ``BAD_DATA["currency"]``.
    """
    accepted_codes = {"USD"}
    code = str(x).strip().upper()  # currency codes are upper-case by convention

    if code in accepted_codes:
        return code

    BAD_DATA["currency"].append(x)
    return None


def clean_fee(x):
    """Map a raw yes/no fee flag to a boolean.

    Args:
        x: Raw value from the ``fee`` column.

    Returns:
        ``True`` for ``"yes"`` and ``False`` for ``"no"``, compared
        case-insensitively after stripping. ``None`` for anything else,
        including values whose normalisation raises. Rejected values are
        appended to ``BAD_DATA["fee"]``.
    """
    try:
        answer = {"yes": True, "no": False}.get(str(x).strip().lower())
    except Exception:
        answer = None

    if answer is None:
        BAD_DATA["fee"].append(x)
    return answer


def clean_has_photo(x):
    """Map the raw photo indicator to a boolean.

    Args:
        x: Raw value from the ``has_photo`` column.

    Returns:
        ``True`` for ``"yes"`` or ``"thumbnail"`` and ``False`` for ``"no"``,
        compared case-insensitively after stripping. ``None`` for anything
        else, in which case *x* is appended to ``BAD_DATA["has_photo"]``.
    """
    flag = str(x).strip().lower()

    if flag == "no":
        return False
    if flag in ("yes", "thumbnail"):
        return True

    # Any other value is unknown and therefore treated as bad data.
    BAD_DATA["has_photo"].append(x)
    return None


def clean_pets_allowed(x):
    """Collapse the raw pets field into one of four category labels.

    The value is stringified, stripped, lower-cased and split on ``,`` or
    ``/``. The resulting tokens decide the label.

    Args:
        x: Raw value from the ``pets_allowed`` column.

    Returns:
        ``"Cats&Dogs"``, ``"Cats"``, ``"Dogs"``, or ``"X"`` when the only
        recognised token is ``none``. ``None`` when the value is ``nan``,
        purely numeric, or contains no recognised token. Rejected values are
        appended to ``BAD_DATA["pets_allowed"]``.
    """
    try:
        val = str(x).strip().lower()

        # Missing or purely numeric values carry no pet information.
        if val == "nan" or val.isnumeric():
            raise BadDataException(x)

        tokens = [t.strip() for t in re.split(r"[,/]", val) if t.strip()]

        has_cats = "cats" in tokens
        has_dogs = "dogs" in tokens
        has_none = "none" in tokens

        if has_cats and has_dogs:
            return "Cats&Dogs"
        if has_cats:
            return "Cats"
        if has_dogs:
            return "Dogs"
        if has_none:
            # A placeholder string, so "no pets" stays distinguishable from
            # the Python None that marks rejected data.
            # NOTE(review): the earlier comment mentioned the string 'None'
            # while the code returns "X" — confirm which label is intended.
            return "X"

        # Nothing recognised: record the value instead of silently falling
        # through to an implicit None.
        raise BadDataException(x)

    except BadDataException as e:
        BAD_DATA["pets_allowed"].append(e.value)
        return None


def clean_price(x):
    """Convert a raw price to ``float``.

    A NaN input converts without error and is returned as NaN; it is not
    recorded as bad data.

    Args:
        x: Raw value from the ``price`` column.

    Returns:
        ``float(x)``, or ``None`` when the conversion fails. In that case *x*
        is appended to ``BAD_DATA["price"]``.
    """
    try:
        return float(x)
    # float() raises the built-in errors, never BadDataException, so these
    # are the ones that must be caught to keep apply() running.
    except (TypeError, ValueError, OverflowError):
        BAD_DATA["price"].append(x)
        return None


def clean_price_display(x):
    """Convert a raw display price to ``float`` (placeholder implementation).

    Args:
        x: Raw value from the ``price_display`` column.

    Returns:
        ``float(x)``, or ``None`` when the conversion fails. In that case *x*
        is appended to ``BAD_DATA["price_display"]``.
    """
    # TODO: real parsing is still missing. Anything float() cannot read,
    # such as text containing a currency symbol or thousands separators, is
    # currently recorded as bad data.
    try:
        return float(x)
    # float() raises the built-in errors, never BadDataException.
    except (TypeError, ValueError, OverflowError):
        BAD_DATA["price_display"].append(x)
        return None


def clean_price_type(x):
    """Convert a raw price type to ``float`` (placeholder implementation).

    Args:
        x: Raw value from the ``price_type`` column.

    Returns:
        ``float(x)``, or ``None`` when the conversion fails. In that case *x*
        is appended to ``BAD_DATA["price_type"]``.
    """
    # TODO: decide on the real cleaning rule for this column. The per-row
    # debug print was dropped; use value_counts() on the raw column instead.
    try:
        return float(x)
    # float() raises the built-in errors, never BadDataException.
    except (TypeError, ValueError, OverflowError):
        BAD_DATA["price_type"].append(x)
        return None


def clean_square_feet(x):
    """Convert a raw square-footage value to ``float``.

    Args:
        x: Raw value from the ``square_feet`` column.

    Returns:
        ``float(x)``, or ``None`` when the conversion fails. In that case *x*
        is appended to ``BAD_DATA["square_feet"]``.
    """
    # TODO: add validation such as a plausible range. The per-row debug
    # print was dropped; use value_counts() on the raw column instead.
    try:
        return float(x)
    # float() raises the built-in errors, never BadDataException.
    except (TypeError, ValueError, OverflowError):
        BAD_DATA["square_feet"].append(x)
        return None


def clean_address(x):
    """Return the raw address as a string.

    No validation is performed yet, so nothing is recorded in
    ``BAD_DATA["address"]``. The former ``except BadDataException`` handler
    was unreachable, and the per-row debug print has been removed.

    Args:
        x: Raw value from the ``address`` column.

    Returns:
        ``str(x)``.
    """
    # TODO: add real address cleaning.
    return str(x)


def clean_city_name(x):
    """Return the raw city name as a string.

    No validation is performed yet, so nothing is recorded in
    ``BAD_DATA["cityname"]``. The former ``except BadDataException`` handler
    was unreachable, and the per-row debug print has been removed.

    Args:
        x: Raw value from the ``cityname`` column.

    Returns:
        ``str(x)``.
    """
    # TODO: add real city-name cleaning.
    return str(x)


def clean_state(x):
    """Translate a raw state code through ``STATE_MAP``.

    Args:
        x: Raw value from the ``state`` column.

    Returns:
        The ``STATE_MAP`` entry for the stripped, upper-cased value
        (presumably the full state name — confirm against
        ``data.wrangling_utils``). ``None`` when the code is not a key of
        ``STATE_MAP``; *x* is then appended to ``BAD_DATA["state"]``.
    """
    try:
        lookup_key = str(x).strip().upper()
        if lookup_key in STATE_MAP:
            return STATE_MAP[lookup_key]
        raise BadDataException(x, 'not a valid state')
    except BadDataException as bad:
        BAD_DATA["state"].append(bad.value)
        return None


def clean_latitude(x):
    """Convert a raw latitude to ``float`` and check it against [-90, 90].

    A NaN input fails neither comparison, so it is returned as NaN and is
    not recorded as bad data.

    Args:
        x: Raw value from the ``latitude`` column.

    Returns:
        The latitude as ``float``, or ``None`` when the value cannot be
        converted or lies outside the range. In that case *x* is appended to
        ``BAD_DATA["latitude"]``.
    """
    try:
        latitude = float(x)

        if latitude < -90.0 or latitude > 90.0:
            raise BadDataException(x, 'is out of range. Latitude values must be between -90 and 90 degrees.')

        return latitude
    # float() raises the built-in errors for unparsable input. They are
    # handled together with the explicit range failure so one bad row does
    # not abort apply().
    except (BadDataException, TypeError, ValueError, OverflowError):
        BAD_DATA["latitude"].append(x)
        return None


def clean_longitude(x):
    """Convert a raw longitude to ``float`` and check it against [-180, 180].

    A NaN input fails neither comparison, so it is returned as NaN and is
    not recorded as bad data.

    Args:
        x: Raw value from the ``longitude`` column.

    Returns:
        The longitude as ``float``, or ``None`` when the value cannot be
        converted or lies outside the range. In that case *x* is appended to
        ``BAD_DATA["longitude"]``.
    """
    try:
        longitude = float(x)

        if longitude < -180.0 or longitude > 180.0:
            raise BadDataException(x, 'is out of range. Longitude values must be between -180 and 180 degrees.')

        # Reuse the value that was already converted rather than calling
        # float(x) a second time.
        return longitude
    # float() raises the built-in errors for unparsable input. They are
    # handled together with the explicit range failure so one bad row does
    # not abort apply().
    except (BadDataException, TypeError, ValueError, OverflowError):
        BAD_DATA["longitude"].append(x)
        return None


def clean_source(x):
    """Normalise the raw listing source to a stripped, lower-case string.

    No validation is performed, so nothing is recorded in
    ``BAD_DATA["source"]``. The former ``except BadDataException`` handler
    was unreachable.

    Args:
        x: Raw value from the ``source`` column.

    Returns:
        ``str(x)`` stripped of surrounding whitespace and lower-cased.
    """
    return str(x).strip().lower()


def clean_time(x):
    """Return the raw time value as a string.

    No parsing or validation is performed, so nothing is recorded in
    ``BAD_DATA["time"]``. The former ``except BadDataException`` handler was
    unreachable.

    Args:
        x: Raw value from the ``time`` column.

    Returns:
        ``str(x)``.
    """
    return str(x)


# Build the cleaned frame one column at a time, in the order listed below.
# A cleaner of None means the raw column is copied over unchanged.
cleaned_uci_df = pandas.DataFrame()
for column, cleaner in (
    ('id', clean_id),
    ('category', clean_category),
    ('title', None),  # clean_title is not applied
    ('body', None),  # clean_body is not applied
    ('amenities', clean_amenities),
    ('bathrooms', clean_bathrooms),
    ('bedrooms', clean_bedrooms),
    ('currency', clean_currency),
    ('fee', clean_fee),
    ('has_photo', clean_has_photo),
    ('pets_allowed', clean_pets_allowed),
    ('price', clean_price),
    # Disabled until their cleaners are finished:
    # ('price_display', clean_price_display),
    # ('price_type', clean_price_type),
    # ('square_feet', clean_square_feet),
    # ('address', clean_address),
    # ('cityname', clean_city_name),
    ('state', clean_state),
    ('latitude', clean_latitude),
    ('longitude', clean_longitude),
    ('source', clean_source),
    ('time', clean_time),
):
    raw_column = uci_df[column]
    cleaned_uci_df[column] = raw_column if cleaner is None else raw_column.apply(cleaner)
cleaned_uci_df

In [None]:
# Inspect the distinct raw price_display values and how often each occurs,
# with missing values counted as well (dropna=False).
uci_df['price_display'].value_counts(dropna=False)

In [None]:
# Raise the row display limit so the full frequency table below is shown;
# swap the comment markers on the next two lines to restore the default.
pandas.set_option("display.max_rows", 100_000) # TOGGLE  UN/COMMENT
#pandas.reset_option("display.max_rows") # TOGGLE UN/COMMENT
# Frequency of each cleaned value (missing values included), sorted by count
# in ascending order.
cleaned_uci_df['price'].value_counts(dropna=False).sort_values() # change column

In [None]:
#DONT DELETE MIGHT NEED
#s = cleaned_uci_df['bathrooms'].explode()
#global_counts = s.value_counts().to_dict()
#global_counts

In [None]:
# Dump the rejected raw values recorded for one column; change the key to
# inspect another column. The inner double quotes keep the f-string valid on
# interpreters older than Python 3.12, which cannot reuse the enclosing quote
# character inside a replacement field.
print(f'BAD_DATA: {json.dumps(BAD_DATA["price"], indent=2)}') # CHANGE COL

# Bureau of Economic Analysis Data

In [None]:
# Request the 'Regional' dataset through the project's
# BureauEconomicAnalysisAPI helper: table SARPP, line code 1, year 2019,
# with GeoFips='STATE'.
# NOTE(review): presumably this yields one row per state (the merge further
# down joins on GeoName) — confirm the returned schema.
bea_df = BureauEconomicAnalysisAPI.fetch_dataset('Regional', GeoFips='STATE', TableName='SARPP', Year='2019',
                                                 LineCode='1')

In [None]:
# Merge Data

In [None]:
# Inner-join the listings with the BEA table, matching uci_df.state_full to
# bea_df.GeoName; rows without a match on either side are dropped.
# NOTE(review): no `state_full` column is created anywhere in the visible
# notebook, while cleaned_uci_df['state'] holds the STATE_MAP-translated
# values. Confirm that uci_df really provides `state_full`, or whether this
# merge should use cleaned_uci_df with left_on="state".
merged = pandas.merge(
    uci_df,
    bea_df,
    left_on="state_full",
    right_on="GeoName",
    how="inner"
)
# Quick check of how many rows and columns survived the join.
merged.shape