In [1]:
import pandas as pd
import usaddress
import re
import logging

# Configure logger once
logger = logging.getLogger("address_parser")
logger.setLevel(logging.INFO)
handler = logging.FileHandler("parse_failures.log")
logger.addHandler(handler)


First thing before starting to work on model is data understanding and manupilating so let dive into our data

In [4]:
#__________________Loading data________________________
prop=pd.read_csv("properties-out.csv")
listing=pd.read_csv("listings-out.csv")

print("Shape of properties data", prop.shape, " and the shape or Listing is ", listing.shape)

Shape of properties data (5000, 9)  and the shape or Listing is  (7142, 3)


# Data Analyizing

In [7]:
prop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            5000 non-null   object
 1   street_address  5000 non-null   object
 2   unit            2002 non-null   object
 3   city            5000 non-null   object
 4   state           5000 non-null   object
 5   zipcode         5000 non-null   object
 6   type            3842 non-null   object
 7   property_id     5000 non-null   object
 8   team_id         5000 non-null   object
dtypes: object(9)
memory usage: 351.7+ KB


Looks like the Unit number has the most missing values followed by Type of property. specially with Unit number that is missing with about 60% o the time it cannot be treated as reliable feature and also similarly the type cannot be reliable for matching
now let do some data preprocessing to enhance our data quality

In [10]:
display(prop.describe(include="all").T)

Unnamed: 0,count,unique,top,freq
name,5000,4998,Meadow Wood Apartments,2
street_address,5000,3583,948 East Devonshire Avenue,62
unit,2002,950,1,46
city,5000,304,Los Angeles,557
state,5000,25,AL,1031
zipcode,5000,550,84096,145
type,3842,8,Apartment,1661
property_id,5000,5000,3d283cd9-5436-40ee-b3c5-8d1983f65bac,1
team_id,5000,47,5c7aff2f-1a49-4dd3-a1fa-a7622f260133,1267


Showing that property ID can be unique number that helps us to match them. so let check the same thing in listing file

In [13]:
listing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7142 entries, 0 to 7141
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         7142 non-null   object
 1   property_id  4911 non-null   object
 2   team_id      7142 non-null   object
dtypes: object(3)
memory usage: 167.5+ KB


Here with 7141 team id we have less that 5000 property id which seems that we need have additional feature

In [16]:
display(listing.describe(include="all").T)

Unnamed: 0,count,unique,top,freq
name,7142,4230,Rock Hound Way,20
property_id,4911,1376,83c06a45-751c-4459-ad6c-dece5b0ffdec,160
team_id,7142,40,9d7b2296-f579-464b-a47d-d62b7532a9e5,2056


With seing this this describe i feel like that we may have a duplocation as only 1376 of property id are unique. also in property file we had 47 Unique teams but in listing we have 40 unique teams \n
I am not going to do any processing on listing data as i want to keep it in way that it can be used as our Vector database for having it in our database. so let process our property data.

# Processing

## 1.Type normalizing
Let see what are the different type we have

In [21]:
prop["type"].value_counts(dropna=False)

type
Apartment               1661
House                   1638
NaN                     1158
Condo                    274
Townhouse                198
Apartment Building        65
Apartment Floor Plan       3
Room                       2
Community                  1
Name: count, dtype: int64

In [23]:
mapping = {
    "Apartment Building": "Apartment",
    "Apartment Floor Plan": "Apartment",
}
prop["type"]=prop["type"].replace(mapping)

## 2. Adress Normalizing
I rather do this for both property and listing

In [36]:
# --- Enhanced address parsing with normalization and logging ---
# Abbreviation map for consistent expansion
ABBREVIATIONS = {
    "st": "street", "ave": "avenue", "rd": "road", "blvd": "boulevard",
    "dr": "drive", "n": "north", "s": "south", "e": "east", "w": "west"
}

def normalize_string(s: str) -> str:
    """Lowercase, strip punctuation, expand abbreviations, collapse whitespace."""
    s = s.lower()
    s = re.sub(r"[.,#]", "", s)
    tokens = s.split()
    tokens = [ABBREVIATIONS.get(tok, tok) for tok in tokens]
    return " ".join(tokens).strip()

def enhanced_parse(addr: str) -> dict:
    """Parse address with usaddress, logging failures and falling back to {}."""
    raw = addr
    norm = normalize_string(addr)
    try:
        parsed, _ = usaddress.tag(norm)
    except usaddress.RepeatedLabelError as e:
        logger.info(f"RepeatedLabelError for '{raw}': {e}")
        return {}
    except Exception as e:
        logger.info(f"Parse failure for '{raw}': {e}")
        return {}
    return parsed


In [38]:
# --- Property address processing ---
# Apply enhanced parsing
prop['parsed_addr'] = prop['street_address'].fillna('').apply(enhanced_parse)

# Construct full_address with fallback to raw concatenation
def build_prop_full(row):
    if row['parsed_addr']:
        return " ".join([
            row['parsed_addr'].get('AddressNumber',''),
            row['parsed_addr'].get('StreetName',''),
            row['parsed_addr'].get('StreetNamePostType',''),
            row['city'], row['state'], str(row['zipcode'])
        ]).strip()
    else:
        return f"{row['street_address']} {row['city']} {row['state']} {row['zipcode']}"

prop['full_address'] = prop.apply(build_prop_full, axis=1)

# Token set feature for fuzzy matching
prop['token_set'] = prop['full_address'].str.split().apply(set)

# Normalize property type
type_map = {
    'Apartment Building':'Apartment',
    'Apartment Floor Plan':'Apartment'
}
prop['type_norm'] = prop['type'].map(type_map).fillna(prop['type'])

# Cleaned property frame
prop_clean = prop[['team_id','property_id','full_address','token_set','type_norm']]
prop_clean.head()


In [40]:
# --- Listing address processing ---
# Enhanced parsing on listing name
listing['parsed_addr'] = listing['name'].fillna('').apply(enhanced_parse)

def build_listing_full(row):
    if row['parsed_addr']:
        return " ".join([
            row['parsed_addr'].get('AddressNumber',''),
            row['parsed_addr'].get('StreetName',''),
            row['parsed_addr'].get('StreetNamePostType',''),
            row['parsed_addr'].get('PlaceName',''),
            row['parsed_addr'].get('StateName',''),
            row['parsed_addr'].get('OccupancyIdentifier','')
        ]).strip()
    else:
        return normalize_string(row['name'])

listing['full_address'] = listing.apply(build_listing_full, axis=1)

# Token set feature
listing['token_set'] = listing['full_address'].str.split().apply(set)

clean_listing = listing[['team_id','property_id','full_address','token_set']]
clean_listing.head()


In [42]:
display(prop_clean.describe(include="all").T)

Unnamed: 0,count,unique,top,freq
team_id,5000,47,5c7aff2f-1a49-4dd3-a1fa-a7622f260133,1267
property_id,5000,5000,3d283cd9-5436-40ee-b3c5-8d1983f65bac,1
full_address,5000,4942,3307 3307 Kansas City KS,7


In [46]:
display(clean_listing.describe(include="all").T)

Unnamed: 0,count,unique,top,freq
team_id,7142,40,9d7b2296-f579-464b-a47d-d62b7532a9e5,2056
property_id,4911,1376,83c06a45-751c-4459-ad6c-dece5b0ffdec,160
full_address,7142,3221,,190


In [48]:
prop_clean.to_csv("properties_cleaned.csv")

In [50]:
clean_listing.to_csv("listings_clean.csv")