## Data Exploration / Cleaning

In [3]:
import pandas as pd

# ============================
# 1. Load datasets
# ============================
train_df = pd.read_csv(r'F:\AI Projects\Regression Project\data\raw\train_data.csv')
eval_df = pd.read_csv(r'F:\AI Projects\Regression Project\data\raw\eval_data.csv')
metros = pd.read_csv(r'F:\AI Projects\Regression Project\data\raw\usmetros.csv')

pd.set_option('display.max_columns', None)  # to display all columns
pd.set_option('display.max_rows', None)     # to display all rows

In [4]:
print(train_df.shape)
print(eval_df.shape)

(585244, 39)
(149424, 39)


In [5]:
train_df.head(2)

Unnamed: 0,date,median_sale_price,median_list_price,median_ppsf,median_list_ppsf,homes_sold,pending_sales,new_listings,inventory,median_dom,avg_sale_to_list,sold_above_list,off_market_in_two_weeks,city,zipcode,year,bank,bus,hospital,mall,park,restaurant,school,station,supermarket,Total Population,Median Age,Per Capita Income,Total Families Below Poverty,Total Housing Units,Median Rent,Median Home Value,Total Labor Force,Unemployed Population,Total School Age Population,Total School Enrollment,Median Commute Time,price,city_full
0,2012-03-31,46550.0,217450.0,31.813674,110.183666,14.0,23.0,44.0,64.0,59.5,0.943662,0.142857,0.043478,ATL,30002,2012,12.0,2.0,4.0,1.0,60.0,45.0,57.0,4.0,7.0,5811.0,36.3,33052.0,5811.0,2677.0,710.0,279500.0,3171.0,460.0,5408.0,5408.0,2492.0,200773.999557,Atlanta-Sandy Springs-Alpharetta
1,2012-03-31,200000.0,7500.0,104.931794,79.265873,1.0,1.0,1.0,2.0,290.0,0.909091,0.0,0.0,PGH,15469,2012,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2441.0,41.8,20241.0,2385.0,1108.0,641.0,94600.0,1171.0,52.0,2376.0,2376.0,1018.0,105863.681174,Pittsburgh


In [6]:
train_df['city_full'].value_counts().head()

city_full
New York-Newark-Jersey City       78020
Chicago-Naperville-Elgin          35344
Los Angeles-Long Beach-Anaheim    33840
Philadelphia-Camden-Wilmington    31396
DC_Metro                          29516
Name: count, dtype: int64

### Map cities to Lat/Long
- The goal is to use Lattitude and longitude instead of cities for our ML models

In [7]:
# ============================
# 2. Fix city name mismatches
# ============================
city_mapping = {
    'Las Vegas-Henderson-Paradise': 'Las Vegas-Henderson-North Las Vegas',
    'Denver-Aurora-Lakewood': 'Denver-Aurora-Centennial',
    'Houston-The Woodlands-Sugar Land': 'Houston-Pasadena-The Woodlands',
    'Austin-Round Rock-Georgetown': 'Austin-Round Rock-San Marcos',
    'Miami-Fort Lauderdale-Pompano Beach': 'Miami-Fort Lauderdale-West Palm Beach',
    'San Francisco-Oakland-Berkeley': 'San Francisco-Oakland-Fremont',
    'DC_Metro': 'Washington-Arlington-Alexandria',
    'Atlanta-Sandy Springs-Alpharetta': 'Atlanta-Sandy Springs-Roswell'
}

In [8]:
def clean_and_merge(df: pd.DataFrame) -> pd.DataFrame:
    """Apply city name fixes, merge lat/lng from metros, drop dup col."""
    df["city_full"] = df["city_full"].replace(city_mapping)
    
    df = df.merge(
        metros[["metro_full", "lat", "lng"]],
        how="left",
        left_on="city_full",
        right_on="metro_full"
    )
    df.drop(columns=["metro_full"], inplace=True)

    # Log any cities that still didn’t match
    missing = df[df["lat"].isnull()]["city_full"].unique()
    if len(missing) > 0:
        print("⚠️ Still missing lat/lng for:", missing)
    else:
        print("✅ All cities matched with metros dataset.")

    return df


In [9]:
# ============================
# 3. Apply cleaning + merge to both train and eval
# ============================
train_df = clean_and_merge(train_df)
eval_df = clean_and_merge(eval_df)

⚠️ Still missing lat/lng for: ['Atlanta-Sandy Springs-Roswell' 'Pittsburgh' 'Boston-Cambridge-Newton'
 'Tampa-St. Petersburg-Clearwater' 'Baltimore-Columbia-Towson'
 'Portland-Vancouver-Hillsboro' 'Philadelphia-Camden-Wilmington'
 'New York-Newark-Jersey City' 'Chicago-Naperville-Elgin'
 'Orlando-Kissimmee-Sanford' 'Seattle-Tacoma-Bellevue'
 'San Francisco-Oakland-Fremont' 'San Diego-Chula Vista-Carlsbad'
 'Austin-Round Rock-San Marcos' 'St. Louis' 'Sacramento-Roseville-Folsom'
 'Phoenix-Mesa-Chandler' 'Riverside-San Bernardino-Ontario'
 'San Antonio-New Braunfels' 'Detroit-Warren-Dearborn' 'Cincinnati'
 'Houston-Pasadena-The Woodlands' 'Charlotte-Concord-Gastonia'
 'Denver-Aurora-Centennial' 'Los Angeles-Long Beach-Anaheim'
 'Washington-Arlington-Alexandria' 'Dallas-Fort Worth-Arlington'
 'Minneapolis-St. Paul-Bloomington' 'Las Vegas-Henderson-North Las Vegas'
 'Miami-Fort Lauderdale-West Palm Beach']
⚠️ Still missing lat/lng for: ['Chicago-Naperville-Elgin' 'Cincinnati' 'New York-New

In [None]:
metros.head(2)

Unnamed: 0,metro_fips,metro,metro_ascii,metro_full,county_name,county_fips,state_id,state_name,lat,lng,population
0,35620,New York,New York,"New York-Newark-Jersey City, NY-NJ",Suffolk,36103,NY,New York,40.7222,-74.0225,19498249
1,31080,Los Angeles,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles,6037,CA,California,34.2215,-118.1494,12799100


In [10]:
# ============================
# 3. Apply cleaning + merge to both train and eval
# ============================
train_df.head(2)

Unnamed: 0,date,median_sale_price,median_list_price,median_ppsf,median_list_ppsf,homes_sold,pending_sales,new_listings,inventory,median_dom,avg_sale_to_list,sold_above_list,off_market_in_two_weeks,city,zipcode,year,bank,bus,hospital,mall,park,restaurant,school,station,supermarket,Total Population,Median Age,Per Capita Income,Total Families Below Poverty,Total Housing Units,Median Rent,Median Home Value,Total Labor Force,Unemployed Population,Total School Age Population,Total School Enrollment,Median Commute Time,price,city_full,lat,lng
0,2012-03-31,46550.0,217450.0,31.813674,110.183666,14.0,23.0,44.0,64.0,59.5,0.943662,0.142857,0.043478,ATL,30002,2012,12.0,2.0,4.0,1.0,60.0,45.0,57.0,4.0,7.0,5811.0,36.3,33052.0,5811.0,2677.0,710.0,279500.0,3171.0,460.0,5408.0,5408.0,2492.0,200773.999557,Atlanta-Sandy Springs-Roswell,,
1,2012-03-31,200000.0,7500.0,104.931794,79.265873,1.0,1.0,1.0,2.0,290.0,0.909091,0.0,0.0,PGH,15469,2012,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2441.0,41.8,20241.0,2385.0,1108.0,641.0,94600.0,1171.0,52.0,2376.0,2376.0,1018.0,105863.681174,Pittsburgh,,
