In [2]:
import requests
import time
import csv
import re
import json
import pandas as pd
import random
import numpy as np

# Initialize an empty list to store the results
results = []
region_lookup = pd.read_csv('../data/region_to_area_name.csv')
us_regions = region_lookup[region_lookup['country'] == 'US']

In [3]:
# Initialize an empty list to store the data
data_list = []
max_listings = 10000 #number of search results to include, will return this number or less if there aren't that many
within_last_30 = 1 #1 means include listings from last 30 days, 0 means include from last 45 days

#415 in total
for index, row in us_regions.head(2).iterrows():  # Iterate through the first 10 rows in us_regions DataFrame
    time.sleep(random.randint(1, 3))  # Pause the loop for a random interval between 1 to 3 seconds to avoid overwhelming the server
    region_code = row['regionCode']
    region_name = row['region'] 
    code = row['code'] #region code for craigslist region, e.g. 656=Missoula
    #URL includes
    url = f"https://sapi.craigslist.org/web/v8/postings/search/full?CC=US&availabilityMode={within_last_30}&batch={code}-0-{max_listings}-0-0-1&lang=en&searchPath=apa"
    # Make the HTTP request
    response = requests.get(url)
    if response.status_code == 200:
        json_string = re.search(r'cl\.jsonp\(.*?,\s*(.*)\)', response.text).group(1) #extract everything inside parenthesis cl.jsonp()
        data = json.loads(json_string)
        listings = data['data']['items'] #each "item": is a dictionary of arrays, the first element of each array is a key
        for listing in listings:
            title = next((item[1] for item in listing if isinstance(item, list) and item[0] == 6), None) #6 is key for title section
            price = next((item[1] for item in listing if isinstance(item, list) and item[0] == 10), None) #10 is key for price section
            bedrooms = next((item[1] for item in listing if isinstance(item, list) and item[0] == 5), None) #5 is key for more info section, includes bedrooms and sqft
            
            # Extract square feet, assuming it follows the bedrooms value
            square_feet = next((item[2] for item in listing if isinstance(item, list) and item[0] == 5 and len(item) > 2), None)
            
            if isinstance(listing[4], str) and '~' in listing[4]: #sometimes lat/lon is missing but if its there this is its form
                _, latitude, longitude = listing[4].split('~')
            else:
                latitude, longitude = None, None
            
            # Append the extracted data to the list as a dictionary, including region and regionCode
            data_list.append({
                "title": title,
                "price": price,
                "bedrooms": bedrooms,
                "square_feet": square_feet,
                "latitude": latitude,
                "longitude": longitude,
                "region": region_name,  # Add region name to the dictionary
                "region_code": region_code  # Add region code to the dictionary
            })

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data_list)

In [7]:

# Convert 'price' from a string to a float
df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)

# Now you can calculate 'price_per_bedroom' and 'price_per_sqft'
df['price_per_bedroom'] = df['price'] / df['bedrooms']
df['price_per_sqft'] = df['price'] / df['square_feet']

# Group the DataFrame by 'region' and calculate the median 'price_per_bedroom' and 'price_per_sqft' for each region
region_grouped_df = df.groupby('region').agg({
    'price_per_bedroom': 'median',
    'price_per_sqft': 'median'
}).reset_index()

# Rename columns for clarity
region_grouped_df.rename(columns={
    'price_per_bedroom': 'median_price_per_bedroom',
    'price_per_sqft': 'median_price_per_sqft'
}, inplace=True)


In [None]:
# Convert 'price' from a string to a float
df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)

# Now you can calculate 'price_per_bedroom' and 'price_per_sqft'
df['price_per_bedroom'] = df['price'] / df['bedrooms']
df['price_per_sqft'] = df['price'] / df['square_feet']


In [4]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon

# Assuming 'us_regions' is your DataFrame with regions
# And 'df' is your DataFrame with listings, including 'latitude', 'longitude', 'price_per_bedroom', and 'price_per_sqft'

# Step 1: Convert 'us_regions' DataFrame to GeoDataFrame
us_regions_gdf = gpd.GeoDataFrame(us_regions, geometry=gpd.points_from_xy(us_regions.lon, us_regions.lat))

# Step 2: Create circles with a 60-mile radius (approx. 0.87 degrees)
radius_in_degrees = 60 / 69  # Approximate conversion from miles to degrees
us_regions_gdf['geometry'] = us_regions_gdf.apply(lambda row: row['geometry'].buffer(radius_in_degrees), axis=1)


In [None]:

# Step 3: Calculate median prices for listings within each circle
# Convert 'df' to GeoDataFrame
listings_gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))

# Initialize columns for median prices in 'us_regions_gdf'
us_regions_gdf['Median Price per Bedroom'] = pd.NA
us_regions_gdf['Median Price per Square Foot'] = pd.NA

for index, region in us_regions_gdf.iterrows():
    # Find listings within the current region's circle
    listings_in_region = listings_gdf[listings_gdf.within(region.geometry)]
    
    # Calculate median prices and assign them to the region
    us_regions_gdf.at[index, 'Median Price per Bedroom'] = listings_in_region['price_per_bedroom'].median()
    us_regions_gdf.at[index, 'Median Price per Square Foot'] = listings_in_region['price_per_sqft'].median()

# 'us_regions_gdf' now contains circles around each center point with associated median prices

In [5]:
us_regions_gdf

Unnamed: 0,code,region,areaId,city,country,lat,lon,postal,radius,regionCode,url,geometry
0,1,sfbay,1.0,SF bay area,US,37.500000,-122.250000,,60.0,CA,sfbay.craigslist.org,"POLYGON ((-121.38043 37.50000, -121.38462 37.4..."
1,2,seattle,2.0,seattle-tacoma,US,47.606400,-122.331001,,60.0,WA,seattle.craigslist.org,"POLYGON ((-121.46144 47.60640, -121.46562 47.5..."
2,3,newyork,3.0,new york city,US,40.714199,-74.006401,,60.0,NY,newyork.craigslist.org,"POLYGON ((-73.13684 40.71420, -73.14102 40.628..."
3,4,boston,4.0,boston,US,42.358299,-71.060303,,60.0,MA,boston.craigslist.org,"POLYGON ((-70.19074 42.35830, -70.19492 42.273..."
6,7,losangeles,7.0,los angeles,US,34.052200,-118.242996,,60.0,CA,losangeles.craigslist.org,"POLYGON ((-117.37343 34.05220, -117.37762 33.9..."
...,...,...,...,...,...,...,...,...,...,...,...,...
708,709,hanford,709.0,hanford-corcoran,US,36.327400,-119.646004,,60.0,CA,hanford.craigslist.org,"POLYGON ((-118.77644 36.32740, -118.78063 36.2..."
709,710,santamaria,710.0,"santa maria, CA",US,34.963799,-120.433296,,60.0,CA,santamaria.craigslist.org,"POLYGON ((-119.56373 34.96380, -119.56792 34.8..."
710,711,winchester,711.0,"winchester, VA",US,39.178299,-78.166603,,60.0,VA,winchester.craigslist.org,"POLYGON ((-77.29704 39.17830, -77.30122 39.093..."
711,712,swva,712.0,southwest VA,US,36.892803,-82.084351,,60.0,VA,swva.craigslist.org,"POLYGON ((-81.21479 36.89280, -81.21897 36.807..."


In [6]:
us_regions_gdf.to_csv('us_regions_with_median_prices.csv', index=False)


In [20]:
import folium

# Initialize a map
m = folium.Map(location=[0, 0], zoom_start=5)

# Function to add a GeoDataFrame to a folium map
def add_gdf_to_map(gdf, map_obj):
    for _, r in gdf.iterrows():
        # Simplify geometry for faster rendering
        sim_geo = r['geometry'].simplify(0.005, preserve_topology=True)
        geo_j = folium.GeoJson(data=sim_geo,
                               style_function=lambda x: {'fillColor': 'orange', 'color': 'orange'})
        folium.Popup(f"Region: {r['region']}\nMedian Price per Bedroom: {r['Median Price per Bedroom']}\nMedian Price per Square Foot: {r['Median Price per Square Foot']}").add_to(geo_j)
        geo_j.add_to(map_obj)

# Add the GeoDataFrame to the map
add_gdf_to_map(us_regions_gdf, m)

# Display the map
m
