In [1]:
import pandas as pd
import numpy as np
import googlemaps
from datetime import datetime
from haversine import haversine
import os

In [2]:
# Location of the raw daily Craigslist listings CSV inside the Airflow DAG files
# directory; the path is relative, so it resolves against the notebook's working directory.
daily_refresh_file_path = '../Airflow/mnt/airflow/dags/files/raw_daily_craigslist_listings.csv'

In [3]:
# Load the raw daily listings into the DataFrame the rest of the notebook enriches
df = pd.read_csv(daily_refresh_file_path)

In [4]:
# Preview the first rows to confirm the CSV loaded with the expected columns
df.head()

Unnamed: 0,Title,Price,Bedrooms,Square Feet,Full Address,monthly,apartment,cats are OK - purrr,dogs are OK - wooof,laundry on site,...,laundry in bldg,Fee Needed To Apply,wheelchair accessible,no parking,furnished,street parking,no laundry on site,house,w/d hookups,date_added
0,Renovated 1BR Apartment in West LA - Hardwood ...,"$2,515",1br,668.0,None listed,1,1,1,1,0,...,1,0,0,0,0,0,0,0,0,2024-05-17
1,"Situated in Santa Monica!, 1/BD 1/BA, Hardwood...","$2,471",1br,382.0,"1447 Lincoln Blvd, Santa Monica, CA 90401",1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,2024-05-17
2,Beautiful One Bedroom Apartment - The Lifestyl...,"$3,405",1br,870.0,"550 South Barrington Avenue, Los Angeles, CA 9...",1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,2024-05-17
3,"Magnificent luxury building. 1 bed, 1 ba, 496 ...","$2,895",1br,496.0,"3644 Overland Ave, Los Angeles, CA 90034",1,1,1,1,1,...,0,1,1,0,0,0,0,0,0,2024-05-17
4,Magnificent amenities. Great location. Secure ...,"$3,255",1br,516.0,"3644 Overland Ave, Los Angeles, CA 90034",1,1,1,1,0,...,0,1,1,0,0,0,0,0,0,2024-05-17


In [5]:
# Read the Google Maps API key from the environment so the secret is never stored
# in the notebook or its version history. Any key that was previously committed in
# this cell should be treated as leaked and rotated.
google_api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not google_api_key:
    # Fail here with a clear message rather than later inside the Google Maps client
    raise RuntimeError(
        'Set the GOOGLE_MAPS_API_KEY environment variable before running this notebook.'
    )

In [6]:
# Create the Google Maps client, authenticated with the API key; the geocoding
# and nearby-places calls later in the notebook all go through this object.
gmaps = googlemaps.Client(key=google_api_key)

In [7]:
# Turns one listing address into a (latitude, longitude) pair.
def geocode_address(address):
    """Geocode a single address with the Google Maps client.

    Args:
        address: The address text of one listing. Missing values, blank
            strings and the scraper's 'None listed' placeholder are accepted
            and short-circuit without calling the API.

    Returns:
        A (latitude, longitude) tuple taken from the first geocoding result,
        or (None, None) when the address is unusable or nothing was found.
    """
    # Anything that is not text (None, NaN, numbers) cannot be geocoded
    if not isinstance(address, str):
        return None, None

    # Blank strings and the 'None listed' placeholder carry no location, so
    # do not spend an API call on them
    cleaned_address = address.strip()
    if not cleaned_address or cleaned_address.lower() == 'none listed':
        return None, None

    geocode_result = gmaps.geocode(cleaned_address)

    # An empty result list means the service could not resolve the address
    if not geocode_result:
        return None, None

    # Use the first (top-ranked) match returned by the service
    location = geocode_result[0]['geometry']['location']
    return location['lat'], location['lng']

In [8]:
# Geocode every distinct address once: several listings share a building, and
# each lookup is a billable API call. Missing addresses are left out of the cache
# and fall back to (None, None) below.
coordinates_by_address = {
    address: geocode_address(address)
    for address in df['Full Address'].dropna().unique()
}

# Map the cached coordinates back onto every row. Building the columns from
# lists (instead of unpacking zip(*...)) also works when the frame has no rows.
listing_coordinates = [
    coordinates_by_address.get(address, (None, None))
    for address in df['Full Address']
]
df['latitude'] = [latitude for latitude, _ in listing_coordinates]
df['longitude'] = [longitude for _, longitude in listing_coordinates]

In [9]:
# Spot-check the geocoded coordinates next to the addresses they were derived from
df[['Full Address', 'latitude', 'longitude']].head()

Unnamed: 0,Full Address,latitude,longitude
0,None listed,,
1,"1447 Lincoln Blvd, Santa Monica, CA 90401",34.0184,-118.489892
2,"550 South Barrington Avenue, Los Angeles, CA 9...",34.0584,-118.46808
3,"3644 Overland Ave, Los Angeles, CA 90034",34.022266,-118.409335
4,"3644 Overland Ave, Los Angeles, CA 90034",34.022266,-118.409335


In [10]:
# Unit helper for expressing a search radius given in miles as meters
def miles_to_meters(miles):
    """Convert a distance in miles to meters, using 1609.34 meters per mile."""
    meters_per_mile = 1609.34
    return miles * meters_per_mile

In [11]:
# Fixed reference point in Santa Monica used as the centre of the store searches
santa_monica_lat, santa_monica_lng = 34.0259, -118.4965
santa_monica_location = (santa_monica_lat, santa_monica_lng)

# Search radius: 3.6 miles around that point, expressed in meters
default_radius = miles_to_meters(3.6)

In [12]:
# Collects name, address and coordinates for every place matching a list of store keywords
def find_stores(store_list, store_address_info, location=None, radius=None, client=None):
    """Run one nearby-places search per store keyword and collect the matches.

    Args:
        store_list: Iterable of keyword strings, one search is issued per entry.
        store_address_info: List that is extended in place with one dict per
            place found, using the keys 'Name', 'Address', 'Latitude' and
            'Longitude'.
        location: Optional (lat, lng) search centre. Defaults to the
            notebook-level ``santa_monica_location``.
        radius: Optional search radius in meters. Defaults to the
            notebook-level ``default_radius``.
        client: Optional object exposing ``places_nearby(location=, keyword=,
            radius=)``. Defaults to the notebook-level ``gmaps`` client.

    Returns:
        None. Results are delivered through ``store_address_info``.

    Notes:
        Only the response returned by the single ``places_nearby`` call per
        keyword is read; no follow-up pages are requested. Keyword matching is
        done by the service, so related businesses can appear in the results
        (the notebook output shows e.g. 'Costco Gas Station' for the
        'Costco Wholesale' keyword).
    """
    # Fall back to the notebook-level defaults only for arguments left unset
    search_location = santa_monica_location if location is None else location
    search_radius = default_radius if radius is None else radius
    places_client = gmaps if client is None else client

    for the_store in store_list:
        results = places_client.places_nearby(location=search_location,
                                              keyword=the_store,
                                              radius=search_radius)

        # A response without a 'results' entry is treated as "no matches"
        for place in results.get('results', []):
            coordinates = place['geometry']['location']
            store_address_info.append({
                'Name': place['name'],
                # 'vicinity' is optional in the response, so keep a readable placeholder
                'Address': place.get('vicinity', 'Address not provided'),
                'Latitude': coordinates['lat'],
                'Longitude': coordinates['lng'],
            })

In [13]:
# Search keywords for the premium grocery tier
search_premium_grocery_stores = ['Whole Foods Market', 'Erewhon', 'Bristol Farms']

# Accumulator that find_stores fills in place with one dict per place found
premium_grocery_stores_address_info = []
find_stores(search_premium_grocery_stores, premium_grocery_stores_address_info)

# Dump every collected record as a quick sanity check
for premium_store in premium_grocery_stores_address_info:
    print(premium_store)

{'Name': 'Whole Foods Market', 'Address': '11666 National Blvd, Los Angeles', 'Latitude': 34.0224904, 'Longitude': -118.4379608}
{'Name': 'Whole Foods Market', 'Address': '2121 Cloverfield Blvd, Santa Monica', 'Latitude': 34.0220438, 'Longitude': -118.4656809}
{'Name': 'Whole Foods Market', 'Address': '225 Lincoln Blvd, Venice', 'Latitude': 34.0011613, 'Longitude': -118.4698813}
{'Name': 'Whole Foods Market', 'Address': '1050 Gayley Ave, Los Angeles', 'Latitude': 34.0611873, 'Longitude': -118.4469309}
{'Name': 'Whole Foods Market', 'Address': '2201 Wilshire Blvd, Santa Monica', 'Latitude': 34.0332265, 'Longitude': -118.4812706}
{'Name': 'Whole Foods Market', 'Address': '11737 San Vicente Blvd, Los Angeles', 'Latitude': 34.0536284, 'Longitude': -118.4673501}
{'Name': 'Whole Foods Market', 'Address': '1425 Montana Ave, Santa Monica', 'Latitude': 34.0328379, 'Longitude': -118.4946332}
{'Name': 'Erewhon', 'Address': '585 Venice Blvd., Venice', 'Latitude': 33.9897008, 'Longitude': -118.4620

In [14]:
# Search keywords for the mid-tier grocery tier
search_midTier_grocery_stores = ['Ralphs Fresh Fare', 'Vons', 'Trader Joe\'s']

# Accumulator that find_stores fills in place with one dict per place found
midTier_grocery_stores_address_info = []
find_stores(search_midTier_grocery_stores, midTier_grocery_stores_address_info)

# Dump every collected record as a quick sanity check
for mid_tier_store in midTier_grocery_stores_address_info:
    print(mid_tier_store)

{'Name': 'Ralphs Fresh Fare', 'Address': '1644 Cloverfield Blvd, Santa Monica', 'Latitude': 34.0270416, 'Longitude': -118.4738669}
{'Name': 'Ralphs Fresh Fare', 'Address': '11727 W Olympic Blvd, Los Angeles', 'Latitude': 34.0349195, 'Longitude': -118.4490712}
{'Name': 'Ralphs Fresh Fare', 'Address': '4700 Admiralty Way, Marina Del Rey', 'Latitude': 33.9798932, 'Longitude': -118.4389794}
{'Name': 'Ralphs Fresh Fare', 'Address': '4311 Lincoln Blvd, Marina Del Rey', 'Latitude': 33.9842533, 'Longitude': -118.4432258}
{'Name': 'Ralphs Fresh Fare', 'Address': '10861 Weyburn Ave, Los Angeles', 'Latitude': 34.0629321, 'Longitude': -118.443508}
{'Name': 'Ralphs Fresh Fare', 'Address': '12057 Wilshire Blvd, Los Angeles', 'Latitude': 34.04497, 'Longitude': -118.4670728}
{'Name': 'Ralphs Fresh Fare', 'Address': '15120 Sunset Blvd, Pacific Palisades', 'Latitude': 34.0450033, 'Longitude': -118.5241476}
{'Name': 'Ralphs', 'Address': '910 Lincoln Blvd, Venice', 'Latitude': 33.999184, 'Longitude': -118

In [15]:
# Search keywords for the budget grocery tier
search_budget_grocery_stores = ['Costco Wholesale', 'Smart and Final']

# Accumulator that find_stores fills in place with one dict per place found
budget_grocery_stores_address_info = []
find_stores(search_budget_grocery_stores, budget_grocery_stores_address_info)

# Dump every collected record as a quick sanity check
for budget_store in budget_grocery_stores_address_info:
    print(budget_store)

{'Name': 'Costco Wholesale', 'Address': '13463 Washington Blvd, Marina Del Rey', 'Latitude': 33.9927494, 'Longitude': -118.4464242}
{'Name': 'Costco Gas Station', 'Address': '13463 Washington Blvd, Marina Del Rey', 'Latitude': 33.9920815, 'Longitude': -118.445199}
{'Name': 'Costco Pharmacy', 'Address': '13463 Washington Blvd, Marina Del Rey', 'Latitude': 33.9930324, 'Longitude': -118.4469763}
{'Name': 'Costco Food Court - Marina Del Rey', 'Address': '13463 Washington Blvd, Marina Del Rey', 'Latitude': 33.99250300000001, 'Longitude': -118.4467895}
{'Name': 'Costco Bakery', 'Address': '13463 Washington Blvd, Marina Del Rey', 'Latitude': 33.9927736, 'Longitude': -118.4469117}
{'Name': 'Costco Tire Center', 'Address': '13463 Washington Blvd, Marina Del Rey', 'Latitude': 33.9930264, 'Longitude': -118.4463365}
{'Name': 'Costco Optical Center in Marina Del Rey', 'Address': '13463 Washington Blvd, Marina Del Rey', 'Latitude': 33.993108, 'Longitude': -118.44656}
{'Name': 'Smart & Final', 'Addre

In [16]:
# Thin wrapper that fixes the unit of the haversine calculation to miles
def haversine_distance(coord1, coord2):
    """Return the haversine distance between two coordinate pairs, in miles."""
    distance_in_miles = haversine(coord1, coord2, unit='mi')
    return distance_in_miles

In [17]:
# Finds the closest store in a given store list for one listing and reports the distance
def find_nearest_grocery_store(listing_lat, listing_lng, grocery_store_list):
    """Return the distance to, and a label for, the store nearest to a listing.

    Args:
        listing_lat: Latitude of the listing; may be None/NaN when geocoding failed.
        listing_lng: Longitude of the listing; may be None/NaN when geocoding failed.
        grocery_store_list: Iterable of dicts with the keys 'Name', 'Address',
            'Latitude' and 'Longitude' (the records produced by find_stores).

    Returns:
        A (distance, label) tuple. ``distance`` is the value returned by
        haversine_distance for the closest store and ``label`` is
        "<Name> - <Address>" of that store. When the listing has no
        coordinates, or the store list is empty, the result is (None, "N/A").
    """
    # Without coordinates there is nothing to measure from
    if pd.isna(listing_lat) or pd.isna(listing_lng):
        return None, "N/A"

    # The listing position is the same for every comparison, so build it once
    listing_coord = (listing_lat, listing_lng)

    # Defaults double as the answer for an empty store list; the label uses the
    # same "N/A" sentinel as the missing-coordinates case above
    min_distance = None
    nearest_grocery_store = "N/A"

    for store in grocery_store_list:
        store_coord = (store['Latitude'], store['Longitude'])
        distance = haversine_distance(listing_coord, store_coord)

        # Strict '<' keeps the first store encountered when distances tie
        if min_distance is None or distance < min_distance:
            min_distance = distance
            nearest_grocery_store = f"{store['Name']} - {store['Address']}"

    return min_distance, nearest_grocery_store

In [18]:
def add_store_distances_to_dataframe(df, store_types=None):
    """Add nearest-store name and distance columns for each grocery tier.

    For every tier two columns are written onto ``df``:
    ``nearest_<tier>_grocery_store_distance`` and ``nearest_<tier>_grocery_store``.

    Args:
        df: DataFrame with 'latitude' and 'longitude' columns. It is modified
            in place.
        store_types: Optional mapping of tier name to the list of store dicts
            for that tier. Defaults to the notebook-level budget, midTier and
            premium store lists.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    if store_types is None:
        # Default tiers, read from the lists built earlier in the notebook
        store_types = {
            'budget': budget_grocery_stores_address_info,
            'midTier': midTier_grocery_stores_address_info,
            'premium': premium_grocery_stores_address_info
        }

    for store_type, stores_info in store_types.items():
        distance_col_name = f'nearest_{store_type}_grocery_store_distance'
        store_col_name = f'nearest_{store_type}_grocery_store'

        # One (distance, label) pair per row, in row order. Iterating the two
        # coordinate columns directly avoids the apply/tolist/DataFrame round
        # trip, which cannot produce the two columns when the frame has no rows.
        nearest_results = [
            find_nearest_grocery_store(latitude, longitude, stores_info)
            for latitude, longitude in zip(df['latitude'], df['longitude'])
        ]
        df[distance_col_name] = [distance for distance, _ in nearest_results]
        df[store_col_name] = [label for _, label in nearest_results]

    return df

In [19]:
# Add the nearest-store columns for every grocery tier; the function writes onto
# df and returns it, so this rebinding keeps the same name for the enriched frame.
df = add_store_distances_to_dataframe(df)

In [20]:
# Preview the enriched rows, including the new coordinate and nearest-store columns
df.head()

Unnamed: 0,Title,Price,Bedrooms,Square Feet,Full Address,monthly,apartment,cats are OK - purrr,dogs are OK - wooof,laundry on site,...,w/d hookups,date_added,latitude,longitude,nearest_budget_grocery_store_distance,nearest_budget_grocery_store,nearest_midTier_grocery_store_distance,nearest_midTier_grocery_store,nearest_premium_grocery_store_distance,nearest_premium_grocery_store
0,Renovated 1BR Apartment in West LA - Hardwood ...,"$2,515",1br,668.0,None listed,1,1,1,1,0,...,0,2024-05-17,,,,,,,,
1,"Situated in Santa Monica!, 1/BD 1/BA, Hardwood...","$2,471",1br,382.0,"1447 Lincoln Blvd, Santa Monica, CA 90401",1,1,1,0,1,...,0,2024-05-17,34.0184,-118.489892,1.915445,"Smart & Final Extra! - 604 Lincoln Blvd, Venice",0.239695,"Trader Joe's - 500 Broadway, Santa Monica",1.033873,"Whole Foods Market - 1425 Montana Ave, Santa M..."
2,Beautiful One Bedroom Apartment - The Lifestyl...,"$3,405",1br,870.0,"550 South Barrington Avenue, Los Angeles, CA 9...",1,1,1,1,1,...,0,2024-05-17,34.0584,-118.46808,1.301928,"Smart & Final - 12210 Santa Monica Blvd W, Los...",0.929685,"Ralphs Fresh Fare - 12057 Wilshire Blvd, Los A...",0.332292,"Whole Foods Market - 11737 San Vicente Blvd, L..."
3,"Magnificent luxury building. 1 bed, 1 ba, 496 ...","$2,895",1br,496.0,"3644 Overland Ave, Los Angeles, CA 90034",1,1,1,1,1,...,0,2024-05-17,34.022266,-118.409335,1.898845,"Smart & Final Extra! - 11221 W Pico Blvd, Los ...",1.39793,"Ralphs - 11361 National Blvd, Los Angeles",1.639368,"Whole Foods Market - 11666 National Blvd, Los ..."
4,Magnificent amenities. Great location. Secure ...,"$3,255",1br,516.0,"3644 Overland Ave, Los Angeles, CA 90034",1,1,1,1,0,...,0,2024-05-17,34.022266,-118.409335,1.898845,"Smart & Final Extra! - 11221 W Pico Blvd, Los ...",1.39793,"Ralphs - 11361 National Blvd, Los Angeles",1.639368,"Whole Foods Market - 11666 National Blvd, Los ..."


In [21]:
# Destination CSV for the enriched listings, in the same Airflow DAG files directory as the input
refresh_export_path = '../Airflow/mnt/airflow/dags/files/Daily_Refresh_With_Locations_Data.csv'

In [22]:
# Write the enriched listings to disk; index=False keeps the DataFrame index out of the file
df.to_csv(refresh_export_path, index=False)