In [None]:
import math
import os
from getpass import getpass

import folium
import requests
from IPython.display import display

# The API key is never stored in the notebook: read it from the environment,
# and fall back to a hidden prompt when the variable is not set. Without this
# the request loop further down fails with NameError on API_KEY.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY") or getpass("Google Maps API key: ")

# Create map centered on Austin; the LatLngPopup is added so the user can click
# a point and read off the coordinates asked for below.
m = folium.Map(location=[30.2672, -97.7431], zoom_start=12)
m.add_child(folium.LatLngPopup())

display(m)
print("Click on the map to get coordinates, then use them below:")

# Get location from user
lat = float(input("Enter latitude: "))
lng = float(input("Enter longitude: "))

# Catch swapped or mistyped values early (e.g. Austin's longitude of about -97
# entered as a latitude) instead of silently searching the wrong place.
if not (-90.0 <= lat <= 90.0 and -180.0 <= lng <= 180.0):
    raise ValueError(
        f"Coordinates out of range: latitude={lat}, longitude={lng} "
        "(latitude must be within [-90, 90], longitude within [-180, 180])"
    )

# Google Places types to query, grouped by theme (order is preserved because
# it drives the order of the progress output below).
categories = [
    # Essential services
    'grocery_or_supermarket', 'pharmacy', 'hospital',
    'doctor', 'dentist', 'veterinary_care',
    # Food & social
    'restaurant', 'cafe', 'bar',
    # Fitness & wellness
    'gym', 'park', 'spa',
    # Shopping
    'shopping_mall', 'convenience_store',
    # Transportation
    'transit_station', 'bus_station', 'subway_station', 'gas_station',
    # Entertainment & culture
    'movie_theater', 'museum', 'library', 'night_club',
    # Education
    'school', 'university',
    # Convenience
    'bank', 'atm', 'post_office', 'laundry',
    # Safety
    'police', 'fire_station',
]

# Calculate 4 points in a square pattern around center.
# A degree of latitude is ~69 miles everywhere, but a degree of longitude
# shrinks with cos(latitude). The previous fixed 54.6 miles/degree is the value
# near 38°N; at Austin (~30°N) it is ~59.7, so the east/west offsets came out
# roughly half a mile too large. Derive it from the chosen latitude instead.
offset_miles = 5.0
miles_per_degree_lat = 69.0
miles_per_degree_lng = miles_per_degree_lat * math.cos(math.radians(lat))

lat_offset = offset_miles / miles_per_degree_lat
lng_offset = offset_miles / miles_per_degree_lng

search_points = [
    (lat + lat_offset, lng + lng_offset),  # Northeast
    (lat + lat_offset, lng - lng_offset),  # Northwest
    (lat - lat_offset, lng + lng_offset),  # Southeast
    (lat - lat_offset, lng - lng_offset),  # Southwest
]

url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
all_results = {}  # place_id -> place dict, de-duplicated across categories and search points

print(f"\nSearching {len(categories)} categories across {len(search_points)} locations...")
print("This may take a minute...\n")

# NOTE(review): only the first response page per query is collected; if the
# API paginates (next_page_token), additional pages are not followed here.
for category in categories:
    category_count = 0
    for search_lat, search_lng in search_points:
        params = {
            "location": f"{search_lat},{search_lng}",
            "radius": 8000,  # metres, ~5 miles per search point
            "type": category,
            "key": API_KEY
        }

        # A timeout keeps one stalled request from hanging the whole cell;
        # transport/HTTP/JSON failures are reported and that query is skipped.
        try:
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            payload = response.json()
        except (requests.RequestException, ValueError) as exc:
            print(f"  ! {category} at ({search_lat:.4f}, {search_lng:.4f}): request failed: {exc}")
            continue

        status = payload.get('status')
        if status == 'REQUEST_DENIED':
            # A rejected key fails every remaining request too, so stop now
            # rather than reporting "0 new places" for all categories.
            raise RuntimeError(
                f"Places API denied the request: {payload.get('error_message', 'no error message')}"
            )

        if status == 'OK':
            for place in payload.get('results', []):
                place_id = place['place_id']
                if place_id not in all_results:
                    all_results[place_id] = place
                    category_count += 1
        elif status != 'ZERO_RESULTS':
            # Surface quota / invalid-request problems instead of hiding them.
            print(f"  ! {category} at ({search_lat:.4f}, {search_lng:.4f}): "
                  f"API status {status} {payload.get('error_message', '')}")

    print(f"✓ {category}: {category_count} new places found")

# Convert back to list
data = {'results': list(all_results.values())}

print(f"\n{'='*60}")
print(f"Total unique places found: {len(data['results'])}")
print(f"{'='*60}")
print(data)

In [None]:
# Print a readable summary of every place collected by the search cell.
# Loop variables are deliberately NOT named lat / lng / categories: those are
# notebook-level names (the user's chosen point and the category list), and
# overwriting them here would make the later distance filter measure from the
# last place printed instead of from the user's location.
for place in data['results']:
    name = place['name']
    rating = place.get('rating', 'N/A')

    # Get coordinates
    location = place['geometry']['location']
    place_lat = location['lat']
    place_lng = location['lng']

    # Get categories/types (Google returns a list of types)
    place_types = ', '.join(place.get('types', []))

    print(f"{name}")
    print(f"  Rating: {rating}")
    print(f"  Coordinates: ({place_lat}, {place_lng})")
    print(f"  Categories: {place_types}")
    print("-" * 50)

In [None]:
# This cell does the grouping for neighborhood-level average sentiment, with the Reddit posts attached.

import pandas as pd

apt_df = pd.read_csv('/content/apartments_with_neighborhood.csv')
reddit_df = pd.read_csv('/content/reddit_posts_with_neighborhoods (1).csv')

# Mean Reddit sentiment per neighborhood (one row per neighborhood).
rgrouped = (
    reddit_df
    .groupby('neighborhood_assigned')['sentiment_score']
    .mean()
    .reset_index()
)

# Attach the neighborhood mean to every post. Both frames carry a
# `sentiment_score` column, so the merge suffixes them: the per-post value ends
# up as `sentiment_score_x`, and the neighborhood mean (`sentiment_score_y`) is
# renamed to `Average_Sentiment`.
reddit_df = pd.merge(reddit_df, rgrouped, on='neighborhood_assigned', how='left')
reddit_df = reddit_df.rename(columns={'sentiment_score_y': 'Average_Sentiment'})

# Join apartment data with neighborhood sentiment scores
apt_df = pd.merge(
    apt_df,
    rgrouped,
    left_on='neighborhood_group',
    right_on='neighborhood_assigned',
    how='left'
)

# Drop the duplicate neighborhood column if you don't need it
apt_df = apt_df.drop(columns=['neighborhood_assigned'], errors='ignore')

# Fail with a clear message instead of skipping the normalization and then
# hitting a bare KeyError in the summary line below.
if 'sentiment_score' not in apt_df.columns:
    raise KeyError(
        "'sentiment_score' is missing after the merge; the apartments file "
        "probably already has a column with that name"
    )

# Count real matches BEFORE filling gaps; counting after fillna() would always
# report every apartment as having a score.
apartments_with_sentiment = int(apt_df['sentiment_score'].notna().sum())

# Normalize sentiment scores to 0-100 scale
# Assuming sentiment_score is between -1 and 1 (adjust if different)
apt_df['sentiment_score'] = ((apt_df['sentiment_score'] + 1) / 2) * 100
apt_df['sentiment_score'] = apt_df['sentiment_score'].fillna(50)  # Fill missing with neutral

print(f"Apartments with sentiment scores: {apartments_with_sentiment}/{len(apt_df)}")
apt_df.head()

In [None]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import time

# Initialize geocoder
# Single Nominatim client (from geopy), configured with a custom user_agent
# string and shared by geocode_address() for every lookup.
geolocator = Nominatim(user_agent="apartment_geocoder")

def geocode_address(address):
    """Geocode an address and return coordinates as a tuple.

    Args:
        address: Address to look up with the module-level `geolocator`.

    Returns:
        (latitude, longitude) when the geocoder finds a match, otherwise None.
        None is also returned, without contacting the service, when the
        address is missing (None / NaN) or a blank string, and when the lookup
        raises; lookup errors are printed rather than propagated.
    """
    # Missing values (None, or NaN from an empty CSV cell) and blank strings
    # cannot be geocoded; skip the network round-trip. NaN is the only value
    # that is not equal to itself.
    if address is None or address != address:
        return None
    if isinstance(address, str) and not address.strip():
        return None

    try:
        location = geolocator.geocode(address, timeout=10)
        if location:
            return (location.latitude, location.longitude)
        else:
            return None
    except (GeocoderTimedOut, GeocoderServiceError) as e:
        print(f"Error geocoding {address}: {e}")
        return None
    except Exception as e:
        # Best effort: one bad row must not abort the whole geocoding loop.
        print(f"Unexpected error for {address}: {e}")
        return None

# Geocode every apartment address, one lookup at a time.
# The one-second pause after each lookup respects the Nominatim rate limit
# (1 request per second).
coords_list = []
for position, address in enumerate(apt_df['address'], start=1):
    print(f"Geocoding {position}/{len(apt_df)}: {address}")
    coords_list.append(geocode_address(address))
    time.sleep(1)

apt_df['coords'] = coords_list

# Optional: split the (lat, lon) tuples into separate columns; failed lookups
# (coords is None) become missing values in both.
apt_df['latitude'] = apt_df['coords'].apply(lambda pair: pair[0] if pair else None)
apt_df['longitude'] = apt_df['coords'].apply(lambda pair: pair[1] if pair else None)

preview_columns = ['address', 'coords', 'latitude', 'longitude']
print(apt_df[preview_columns].head())


# Remove records whose address could not be geocoded

apt_df = apt_df.dropna(subset=['latitude', 'longitude'])
apt_df.head()

In [None]:
# Filter for distance to apartment
# 5 Mile threshold

from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance in miles between two points given in decimal degrees.

    Note the argument order: longitude first, then latitude, for each point.
    """
    # Work in radians from here on
    lam1, phi1, lam2, phi2 = (radians(deg) for deg in (lon1, lat1, lon2, lat2))

    # Haversine term for the angular separation
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    hav = sin(delta_phi/2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam/2)**2

    # Central angle scaled by the earth radius in miles (3956)
    return 3956 * (2 * asin(sqrt(hav)))

# Apartments farther than this from the user's chosen point (lat, lng) are dropped.
SEARCH_RADIUS_MILES = 5.0

# Create new dataframe with only nearby apartments
nearby_apartments = apt_df.copy()

# Calculate distance for each apartment.
# The column is built as an explicit float Series rather than with
# DataFrame.apply(axis=1): on an empty frame apply can hand back a DataFrame
# instead of a Series, which cannot be assigned to a single column.
nearby_apartments['distance_miles'] = pd.Series(
    [
        haversine(apt_lng, apt_lat, lng, lat)
        if pd.notna(apt_lat) and pd.notna(apt_lng)
        else float('nan')
        for apt_lat, apt_lng in zip(nearby_apartments['latitude'],
                                    nearby_apartments['longitude'])
    ],
    index=nearby_apartments.index,
    dtype='float64',
)

# Filter to only apartments within the radius (rows with NaN distance drop out too)
nearby_apartments = nearby_apartments[nearby_apartments['distance_miles'] <= SEARCH_RADIUS_MILES]

# Sort by distance
nearby_apartments = nearby_apartments.sort_values('distance_miles').reset_index(drop=True)

print(f"\nFound {len(nearby_apartments)} apartments within {SEARCH_RADIUS_MILES:g} miles")
print(nearby_apartments[['address', 'distance_miles']])

# Original apt_df is unchanged

Claude Categories:
Essential Services:

grocery_or_supermarket - Grocery stores
pharmacy - Pharmacies
hospital - Hospitals/Emergency care
doctor - Medical clinics
dentist - Dental offices
veterinary_care - Vet clinics (if you have pets)

Food & Social:

restaurant - Restaurants
cafe - Coffee shops
bar - Bars/Pubs

Fitness & Wellness:

gym - Gyms/Fitness centers
park - Parks & green spaces
spa - Spas/Wellness centers

Shopping:

shopping_mall - Shopping centers
grocery_or_supermarket - (already listed above)
convenience_store - Quick shopping

Transportation:

transit_station - Public transit stops
bus_station - Bus stops
subway_station - Metro/Subway
gas_station - Gas stations

Entertainment & Culture:

movie_theater - Cinemas
museum - Museums
library - Libraries
night_club - Nightlife

Education (if relevant):

school - Schools
university - Colleges/Universities

Convenience:

bank - Banks
atm - ATMs
post_office - Post offices
laundry - Laundromats

Safety:

police - Police stations
fire_station - Fire stations

In [None]:
# scoring system.

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load a pretrained sentence transformer model for semantic similarity.
# Created once at notebook level and shared by match_user_input_to_categories()
# and calculate_amenity_score(), which both call model.encode().
model = SentenceTransformer('all-MiniLM-L6-v2')

# Google Places types that user-entered amenities are matched against
# (same types, in the same order, as the search performed earlier).
google_categories = [
    # Essential services
    'grocery_or_supermarket',
    'pharmacy',
    'hospital',
    'doctor',
    'dentist',
    'veterinary_care',
    # Food & social
    'restaurant',
    'cafe',
    'bar',
    # Fitness & wellness
    'gym',
    'park',
    'spa',
    # Shopping
    'shopping_mall',
    'convenience_store',
    # Transportation
    'transit_station',
    'bus_station',
    'subway_station',
    'gas_station',
    # Entertainment & culture
    'movie_theater',
    'museum',
    'library',
    'night_club',
    # Education
    'school',
    'university',
    # Convenience
    'bank',
    'atm',
    'post_office',
    'laundry',
    # Safety
    'police',
    'fire_station',
]

def match_user_input_to_categories(user_input, categories,
                                   low_threshold=0.5,
                                   high_threshold=0.7,
                                   super_threshold=0.85):
    """
    Match user input to Google Places categories using semantic similarity.

    Args:
        user_input: free-text amenity description (e.g. "coffee shop")
        categories: sequence of category names to compare against
        low_threshold: similarities below this are discarded
        high_threshold: similarities at or above this get a 1.2 bonus
        super_threshold: similarities at or above this get a 1.5 bonus

    Returns:
        list of tuples: (category, confidence_score, bonus_multiplier),
        sorted by confidence_score, highest first. confidence_score is a plain
        Python float. Returns [] when user_input is None/blank or categories
        is empty.
    """
    # Nothing to compare: return early instead of handing empty input to the
    # encoder / cosine_similarity.
    if user_input is None or not str(user_input).strip() or len(categories) == 0:
        return []

    categories = list(categories)

    # Encode user input and categories
    user_embedding = model.encode([user_input])
    category_embeddings = model.encode(categories)

    # Calculate cosine similarity (one row: user input vs. every category)
    similarities = cosine_similarity(user_embedding, category_embeddings)[0]

    # Filter and assign bonuses based on confidence
    matches = []
    for category, raw_score in zip(categories, similarities):
        # Plain float keeps NumPy scalar types out of the returned tuples
        score = float(raw_score)
        if score < low_threshold:
            continue  # Ignore low confidence
        if score >= super_threshold:
            bonus = 1.5  # Super confident bonus
        elif score >= high_threshold:
            bonus = 1.2  # High confident bonus
        else:
            bonus = 1.0  # Regular match
        matches.append((category, score, bonus))

    return sorted(matches, key=lambda match: match[1], reverse=True)


def calculate_amenity_score(apt_coords, places_data, user_input, matched_categories,
                            max_distance_miles=5.0,
                            category_weight=0.7,
                            name_weight=0.3):
    """
    Calculate amenity score based on proximity to matched places.
    Uses BOTH category matching AND name similarity for better classification.

    Args:
        apt_coords: (lat, lng) tuple for apartment
        places_data: dict with 'results' containing Google Places results
        user_input: original user query (e.g., "burger", "italian food")
        matched_categories: list of (category, confidence, bonus) tuples
        max_distance_miles: maximum distance to consider
        category_weight: how much category match matters (default 0.7)
        name_weight: how much name match matters (default 0.3)

    Returns:
        float: amenity score (0-100); 0 when there are no matched categories,
        no apartment coordinates, or no matching place within range.
    """
    from math import radians, cos, sin, asin, sqrt

    def haversine(lon1, lat1, lon2, lat2):
        """Great-circle distance in miles; arguments are decimal degrees."""
        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
        c = 2 * asin(sqrt(a))
        return 3956 * c  # miles

    if not matched_categories or not apt_coords:
        return 0

    apt_lat, apt_lng = apt_coords

    # Pass 1 (cheap): keep only places that match one of the categories AND lie
    # within range, so no embedding work is spent on places that cannot score.
    candidates = []
    for place in places_data['results']:
        place_types = place.get('types', [])

        # First matching category wins (matched_categories order is preserved)
        category_match = next(
            ((confidence, bonus)
             for category, confidence, bonus in matched_categories
             if category in place_types),
            None,
        )
        if category_match is None:
            continue

        location = place['geometry']['location']
        distance = haversine(apt_lng, apt_lat, location['lng'], location['lat'])
        if distance > max_distance_miles:
            continue

        candidates.append((place.get('name', ''), distance, category_match))

    if not candidates:
        return 0

    # Pass 2 (expensive): embed the query once and all surviving names in a
    # single batch, then compare the query against every name at once.
    user_embedding = model.encode([user_input])
    name_embeddings = model.encode([name for name, _, _ in candidates])
    name_similarities = cosine_similarity(user_embedding, name_embeddings)[0]

    weighted_scores = []
    for (_, distance, (category_confidence, category_bonus)), raw_similarity in zip(
            candidates, name_similarities):
        name_similarity = float(raw_similarity)

        # Combined confidence: mix of category confidence and name similarity
        combined_confidence = (
            category_confidence * category_weight +
            name_similarity * name_weight
        )

        # Bonus multiplier: higher if both name and category match well
        if name_similarity > 0.7 and category_confidence > 0.7:
            final_bonus = category_bonus * 1.5  # Both match really well
        elif name_similarity > 0.5:
            final_bonus = category_bonus * 1.2  # Name matches decently
        else:
            final_bonus = category_bonus  # Just category match

        # Closer places score higher: 100 at the apartment, 0 at the range limit
        distance_score = (max_distance_miles - distance) / max_distance_miles * 100
        weighted_scores.append(distance_score * final_bonus * combined_confidence)

    # Average the three best places, then cap at 100 (bonuses can push a
    # single place above 100)
    top_scores = sorted(weighted_scores, reverse=True)[:3]
    amenity_score = sum(top_scores) / len(top_scores)
    return min(amenity_score, 100)


def calculate_final_apartment_score(apt_row, places_data, user_amenities,
                                    sentiment_weight=0.4,
                                    distance_weight=0.3,
                                    amenity_weight=0.3,
                                    max_distance_miles=5.0):
    """
    Calculate final apartment score combining sentiment, distance, and amenities.

    Args:
        apt_row: DataFrame row with apartment data; must provide
            'distance_miles' and 'coords', and may provide 'sentiment_score'
        places_data: Google Places API results
        user_amenities: list of amenities user cares about (e.g., ['elementary schools', 'parks'])
        sentiment_weight: weight for sentiment score
        distance_weight: weight for distance score
        amenity_weight: weight for amenities score
        max_distance_miles: max distance for scoring

    Returns:
        dict with breakdown of scores: 'final_score', 'sentiment_score',
        'distance_score', 'amenity_score' and 'individual_amenity_scores'.
        Amenities that match no Google category are left out of the amenity
        average rather than counted as zero.
    """
    # 1. Sentiment score (assume already 0-100); a missing or NaN value falls
    #    back to the neutral 50 so it cannot turn the final score into NaN.
    sentiment_score = apt_row.get('sentiment_score', 50)
    if pd.isna(sentiment_score):
        sentiment_score = 50

    # 2. Distance score: 100 at the search point, falling to 0 at
    #    max_distance_miles. Floored at 0 so an apartment beyond the limit is
    #    not given a negative score; an unknown distance earns no credit.
    distance = apt_row['distance_miles']
    if pd.isna(distance):
        distance_score = 0.0
    else:
        distance_score = max(
            0.0,
            (max_distance_miles - distance) / max_distance_miles * 100,
        )

    # 3. Amenity scores
    apt_coords = apt_row['coords']
    amenity_scores = []

    for amenity in user_amenities:
        # Match user amenity to Google categories
        matches = match_user_input_to_categories(amenity, google_categories)

        if matches:
            # Score this amenity (the raw text is passed on for name matching)
            score = calculate_amenity_score(apt_coords, places_data, amenity, matches, max_distance_miles)
            amenity_scores.append(score)

    # Average amenity scores
    avg_amenity_score = float(np.mean(amenity_scores)) if amenity_scores else 0

    # 4. Calculate weighted final score
    final_score = (
        sentiment_score * sentiment_weight +
        distance_score * distance_weight +
        avg_amenity_score * amenity_weight
    )

    return {
        'final_score': final_score,
        'sentiment_score': sentiment_score,
        'distance_score': distance_score,
        'amenity_score': avg_amenity_score,
        'individual_amenity_scores': amenity_scores
    }


# Example usage:

# Amenities this renter cares about
user_amenities = ['College', 'Thrift Shop', 'Nightclub']

# Score every nearby apartment against the Google Places data gathered by the
# search cell, keeping the address next to its score breakdown.
scored_apartments = [
    {
        'address': apt['address'],
        **calculate_final_apartment_score(
            apt,
            data,
            user_amenities,
            sentiment_weight=0.4,
            distance_weight=0.3,
            amenity_weight=0.3,
        ),
    }
    for _, apt in nearby_apartments.iterrows()
]

# One row per apartment, best final score first
results_df = pd.DataFrame(scored_apartments).sort_values('final_score', ascending=False)

print(results_df)

In [None]:
# Example usage:

# Amenities this renter cares about
user_amenities = ['Elementary School', 'pediatrician', 'Grocery Store']

# Score every nearby apartment against the Google Places data gathered by the
# search cell, keeping the address next to its score breakdown.
scored_apartments = [
    {
        'address': apt['address'],
        **calculate_final_apartment_score(
            apt,
            data,
            user_amenities,
            sentiment_weight=0.4,
            distance_weight=0.3,
            amenity_weight=0.3,
        ),
    }
    for _, apt in nearby_apartments.iterrows()
]

# One row per apartment, best final score first
results_df = pd.DataFrame(scored_apartments).sort_values('final_score', ascending=False)

print(results_df)

In [None]:

# Example usage:

# User inputs what they care about
user_amenities = ['College', 'Thrift Shop', 'Nightclub']

# Score all apartments
scored_apartments = []
for idx, apt in nearby_apartments.iterrows():
    scores = calculate_final_apartment_score(
        apt,
        data,  # Google Places data from your API call
        user_amenities,
        sentiment_weight=0.4,
        distance_weight=0.3,
        amenity_weight=0.3
    )

    scored_apartments.append({
        'address': apt['address'],
        **scores
    })

# Convert to DataFrame and sort
college_results_df = pd.DataFrame(scored_apartments)
college_results_df = college_results_df.sort_values('final_score', ascending=False)

# Show the frame this cell just built; printing results_df here displayed the
# scores left over from an earlier cell instead.
print(college_results_df)

In [None]:
# Write both score tables to CSV (without the index column), then confirm.
for frame, filename in (
    (college_results_df, 'college_results.csv'),
    (results_df, 'results.csv'),
):
    frame.to_csv(filename, index=False)
print('done')

In [None]:
# Display one Average_Sentiment value per neighborhood. The column holds the
# neighborhood mean that was merged onto every post in the sentiment cell, so
# averaging it within each group collapses the per-post rows to one row each.
reddit_df.groupby('neighborhood_assigned')['Average_Sentiment'].mean()