**5.1. Building a chatbot with LangChain**

In [2]:
!pip install -U -q llama-index llama-index-llms-groq


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import os
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.llms.groq import Groq

In [None]:
os.environ["GROQ_API_KEY"] = "your_groq_api_key"

In [6]:
llm = Groq(
    model="llama-3.3-70b-versatile",
    temperature=0
)

In [7]:
def chat():
    history = [
        ChatMessage(role=MessageRole.SYSTEM, content="You are a helpful assistant. Be concise and accurate.")
    ]

    print("LlamaIndex Chatbot. Type 'exit' to quit.\n")

    while True:
        user_input = input("You: ").strip()
        if user_input.lower() in {"exit", "quit"}:
            break

        # add user message
        history.append(ChatMessage(role=MessageRole.USER, content=user_input))

        # call LLM with full history
        resp = llm.chat(messages=history)  # ChatResponse
        answer = resp.message.content

        print(f"Bot: {answer}\n")

        # add assistant message
        history.append(ChatMessage(role=MessageRole.ASSISTANT, content=answer))

        print("-"*80)


In [9]:
chat()

LlamaIndex Chatbot. Type 'exit' to quit.



You:  What is Deep Learning?


Bot: Deep Learning is a subset of Machine Learning that uses neural networks with multiple layers to analyze and interpret data. It's inspired by the human brain's structure and function, and is particularly useful for tasks like image recognition, speech recognition, and natural language processing.

--------------------------------------------------------------------------------


You:  Give some applications.


Bot: Some applications of Deep Learning include:

1. **Image Recognition**: Self-driving cars, facial recognition, object detection.
2. **Speech Recognition**: Virtual assistants (e.g. Siri, Alexa), voice-to-text systems.
3. **Natural Language Processing**: Language translation, text summarization, sentiment analysis.
4. **Medical Diagnosis**: Disease detection, medical image analysis, personalized medicine.
5. **Autonomous Systems**: Robotics, drones, autonomous vehicles.

--------------------------------------------------------------------------------


You:  quit


In [2]:
"""
Temporary Location Anchor Manager for Restaurant Ranking
Handles cold start, multi-location users, and privacy-preserving location inference
"""

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from collections import defaultdict
import json

# ============================================================================
# CORE DATA STRUCTURES
# ============================================================================

@dataclass
class LocationAnchor:
    """Represents a temporary location anchor for a user"""
    anchor_id: str
    user_id: str
    centroid_lat: float
    centroid_lon: float
    earliest_visit: datetime
    latest_visit: datetime
    visit_count: int
    spatial_std: float  # Standard deviation of locations (if merged)
    source_visits: List[Tuple[str, float, float, datetime]]  # (business_id, lat, lon, timestamp)
    
    def to_dict(self):
        return {
            'anchor_id': self.anchor_id,
            'user_id': self.user_id,
            'centroid_lat': self.centroid_lat,
            'centroid_lon': self.centroid_lon,
            'earliest_visit': self.earliest_visit.isoformat(),
            'latest_visit': self.latest_visit.isoformat(),
            'visit_count': self.visit_count,
            'spatial_std': self.spatial_std,
            'source_visits': [(bid, lat, lon, ts.isoformat()) 
                             for bid, lat, lon, ts in self.source_visits]
        }
    
    @classmethod
    def from_dict(cls, data):
        data['earliest_visit'] = datetime.fromisoformat(data['earliest_visit'])
        data['latest_visit'] = datetime.fromisoformat(data['latest_visit'])
        data['source_visits'] = [(bid, lat, lon, datetime.fromisoformat(ts)) 
                                 for bid, lat, lon, ts in data['source_visits']]
        return cls(**data)


@dataclass
class RankingContext:
    """Context for restaurant ranking"""
    mode: str  # 'global_popular', 'local_explore', 'personalized'
    signal_strength: str  # 'no_history', 'single_visit', 'established'
    anchors: List[Tuple[LocationAnchor, float]]  # (anchor, strength_score)
    primary_radius_km: float
    extended_radius_km: float
    min_personalization_score: float  # For extended radius


# ============================================================================
# DISTANCE UTILITIES
# ============================================================================

def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    Calculate distance between two points on Earth in kilometers
    """
    R = 6371  # Earth radius in km
    
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    delta_lat = np.radians(lat2 - lat1)
    delta_lon = np.radians(lon2 - lon1)
    
    a = np.sin(delta_lat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
    return R * c


def calculate_spatial_std(locations: List[Tuple[float, float]]) -> float:
    """
    Calculate spatial standard deviation of a cluster of locations
    """
    if len(locations) <= 1:
        return 0.0
    
    lats = [loc[0] for loc in locations]
    lons = [loc[1] for loc in locations]
    
    # Convert to km using approximation
    lat_std = np.std(lats) * 111  # 1 degree lat â‰ˆ 111 km
    lon_std = np.std(lons) * 111 * np.cos(np.radians(np.mean(lats)))
    
    return np.sqrt(lat_std**2 + lon_std**2)


# ============================================================================
# ANCHOR STRENGTH CALCULATION
# ============================================================================

def calculate_anchor_strength(
    days_since_last_visit: float,
    visit_count: int,
    spatial_tightness: float,
    half_life_days: float = 90.0
) -> float:
    """
    Calculate strength score for an anchor
    
    Args:
        days_since_last_visit: Days since the most recent visit
        visit_count: Number of visits contributing to this anchor
        spatial_tightness: Spatial std dev in km (lower = tighter cluster)
        half_life_days: Time for anchor to decay to 50% strength
    
    Returns:
        Strength score (0 to ~10, higher = stronger)
    """
    # Exponential time decay
    time_weight = np.exp(-days_since_last_visit / half_life_days)
    
    # Visit frequency boost (log scale to prevent outliers)
    frequency_weight = np.log1p(visit_count)
    
    # Spatial confidence (penalize loose clusters)
    spatial_weight = 1.0 / (1.0 + spatial_tightness)
    
    return time_weight * frequency_weight * spatial_weight


# ============================================================================
# ANCHOR MERGING LOGIC
# ============================================================================

def should_merge_anchors(
    anchor1: LocationAnchor,
    anchor2: LocationAnchor,
    max_distance_km: float = 50.0,
    min_time_spread_days: int = 7,
    min_total_visits: int = 3
) -> bool:
    """
    Determine if two anchors should be merged
    
    Criteria:
    1. Geographic proximity (within max_distance_km)
    2. Temporal spread (not same-day burst)
    3. Minimum visit count (meaningful pattern)
    """
    # Calculate distance between centroids
    distance = haversine_distance(
        anchor1.centroid_lat, anchor1.centroid_lon,
        anchor2.centroid_lat, anchor2.centroid_lon
    )
    
    if distance > max_distance_km:
        return False
    
    # Check temporal spread
    all_visits = sorted([v[3] for v in anchor1.source_visits + anchor2.source_visits])
    time_spread = (all_visits[-1] - all_visits[0]).days
    
    if time_spread < min_time_spread_days:
        return False
    
    # Check total visits
    total_visits = anchor1.visit_count + anchor2.visit_count
    
    if total_visits < min_total_visits:
        return False
    
    return True


def merge_anchors(anchor1: LocationAnchor, anchor2: LocationAnchor) -> LocationAnchor:
    """
    Merge two anchors into a single anchor
    """
    import uuid
    
    combined_visits = anchor1.source_visits + anchor2.source_visits
    
    # Calculate new centroid (weighted by visit count per location)
    all_lats = [v[1] for v in combined_visits]
    all_lons = [v[2] for v in combined_visits]
    
    new_centroid_lat = np.mean(all_lats)
    new_centroid_lon = np.mean(all_lons)
    
    # Calculate spatial std
    locations = [(v[1], v[2]) for v in combined_visits]
    spatial_std = calculate_spatial_std(locations)
    
    # Time boundaries
    all_timestamps = [v[3] for v in combined_visits]
    earliest = min(all_timestamps)
    latest = max(all_timestamps)
    
    return LocationAnchor(
        anchor_id=str(uuid.uuid4()),
        user_id=anchor1.user_id,
        centroid_lat=new_centroid_lat,
        centroid_lon=new_centroid_lon,
        earliest_visit=earliest,
        latest_visit=latest,
        visit_count=len(combined_visits),
        spatial_std=spatial_std,
        source_visits=combined_visits
    )


# ============================================================================
# ANCHOR MANAGER - MAIN CLASS
# ============================================================================

class LocationAnchorManager:
    """
    Main class for managing user location anchors
    """
    
    def __init__(
        self,
        merging_distance_km: float = 50.0,
        primary_radius_km: float = 40.0,
        extended_radius_km: float = 100.0,
        anchor_max_age_days: int = 365,
        min_anchor_strength: float = 0.1
    ):
        self.merging_distance_km = merging_distance_km
        self.primary_radius_km = primary_radius_km
        self.extended_radius_km = extended_radius_km
        self.anchor_max_age_days = anchor_max_age_days
        self.min_anchor_strength = min_anchor_strength
        
        # Storage: user_id -> List[LocationAnchor]
        self.user_anchors: Dict[str, List[LocationAnchor]] = defaultdict(list)
    
    def build_anchors_from_dataframe(self, df: pd.DataFrame):
        """
        Build anchors from tips/reviews dataframe
        
        Expected columns:
        - user_id
        - business_id
        - latitude
        - longitude
        - tip_date or reviews_date
        """
        import uuid
        
        # Determine date column
        date_col = 'tip_date' if 'tip_date' in df.columns else 'reviews_date'
        
        # Sort by user and date
        df_sorted = df.sort_values(['user_id', date_col]).copy()
        
        print(f"Building anchors for {df_sorted['user_id'].nunique()} users...")
        
        for user_id, user_df in df_sorted.groupby('user_id'):
            user_anchors = []
            
            for _, row in user_df.iterrows():
                # Create initial anchor from each visit
                anchor = LocationAnchor(
                    anchor_id=str(uuid.uuid4()),
                    user_id=user_id,
                    centroid_lat=row['latitude'],
                    centroid_lon=row['longitude'],
                    earliest_visit=pd.to_datetime(row[date_col]),
                    latest_visit=pd.to_datetime(row[date_col]),
                    visit_count=1,
                    spatial_std=0.0,
                    source_visits=[(
                        row['business_id'],
                        row['latitude'],
                        row['longitude'],
                        pd.to_datetime(row[date_col])
                    )]
                )
                user_anchors.append(anchor)
            
            # Attempt to merge nearby anchors
            merged_anchors = self._merge_anchor_list(user_anchors)
            self.user_anchors[user_id] = merged_anchors
        
        print(f"Created {sum(len(anchors) for anchors in self.user_anchors.values())} total anchors")
        print(f"Average anchors per user: {np.mean([len(a) for a in self.user_anchors.values()]):.2f}")
    
    def _merge_anchor_list(self, anchors: List[LocationAnchor]) -> List[LocationAnchor]:
        """
        Iteratively merge anchors that meet merging criteria
        """
        if len(anchors) <= 1:
            return anchors
        
        merged = True
        current_anchors = anchors.copy()
        
        while merged:
            merged = False
            new_anchors = []
            used_indices = set()
            
            for i in range(len(current_anchors)):
                if i in used_indices:
                    continue
                
                merged_with_any = False
                for j in range(i + 1, len(current_anchors)):
                    if j in used_indices:
                        continue
                    
                    if should_merge_anchors(
                        current_anchors[i],
                        current_anchors[j],
                        self.merging_distance_km
                    ):
                        # Merge i and j
                        merged_anchor = merge_anchors(current_anchors[i], current_anchors[j])
                        new_anchors.append(merged_anchor)
                        used_indices.add(i)
                        used_indices.add(j)
                        merged_with_any = True
                        merged = True
                        break
                
                if not merged_with_any:
                    new_anchors.append(current_anchors[i])
                    used_indices.add(i)
            
            current_anchors = new_anchors
        
        return current_anchors
    
    def get_active_anchors(
        self,
        user_id: str,
        current_time: datetime
    ) -> List[Tuple[LocationAnchor, float]]:
        """
        Get active anchors with strength scores for a user
        
        Returns:
            List of (anchor, strength_score) tuples, sorted by strength (descending)
        """
        if user_id not in self.user_anchors:
            return []
        
        active_anchors = []
        
        for anchor in self.user_anchors[user_id]:
            days_since_visit = (current_time - anchor.latest_visit).days
            
            # Filter out very old anchors
            if days_since_visit > self.anchor_max_age_days:
                continue
            
            # Calculate strength
            strength = calculate_anchor_strength(
                days_since_last_visit=days_since_visit,
                visit_count=anchor.visit_count,
                spatial_tightness=anchor.spatial_std
            )
            
            # Only include if above minimum threshold
            if strength >= self.min_anchor_strength:
                active_anchors.append((anchor, strength))
        
        # Sort by strength (descending)
        active_anchors.sort(key=lambda x: x[1], reverse=True)
        
        # Return top 3 strongest anchors
        return active_anchors[:3]
    
    def get_ranking_context(
        self,
        user_id: str,
        current_time: datetime
    ) -> RankingContext:
        """
        Get ranking context for restaurant recommendation
        
        This determines:
        - What mode to use (global/local/personalized)
        - What anchors to use
        - What radius to search
        """
        anchors = self.get_active_anchors(user_id, current_time)
        
        if len(anchors) == 0:
            # TRUE cold start - no location anchors yet
            return RankingContext(
                mode='global_popular',
                signal_strength='no_history',
                anchors=[],
                primary_radius_km=0,
                extended_radius_km=0,
                min_personalization_score=0.0
            )
        
        elif len(anchors) == 1 and anchors[0][0].visit_count == 1:
            # Weak signal - single visit only
            return RankingContext(
                mode='local_explore',
                signal_strength='single_visit',
                anchors=anchors,
                primary_radius_km=self.primary_radius_km * 1.5,  # Wider search
                extended_radius_km=self.extended_radius_km,
                min_personalization_score=0.85
            )
        
        else:
            # Strong signal - multiple visits or merged anchors
            return RankingContext(
                mode='personalized',
                signal_strength='established',
                anchors=anchors,
                primary_radius_km=self.primary_radius_km,
                extended_radius_km=self.extended_radius_km,
                min_personalization_score=0.90
            )
    
    def get_candidate_restaurants(
        self,
        user_id: str,
        all_restaurants: pd.DataFrame,
        current_time: datetime,
        personalization_scores: Optional[Dict[str, float]] = None
    ) -> Dict[str, pd.DataFrame]:
        """
        Get candidate restaurants for ranking
        
        Args:
            user_id: User identifier
            all_restaurants: DataFrame with columns [business_id, latitude, longitude, ...]
            current_time: Current timestamp
            personalization_scores: Optional dict of {business_id: personalization_score}
        
        Returns:
            Dictionary with 'primary' and 'extended' candidate DataFrames
        """
        context = self.get_ranking_context(user_id, current_time)
        
        if context.mode == 'global_popular':
            # Cold start - return popular restaurants (no location filter)
            return {
                'primary': all_restaurants.copy(),
                'extended': pd.DataFrame(),
                'context': context
            }
        
        # Filter restaurants by distance to anchors
        primary_candidates = []
        extended_candidates = []
        
        for _, restaurant in all_restaurants.iterrows():
            rest_lat = restaurant['latitude']
            rest_lon = restaurant['longitude']
            rest_id = restaurant['business_id']
            
            # Find minimum distance to any anchor
            min_distance = float('inf')
            for anchor, strength in context.anchors:
                dist = haversine_distance(
                    rest_lat, rest_lon,
                    anchor.centroid_lat, anchor.centroid_lon
                )
                min_distance = min(min_distance, dist)
            
            # Add distance column
            restaurant_with_dist = restaurant.copy()
            restaurant_with_dist['min_anchor_distance_km'] = min_distance
            
            # Classify as primary or extended
            if min_distance <= context.primary_radius_km:
                primary_candidates.append(restaurant_with_dist)
            
            elif min_distance <= context.extended_radius_km:
                # Only include if personalization score is high enough
                if personalization_scores is not None:
                    pers_score = personalization_scores.get(rest_id, 0.0)
                    if pers_score >= context.min_personalization_score:
                        extended_candidates.append(restaurant_with_dist)
                else:
                    # Without personalization scores, include all in extended
                    extended_candidates.append(restaurant_with_dist)
        
        return {
            'primary': pd.DataFrame(primary_candidates),
            'extended': pd.DataFrame(extended_candidates),
            'context': context
        }
    
    def save_anchors(self, filepath: str):
        """Save anchors to JSON file"""
        data = {
            user_id: [anchor.to_dict() for anchor in anchors]
            for user_id, anchors in self.user_anchors.items()
        }
        
        with open(filepath, 'w') as f:
            json.dump(data, f, indent=2)
        
        print(f"Saved anchors for {len(data)} users to {filepath}")
    
    def load_anchors(self, filepath: str):
        """Load anchors from JSON file"""
        with open(filepath, 'r') as f:
            data = json.load(f)
        
        self.user_anchors = {
            user_id: [LocationAnchor.from_dict(a) for a in anchors]
            for user_id, anchors in data.items()
        }
        
        print(f"Loaded anchors for {len(self.user_anchors)} users from {filepath}")
    
    def get_statistics(self) -> Dict:
        """Get statistics about the anchor system"""
        all_anchors = [a for anchors in self.user_anchors.values() for a in anchors]
        
        if not all_anchors:
            return {}
        
        return {
            'total_users': len(self.user_anchors),
            'total_anchors': len(all_anchors),
            'avg_anchors_per_user': np.mean([len(a) for a in self.user_anchors.values()]),
            'users_with_single_anchor': sum(1 for a in self.user_anchors.values() if len(a) == 1),
            'users_with_multiple_anchors': sum(1 for a in self.user_anchors.values() if len(a) > 1),
            'avg_visits_per_anchor': np.mean([a.visit_count for a in all_anchors]),
            'avg_spatial_std_km': np.mean([a.spatial_std for a in all_anchors]),
            'max_spatial_std_km': np.max([a.spatial_std for a in all_anchors]),
        }


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == "__main__":
    # Example usage
    
    # Create sample data
    sample_data = pd.DataFrame({
        'user_id': ['user1', 'user1', 'user1', 'user2', 'user2', 'user3'],
        'business_id': ['rest1', 'rest2', 'rest3', 'rest4', 'rest5', 'rest6'],
        'latitude': [40.7128, 40.7580, 40.7489, 43.6532, 43.6511, 34.0522],
        'longitude': [-74.0060, -73.9855, -73.9680, -79.3832, -79.3470, -118.2437],
        'tip_date': pd.to_datetime([
            '2024-01-15', '2024-02-20', '2024-03-10',
            '2024-01-10', '2024-06-15',
            '2024-05-01'
        ])
    })
    
    # Initialize manager
    manager = LocationAnchorManager(
        merging_distance_km=50.0,
        primary_radius_km=40.0,
        extended_radius_km=100.0
    )
    
    # Build anchors
    manager.build_anchors_from_dataframe(sample_data)
    
    # Get ranking context for a user
    current_time = datetime(2024, 12, 1)
    context = manager.get_ranking_context('user1', current_time)
    
    print(f"\nRanking Context for user1:")
    print(f"Mode: {context.mode}")
    print(f"Signal Strength: {context.signal_strength}")
    print(f"Number of Active Anchors: {len(context.anchors)}")
    
    # Show statistics
    stats = manager.get_statistics()
    print(f"\nSystem Statistics:")
    for key, value in stats.items():
        print(f"  {key}: {value}")

Building anchors for 3 users...
Created 6 total anchors
Average anchors per user: 2.00

Ranking Context for user1:
Mode: global_popular
Signal Strength: no_history
Number of Active Anchors: 0

System Statistics:
  total_users: 3
  total_anchors: 6
  avg_anchors_per_user: 2.0
  users_with_single_anchor: 1
  users_with_multiple_anchors: 2
  avg_visits_per_anchor: 1.0
  avg_spatial_std_km: 0.0
  max_spatial_std_km: 0.0


In [None]:
sample_data = pd.DataFrame({
        'user_id': ['user1', 'user1', 'user1', 'user2', 'user2', 'user3'],
        'business_id': ['rest1', 'rest2', 'rest3', 'rest4', 'rest5', 'rest6'],
        'latitude': [40.7128, 40.7580, 40.7489, 43.6532, 43.6511, 34.0522],
        'longitude': [-74.0060, -73.9855, -73.9680, -79.3832, -79.3470, -118.2437],
        'tip_date': pd.to_datetime([
            '2024-01-15', '2024-02-20', '2024-03-10',
            '2024-01-10', '2024-06-15',
            '2024-05-01'
        ])
    })
    
    # Initialize manager
manager = LocationAnchorManager(
        merging_distance_km=50.0,
        primary_radius_km=40.0,
        extended_radius_km=100.0
    )
    
    # Build anchors
    #     
manager.build_anchors_from_dataframe(sample_data)
    
    # Get ranking context for a user
current_time = datetime(2024, 12, 1)
context = manager.get_ranking_context('user1', current_time)
    
print(f"\nRanking Context for user1:")
print(f"Mode: {context.mode}")
print(f"Signal Strength: {context.signal_strength}")
print(f"Number of Active Anchors: {len(context.anchors)}")
    
    # Show statistics
stats = manager.get_statistics()
print(f"\nSystem Statistics:")
for key, value in stats.items():
    print(f"  {key}: {value}")