# Hospital Data Analysis & Medical Information Processing

This notebook analyzes and processes medical information from Nairobi Hospital and Kenyatta National Hospital to create a comprehensive dataset for the Hospital AI Agent system. We'll collect, clean, and structure real hospital data for intelligent medical information assistance.

In [None]:
# IMPORT LIBRARIES FOR HOSPITAL DATA PROCESSING
import csv
import glob
import json
import os
import random
import re
import time
import traceback
from datetime import datetime
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Startup banner: confirms the import cell ran and states the notebook's goal.
for banner_line in (
    "Hospital Data Analysis System Initialized",
    "Target: Medical information from Nairobi Hospital & Kenyatta National Hospital",
    "Purpose: Creating comprehensive medical Q&A dataset for AI Agent",
):
    print(banner_line)

Hospital Data Analysis System Initialized
Target: Medical information from Nairobi Hospital & Kenyatta National Hospital
Purpose: Creating comprehensive medical Q&A dataset for AI Agent


In [None]:
class HospitalDataProcessor:
    """Reference data for two Nairobi hospitals plus a templated Q&A generator.

    Attributes set by ``__init__``:
        hospitals: dict keyed by a short hospital id; each value holds the
            'name', 'website', 'phone', 'location' and 'emergency' strings.
        session: ``requests.Session`` carrying a browser-like User-Agent header.
        medical_data: empty list; not read or written by the methods defined here.
        departments: department names used to build the per-department questions.
    """

    # Letters treated as vowel sounds when choosing between "a" and "an".
    _VOWELS = frozenset('aeiou')

    def __init__(self):
        """Populate the hard-coded hospital records, HTTP session and department list."""
        self.hospitals = {
            'nairobi_hospital': {
                'name': 'Nairobi Hospital',
                'website': 'https://www.nairobihospital.org',
                'phone': '+254-20-2845000',
                'location': 'Argwings Kodhek Road, Hurlingham',
                'emergency': '24/7'
            },
            'kenyatta_national': {
                'name': 'Kenyatta National Hospital',
                'website': 'https://knh.or.ke',
                'phone': '+254-20-2726300',
                'location': 'Hospital Road, Upper Hill',
                'emergency': '24/7'
            }
        }

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        self.medical_data = []
        self.departments = [
            'Cardiology', 'Neurology', 'Oncology', 'Pediatrics', 'Orthopedics',
            'Radiology', 'Emergency Medicine', 'Maternity', 'Surgery',
            'Internal Medicine', 'Psychiatry', 'Dermatology', 'Ophthalmology'
        ]

        print("Hospital Data Processor initialized for:")
        for hospital in self.hospitals.values():
            print(f"  • {hospital['name']}: {hospital['phone']}")

    @classmethod
    def _indefinite_article(cls, phrase):
        """Return 'an' if ``phrase`` starts with a vowel letter, otherwise 'a'.

        First-letter heuristic only: it is right for every entry in
        ``self.departments``, but a word such as 'Urology' (vowel letter,
        consonant sound) would need special-casing if it were added.
        """
        return 'an' if phrase[:1].lower() in cls._VOWELS else 'a'

    def create_medical_qa_dataset(self):
        """Create the templated medical Q&A dataset.

        For every entry in ``self.hospitals`` this emits three general records
        (contact, location, emergency) followed by one record per name in
        ``self.departments``.

        Returns:
            list[dict]: records with the keys 'question', 'answer', 'category'
            and 'hospital' (the hospital's key in ``self.hospitals``).

        NOTE(review): every department answer is an unconditional "Yes" built
        from the shared department list; nothing here checks it against the
        hospitals' real service offering -- confirm before publishing.
        """
        medical_qa = []

        for hospital_key, hospital_info in self.hospitals.items():
            hospital_name = hospital_info['name']
            phone = hospital_info['phone']
            location = hospital_info['location']
            emergency = hospital_info['emergency']

            # General information about the hospital
            medical_qa.extend([
                {
                    'question': f'How do I contact {hospital_name}?',
                    'answer': f'You can contact {hospital_name} at {phone}. Located at {location}. Emergency services available {emergency}.',
                    'category': 'contact',
                    'hospital': hospital_key
                },
                {
                    'question': f'Where is {hospital_name} located?',
                    'answer': f'{hospital_name} is located at {location}, Nairobi. Easily accessible by public transport and has parking facilities.',
                    'category': 'location',
                    'hospital': hospital_key
                },
                {
                    'question': f'Does {hospital_name} have emergency services?',
                    'answer': f'Yes, {hospital_name} provides {emergency} emergency services. Call {phone} for immediate assistance.',
                    'category': 'emergency',
                    'hospital': hospital_key
                }
            ])

            # One record per department; the article is chosen per name so the
            # answer reads "an Oncology department" rather than "a Oncology department".
            for dept in self.departments:
                article = self._indefinite_article(dept)
                medical_qa.append({
                    'question': f'Does {hospital_name} have {dept.lower()} services?',
                    'answer': f'Yes, {hospital_name} has {article} {dept} department with qualified specialists and modern equipment.',
                    'category': 'departments',
                    'hospital': hospital_key
                })

        return medical_qa

Comprehensive scraper initialized and ready!


In [None]:
# EXECUTE COMPREHENSIVE SCRAPING
# NOTE(review): `scraper` is not defined in any cell visible in this notebook.
# An object exposing `scrape_comprehensive_data()` must exist before this cell
# runs; otherwise the NameError is reported by the handler at the bottom.


def clean_listings(listings):
    """Turn raw scraped listing records into a de-duplicated, annotated DataFrame.

    Steps: drop rows repeating the same ('title', 'raw_price') pair, drop rows
    whose title has 8 characters or fewer, replace missing values with
    'Unknown' in the optional text columns that are present, then add the
    'title_length', 'has_price', 'has_location' and 'has_images' columns.

    Args:
        listings: records accepted by ``pd.DataFrame`` (e.g. a list of dicts).
            Non-empty input must provide 'title', 'raw_price', 'location'
            and 'image_count'.

    Returns:
        pandas.DataFrame: the cleaned frame. An empty input yields an empty
        frame instead of a KeyError from the column-based steps.
    """
    df = pd.DataFrame(listings)
    if df.empty:
        # Nothing was scraped, so none of the expected columns exist yet.
        return df

    # Remove duplicates based on title and price
    initial_count = len(df)
    df = df.drop_duplicates(subset=['title', 'raw_price'], keep='first')
    print(f"Removed {initial_count - len(df)} duplicates")

    # Filter out listings with very short titles. The explicit copy makes the
    # column assignments below target an independent frame rather than a
    # slice of the de-duplicated one (avoids SettingWithCopyWarning).
    df = df[df['title'].str.len() > 8].copy()

    # Fill missing values in the optional text columns
    for col in ['category', 'location', 'condition', 'brand', 'keywords']:
        if col in df.columns:
            df[col] = df[col].fillna('Unknown')

    # Add additional analysis columns
    df['title_length'] = df['title'].str.len()
    df['has_price'] = df['raw_price'].str.len() > 0
    df['has_location'] = df['location'] != 'Unknown'
    df['has_images'] = df['image_count'] > 0
    return df


print("Starting comprehensive data collection for chatbot training...")
print("This may take 30-60 minutes depending on network conditions.")

try:
    # Start scraping
    start_time = datetime.now()
    listings = scraper.scrape_comprehensive_data()
    end_time = datetime.now()

    print(f"\nScraping completed in {end_time - start_time}")
    print(f"Raw data collected: {len(listings)} listings")

    # Data cleaning and enhancement
    print("\nCleaning and enhancing data...")
    df = clean_listings(listings)

    if df.empty:
        # Saving an empty file would only mislead the later cells, which pick
        # up the newest CSV matching this name pattern.
        print("\nNo usable listings were collected; nothing was saved.")
    else:
        # Save comprehensive dataset
        filename = f'jiji_comprehensive_chatbot_data_{datetime.now().strftime("%Y%m%d_%H%M")}.csv'
        df.to_csv(filename, index=False, encoding='utf-8')

        print("\nSUCCESS!")
        print(f"Final dataset: {len(df)} listings")
        print(f"Saved to: {filename}")
        print(f"File size: {os.path.getsize(filename) / (1024*1024):.2f} MB")

        # Basic statistics
        print("\nDATASET STATISTICS:")
        print(f"Categories covered: {df['category'].nunique()}")
        print(f"Locations covered: {df['location'].nunique()}")
        print(f"Listings with prices: {df['has_price'].sum()}")
        print(f"Listings with locations: {df['has_location'].sum()}")
        print(f"Listings with images: {df['has_images'].sum()}")

except Exception as e:
    # Best-effort cell: report the failure with its traceback, do not abort the run.
    print(f"Error during scraping: {e}")
    traceback.print_exc()

Starting comprehensive data collection for chatbot training...
This may take 30-60 minutes depending on network conditions.
Starting comprehensive scraping across 26 categories...

[1/26] Processing category: main
  Page 1: Found 602 elements with selector 'div[class*="advert"]'
  Page 2: Found 602 elements with selector 'div[class*="advert"]'
  Page 3: Found 602 elements with selector 'div[class*="advert"]'
  Page 4: Found 602 elements with selector 'div[class*="advert"]'
  Page 5: Found 602 elements with selector 'div[class*="advert"]'
  Progress: 0 total listings collected
  Page 6: Found 602 elements with selector 'div[class*="advert"]'
  Page 7: Found 602 elements with selector 'div[class*="advert"]'
  Page 8: Found 602 elements with selector 'div[class*="advert"]'
  Page 9: Found 602 elements with selector 'div[class*="advert"]'
  Page 10: Found 602 elements with selector 'div[class*="advert"]'
  Progress: 0 total listings collected
  Page 11: Found 602 elements with selector 'di

In [None]:
# DATASET ANALYSIS AND VERIFICATION


def count_non_empty(series):
    """Count values that are neither missing nor blank after stripping whitespace.

    Missing values are dropped before the string conversion; converting first
    would turn NaN into the text 'nan' and count it as content.
    """
    return int(series.dropna().astype(str).str.strip().ne('').sum())


def top_values(frame, column, n=10, drop_unknown=False):
    """Return the most frequent values of ``column`` as a value-counts Series.

    Args:
        frame: DataFrame to inspect.
        column: column name; it may be absent from ``frame``.
        n: number of entries to keep, or None for the full distribution.
        drop_unknown: when True, rows equal to 'Unknown' are removed before counting.

    Returns:
        pandas.Series of counts, or None when ``column`` is not in ``frame``.
    """
    if column not in frame.columns:
        return None
    values = frame[column]
    if drop_unknown:
        values = values[values != 'Unknown']
    counts = values.value_counts()
    return counts if n is None else counts.head(n)


try:
    # Load the most recent dataset (newest by os.path.getctime)
    csv_files = glob.glob('jiji_comprehensive_chatbot_data_*.csv')
    if csv_files:
        latest_file = max(csv_files, key=os.path.getctime)
        df = pd.read_csv(latest_file)

        print(f"Analyzing dataset: {latest_file}")
        print(f"Dataset shape: {df.shape}")

        # Column analysis
        print("\nCOLUMN ANALYSIS:")
        for col in df.columns:
            non_null_count = df[col].notna().sum()
            non_empty_count = count_non_empty(df[col])
            print(f"{col}: {non_null_count} non-null, {non_empty_count} non-empty")

        # Distribution reports. Columns missing from the CSV are reported and
        # skipped so one absent field does not abort the whole analysis.
        reports = (
            ("TOP CATEGORIES", 'category', 10, False),
            ("TOP LOCATIONS", 'location', 10, False),
            ("PRICE RANGE DISTRIBUTION", 'price_range', None, False),
            ("TOP BRANDS", 'brand', 10, True),
        )
        for heading, column, limit, drop_unknown in reports:
            print(f"\n{heading}:")
            counts = top_values(df, column, limit, drop_unknown)
            if counts is None:
                print(f"(column '{column}' not present in dataset)")
            else:
                print(counts)

        # Sample data, restricted to the columns that actually exist
        print("\nSAMPLE DATA:")
        wanted_columns = ['title', 'category', 'raw_price', 'location', 'condition', 'brand']
        sample_columns = [name for name in wanted_columns if name in df.columns]
        sample_df = df[sample_columns].head(5)
        print(sample_df.to_string(index=False))

        print("\nDataset is ready for chatbot training!")
        print("This comprehensive dataset includes detailed product information")
        print("   suitable for answering questions about products, prices, locations, and specifications.")

    else:
        print("No comprehensive dataset found. Please run the scraping cell first.")

except Exception as e:
    # Best-effort cell: report the failure with its traceback, as the other cells do.
    print(f"Error analyzing dataset: {e}")
    traceback.print_exc()

In [None]:
# CREATE SAMPLE QUESTIONS AND ANSWERS FOR CHATBOT TRAINING


def build_qa_pairs(df):
    """Build templated question/answer records from a listings DataFrame.

    For each configured column, the most frequent values are taken and each
    one becomes a record whose answer lists up to three matching titles.
    Values equal to 'Unknown' never produce a record. Columns missing from
    ``df`` are skipped, and a frame without a 'title' column yields no records.

    Args:
        df: listings frame; uses 'title' plus any of 'price_range',
            'location', 'category', 'brand' and 'condition' that are present.

    Returns:
        list[dict]: records with the keys 'question', 'answer' and 'category',
        ordered pricing, location, product_info, brand_inquiry, condition_filter.
    """
    # (column, top_n, drop 'Unknown' before ranking, question, answer, record category)
    # For 'brand' the 'Unknown' rows are removed before the top-N cut; for the
    # other columns the cut happens first and 'Unknown' is skipped afterwards,
    # so those may yield fewer than top_n records.
    specs = [
        ('price_range', 5, False,
         "What items are available in the {value} price range?",
         "Items in the {value} range include: {items}",
         'pricing'),
        ('location', 5, False,
         "What products are available in {value}?",
         "Products available in {value} include: {items}",
         'location'),
        ('category', 5, False,
         "Tell me about {value} products on Jiji Kenya",
         "Popular {value} items include: {items}. Average price range varies based on condition and brand.",
         'product_info'),
        ('brand', 3, True,
         "What {value} products are available?",
         "Available {value} products include: {items}",
         'brand_inquiry'),
        ('condition', 3, False,
         "Show me {value_lower} items",
         "{value} items available: {items}",
         'condition_filter'),
    ]

    qa_pairs = []
    if 'title' not in df.columns:
        return qa_pairs

    for column, top_n, prefilter, question_tpl, answer_tpl, qa_category in specs:
        if column not in df.columns:
            continue
        values = df[column]
        ranked = values[values != 'Unknown'] if prefilter else values
        for value in ranked.value_counts().head(top_n).index:
            if value == 'Unknown':
                continue
            # Titles can come back from CSV as numbers or NaN; normalise to
            # text so the join cannot fail.
            titles = df.loc[values == value, 'title'].head(3).dropna().astype(str)
            label = str(value)
            qa_pairs.append({
                'question': question_tpl.format(value=label, value_lower=label.lower()),
                'answer': answer_tpl.format(value=label, items=', '.join(titles)),
                'category': qa_category,
            })
    return qa_pairs


try:
    # Load the most recent dataset (newest by os.path.getctime)
    csv_files = glob.glob('jiji_comprehensive_chatbot_data_*.csv')
    if csv_files:
        latest_file = max(csv_files, key=os.path.getctime)
        df = pd.read_csv(latest_file)

        print("Generating sample Q&A pairs for chatbot training...")
        qa_pairs = build_qa_pairs(df)

        # Save Q&A pairs
        qa_df = pd.DataFrame(qa_pairs)
        qa_filename = f'jiji_chatbot_qa_pairs_{datetime.now().strftime("%Y%m%d_%H%M")}.csv'
        qa_df.to_csv(qa_filename, index=False, encoding='utf-8')

        print(f"\nGenerated {len(qa_pairs)} Q&A pairs")
        print(f"Saved to: {qa_filename}")

        # Display sample Q&A pairs
        print("\nSAMPLE Q&A PAIRS:")
        for i, qa in enumerate(qa_pairs[:5]):
            print(f"\n{i+1}. Q: {qa['question']}")
            print(f"   A: {qa['answer'][:100]}...")
            print(f"   Category: {qa['category']}")

        print("\nReady for chatbot implementation!")
        print("You now have comprehensive product data and sample Q&A pairs.")

    else:
        print("No dataset found. Please run the scraping cell first.")

except Exception as e:
    # Best-effort cell: report the failure with its traceback, do not abort the run.
    print(f"Error generating Q&A pairs: {e}")
    traceback.print_exc()