In [2]:
import pandas as pd

# Step 1: Read the combined POI export.
# low_memory=False turns off chunked parsing so each column's dtype is
# inferred from the whole file rather than piece by piece.
df = pd.read_csv("all_pois_combined(13places_around).csv", low_memory=False)

# Step 2: Restrict the frame to identifier, location and category columns.
columns_to_keep = [
    "location_name",
    "id",
    "type",
    "lat",
    "lon",
    "name",
    "amenity",
    "tourism",
    "leisure",
    "shop",
]
df_cleaned = df.loc[:, columns_to_keep].copy()

# Step 3: Discard POIs for which all four category columns are null.
has_any_category = (
    df_cleaned[["amenity", "tourism", "leisure", "shop"]].notna().any(axis=1)
)
df_cleaned = df_cleaned.loc[has_any_category]

# Step 4: Create unified 'category' field
def merge_category(row):
    """Return the first non-null category value found in a POI row.

    The columns are consulted in priority order: ``amenity``, ``tourism``,
    ``leisure``, ``shop``. ``row`` may be anything indexable by those keys
    (e.g. a pandas Series produced by ``DataFrame.apply(axis=1)``).

    Returns the string "unknown" when all four values are null.
    """
    priority = ("amenity", "tourism", "leisure", "shop")
    # The generator is lazy, so lookup stops at the first non-null value.
    return next(
        (row[key] for key in priority if pd.notnull(row[key])),
        "unknown",
    )

# Step 4 (cont.): Build the unified 'category' column.
# Coalesce the four category columns in the same priority order that
# merge_category walks (amenity, tourism, leisure, shop), but with vectorized
# Series operations instead of a row-wise DataFrame.apply(axis=1). Besides
# avoiding a Python-level call per row, this also works on an empty frame,
# where apply(axis=1) hands back a DataFrame that cannot be assigned to a
# single column.
category_priority = ["amenity", "tourism", "leisure", "shop"]
unified_category = df_cleaned[category_priority[0]]
for fallback_col in category_priority[1:]:
    # Fill only the positions that are still null with the next column.
    unified_category = unified_category.combine_first(df_cleaned[fallback_col])
# Cast to object so the .str accessor in Step 7 keeps working even when the
# result is empty and every source column was parsed as all-NaN floats.
df_cleaned["category"] = unified_category.fillna("unknown").astype(object)

# Step 5: Drop rows with missing name or coordinates
df_cleaned = df_cleaned.dropna(subset=["name", "lat", "lon"])

# Step 6: Drop duplicate rows (same name at the same coordinates); the first
# occurrence is kept, which is the pandas default.
df_cleaned = df_cleaned.drop_duplicates(subset=["name", "lat", "lon"])

# Step 7: Normalize the 'category' field (lowercase, underscores).
# regex=False makes the space-to-underscore replacement an explicit literal
# substitution, independent of the pandas version's default for `regex`.
df_cleaned["category"] = (
    df_cleaned["category"].str.lower().str.replace(" ", "_", regex=False)
)

# Step 8: Flag tourist-oriented POIs with a 0/1 'is_tourist_spot' column.
# A POI gets 1 when its normalized category contains any of the keywords
# below as a substring, otherwise 0.
# NOTE(review): 'category' is filled only from amenity/tourism/leisure/shop,
# so keywords such as "historic" or "heritage" can match only if they occur
# in values of those four columns -- confirm this coverage is intended.
tourist_keywords = [
    "museum",
    "artwork",
    "attraction",
    "viewpoint",
    "zoo",
    "theme_park",
    "aquarium",
    "memorial",
    "castle",
    "monument",
    "historic",
    "heritage",
]
df_cleaned["is_tourist_spot"] = [
    int(any(term in category_value for term in tourist_keywords))
    for category_value in df_cleaned["category"]
]

# Optional Step 9: Discard rows whose category is the "unknown" placeholder.
is_known = df_cleaned["category"].ne("unknown")
df_cleaned = df_cleaned.loc[is_known].copy()

# Step 10: Renumber the rows from 0 and show the most frequent categories.
df_cleaned = df_cleaned.reset_index(drop=True)
top_categories = df_cleaned["category"].value_counts().head(20)
print("Top 20 categories:\n", top_categories)

# Step 11: Write the cleaned dataset to CSV without the index column.
df_cleaned.to_csv("cleaned_pois_for_ml_ready.csv", index=False)
print("✅ Cleaned and enriched dataset saved as cleaned_pois_for_ml_ready.csv")


Top 20 categories:
 restaurant          567
fast_food           285
cafe                245
hotel               201
clothes             182
gallery             117
bank                115
bicycle_rental       90
bar                  72
jewelry              70
artwork              70
theatre              60
pub                  58
gift                 57
place_of_worship     46
shoes                41
attraction           38
convenience          37
beauty               32
hairdresser          31
Name: category, dtype: int64
✅ Cleaned and enriched dataset saved as cleaned_pois_for_ml_ready.csv
