In [1]:
import pandas as pd

In [2]:
csv_path = '/Users/Jiwon/Documents/GitHub/advanced_project/jiwon_project/1.preprocessing/entire_remove_outlier.csv'    # 여기에 absolute path
# CSV 읽기
df = pd.read_csv(
    csv_path,
    header=0,        # 첫 줄을 컬럼명으로 사용,  # 인덱스 컬럼으로 id 지정
    encoding='utf-8-sig'
)

In [3]:
import ast

# 1) host_verifications 리스트 문자열 → 실제 파이썬 리스트로 변환
def parse_verifications(x):
    try:
        # 문자열 "[...]" → list([...])
        lst = ast.literal_eval(x)
        return lst if isinstance(lst, list) else []
    except:
        return []

df['host_verifications_list'] = df['host_verifications'].apply(parse_verifications)

# 2) 리스트 길이(검증 개수)를 숫자로 저장
df['host_verifications_count'] = df['host_verifications_list'].apply(len)

In [4]:
import time
import json
import pandas as pd
import requests

import geopandas as gpd
from shapely.geometry import Point

# ─────────────────────────────────────────────────────────────────────────────
# 0) 원본 df, poi_tags, 그리고 bbox 계산
  # latitude, longitude 칼럼이 있어야 함

poi_tags = {
    'transport': {
        'amenity': ['bus_station','taxi'],
        'railway': ['station']
    },
    'infrastructure': {
        'amenity': ['police','hospital','pharmacy','restaurant','supermarket']
    },
    'tourism': {
        'tourism': ['viewpoint','museum','attraction'],
        'leisure': ['park']
    }
}

pad = 0.01
minx, maxx = df.longitude.min()-pad, df.longitude.max()+pad
miny, maxy = df.latitude.min()-pad, df.latitude.max()+pad

# ─────────────────────────────────────────────────────────────────────────────
# 1) 한 번에 bbox 내 모든 POI 내려받기 (Overpass bbox 쿼리)
OVERPASS_URL = "http://overpass-api.de/api/interpreter"
# build filters for bbox query
filters = ""
for grp in poi_tags.values():
    for key, vals in grp.items():
        for v in vals:
            filters += f'node["{key}"="{v}"]({miny},{minx},{maxy},{maxx});\n'

# full query
query = f"""
[out:json][timeout:180];
(
{filters}
);
out body;
"""

resp = requests.post(OVERPASS_URL, data={'data': query}, timeout=(5,300))
resp.raise_for_status()
data = resp.json().get('elements', [])

# ─────────────────────────────────────────────────────────────────────────────
# 2) GeoDataFrame 생성
pois = pd.DataFrame([
    {
      'lon': el['lon'],
      'lat': el['lat'],
      **el.get('tags',{})
    }
    for el in data
    if el['type']=='node' and 'lon' in el
])
gdf_pois = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(pois.lon, pois.lat),
    crs="EPSG:4326"
).to_crs(epsg=3857)

# 원본 좌표도 GeoDataFrame
gdf_pts = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.longitude, df.latitude),
    crs="EPSG:4326"
).to_crs(epsg=3857)

sindex = gdf_pois.sindex

# ─────────────────────────────────────────────────────────────────────────────
# 3) 그룹별 카운트 함수
def count_group(pt, grp_map, radius=1000):
    buf = pt.buffer(radius)
    candidates = gdf_pois.iloc[list(sindex.intersection(buf.bounds))]
    cnt = 0
    for key, vals in grp_map.items():
        cnt += candidates[candidates[key].isin(vals)].shape[0]
    return cnt

# 4) 각 포인트별 count, df에 붙이기
for grp, tags in poi_tags.items():
    df[f"{grp}_count"] = [
        count_group(pt, tags, radius=1000)
        for pt in gdf_pts.geometry
    ]

# 5) 결과 확인
print(df[['transport_count','infrastructure_count','tourism_count']].head())


   transport_count  infrastructure_count  tourism_count
0                7                    77              1
1                3                    79              4
2                4                    34              2
3                7                   117              1
4               15                   335              3


In [5]:
from sklearn.decomposition import PCA

poi_cols = ['transport_count','infrastructure_count','tourism_count']
pca = PCA(n_components=1)

# PCA fit → PC1 점수 생성
df['poi_pca1'] = pca.fit_transform(df[poi_cols].fillna(0))

# 설명 분산 비율 확인 (얼마나 데이터의 변동성을 담았는지)
print("Explained variance ratio (PC1):", pca.explained_variance_ratio_[0])

#poi_pca1 <0 poi 희박 지역, poi_pca1 > 0 poi 밀집지역

Explained variance ratio (PC1): 0.9982082112260977


In [6]:
df

Unnamed: 0,source,name,description,neighborhood_overview,host_id,host_name,host_since,host_location,host_about,host_response_time,...,room_type_Shared room,structure_encoded,log_price,room_new_type,host_verifications_list,host_verifications_count,transport_count,infrastructure_count,tourism_count,poi_pca1
0,city scrape,Lg Rm in Historic Prospect Heights,Cozy space share in the heart of a great neigh...,Full of tree-lined streets and beautiful brown...,62165,Michael,2009,"New York, NY",I’m an urban planner working for an internatio...,,...,False,23,5.303305,mid,"[email, phone, work_email]",3,7,77,1,-37.863477
1,city scrape,"1 Bedroom & your own Bathroom, Elevator Apartment",Private bedroom with your own bathroom in a 2 ...,"Manhattan, SE corner of 2nd Ave/ E. 110th street",157798,Irene,2010,"New York, NY",,,...,False,8,4.418841,upper-mid,"[email, phone]",2,3,79,4,-35.914362
2,city scrape,Spectacular West Harlem Garden Apt,This is a very large and unique space. An inc...,West Harlem is now packed with great restauran...,166532,Matthew,2010,"New York, NY",I have been a New Yorker for a long time\n and...,within an hour,...,False,14,4.941642,low-mid,"[email, phone]",2,4,34,2,-80.892434
3,city scrape,“Work-from-home” from OUR home.,*Monthly Discount will automatically apply <br...,THE NEIGHBORHOOD:<br />Our apartment is locate...,168525,Gustavo,2010,"New York, NY",I am a music producer. And my wife is a hair s...,within an hour,...,False,23,4.875197,mid,"[email, phone]",2,7,117,1,2.094658
4,city scrape,1 br in a 2 br apt (Midtown West),,,169927,Hubert,2010,"Saint-Aubin-sur-Scie, France","Facebook Likes:\r\nNew York French Geek, David...",,...,False,23,4.941642,mid,"[email, phone]",2,15,335,3,220.203233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20555,city scrape,Beautiful 1-Bed Apt in Harlem!,Welcome to your temporary home in the vibrant ...,,40019013,Cecilia,2015,"New York, NY",,,...,False,23,4.290459,mid,"[email, phone]",2,4,58,4,-56.857764
20556,city scrape,Private Room w/ Ensuite Bath H,Stylish Private Rooms w/ En-Suite Baths in Bro...,,483056418,Kristina,2022,"New York, NY",Are you tired of the tedious and time-consumin...,within an hour,...,False,23,4.077537,mid,"[email, phone]",2,4,27,1,-87.915003
20557,city scrape,2 Bedroom on East Side,Located in the Murray Hill area and occupies a...,,30283594,Global Luxury Suites,2015,"Hawthorne, NJ",,within an hour,...,False,23,5.703782,mid,"[email, phone, work_email]",3,8,276,6,161.112336
20558,city scrape,Stylish 2Bd near Bryant Park,Enjoy everything the city has to offer while l...,,407304997,Boomerang,2021,"New York, NY","At Boomerang, we believe in the idea ""you get ...",within an hour,...,False,23,5.303305,mid,"[email, phone]",2,19,340,9,225.515844


In [8]:
df.to_csv('add_poi.csv', index=False)