In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon, Point
import geopy.distance
import geocoder
from tqdm import tqdm


In [2]:
# Read the train and test splits, using the first CSV column as the index.
# Training rows containing any missing value are dropped; the test split is
# kept exactly as stored.
df_train = (
    pd.read_csv('train.csv', index_col=[0])
    .dropna()
)
df_test = pd.read_csv(
    'test.csv',
    index_col=[0],
)

In [103]:
# Disabled variant that rebuilt the combined frame from the two splits:
# concat_df = pd.concat([df_train,df_test])
# The combined frame is read from a saved CSV instead.
# NOTE(review): the code that writes 'concat_df.csv' is not visible in this part
# of the notebook; the file carries two leftover index columns
# ('Unnamed: 0.1', 'Unnamed: 0') from earlier saves — confirm its provenance.
concat_df = pd.read_csv('concat_df.csv')
# Preview the first rows.
concat_df.head()

Unnamed: 0.1,Unnamed: 0,id,atm_group,address,address_rus,lat,long,target
0,0,8526.0,32.0,"EMELYANOVA,34 Y-SAKHALINSK","улица А.О. Емельянова, 34, Южно-Сахалинск, Сах...",46.940995,142.738319,0.0115
1,1,8532.0,32.0,"KOMSOMOLSKAYA,259B Y.SAKHALINSK","Комсомольская улица, 259, Южно-Сахалинск, Саха...",46.937353,142.753348,0.02971
2,2,8533.0,32.0,"KOMMUN. PR., 32 YUZHNO SAKHAL","Коммунистический проспект, Южно-Сахалинск, Сах...",46.959413,142.741113,0.00954
3,3,8684.0,32.0,"LENINGRADSKIY PR.,76A MOSCOW","Ленинградский проспект, 76А, Москва, Россия, 1...",55.805827,37.515146,-0.094035
4,4,37.0,32.0,"GVARDEYSKAYA PL., 2 NORILSK","Гвардейская площадь, 2, Норильск, Красноярский...",69.343541,88.211228,0.079277


In [6]:
# Single table pairing each human-readable feature name with its OSM tag
# filter. Every filter is a one-entry dict: {OSM key: [accepted values]}.
# Keeping name and filter side by side guarantees the two lists derived
# below stay index-aligned.
_TAG_SPECS = [
    ('subway', {'subway': ['yes']}),
    ('bus_stop', {'public_transport': ['stop_position']}),
    ('gov_building', {'government': ['administrative']}),
    ('college', {'amenity': ['college', 'university']}),
    ('school', {'amenity': ['school']}),
    ('kgarden', {'amenity': ['kindergarten', 'childcare']}),
    ('office', {'office': ['company']}),
    ('food', {'amenity': ['cafe', 'fast_food', 'bar', 'pub', 'canteen']}),
]

# List of OSM tags
tag_list = [tag_filter for _tag_name, tag_filter in _TAG_SPECS]

# Human-readable tag names (same order as tag_list)
tag_names = [tag_name for tag_name, _tag_filter in _TAG_SPECS]



In [76]:
# NOTE(review): these imports sit mid-notebook rather than in the first import
# cell; `time` is needed by query() for its retry pause and `overpy` for the
# client created here.
import overpy
import time
# Overpass API client; the same instance is passed to every process_query() call.
api = overpy.Overpass()

def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def query(api, radius, lat, lon, retries=1, retry_delay=30):
    """Fetch the OSM nodes of interest around a point from the Overpass API.

    The request is a union of eight node filters (subway, public-transport
    stop positions, administrative government, college/university, school,
    kindergarten/childcare, company offices, food places), each restricted
    with ``around:{radius},{lat}, {lon}``. The filters mirror the entries of
    the module-level ``tag_list``.

    Parameters
    ----------
    api : object exposing ``query(str)`` (an ``overpy.Overpass`` client in
        this notebook).
    radius : search radius substituted into the ``around`` filter.
    lat, lon : coordinates of the centre point.
    retries : how many extra attempts to make after a failed request.
        Defaults to 1, i.e. one retry, as before.
    retry_delay : seconds to sleep before each retry. Defaults to 30.

    Returns
    -------
    Whatever ``api.query`` returns for the request.

    Raises
    ------
    Exception
        The error of the last attempt is re-raised once all attempts failed.
    """
    # Build the request text once so the first attempt and every retry are
    # guaranteed to send exactly the same query.
    overpass_ql = """
    (
        node["subway"="yes"](around:{radius},{lat}, {lon});
        node["public_transport"="stop_position"](around:{radius},{lat}, {lon});
        node["government"="administrative"](around:{radius},{lat}, {lon});
        node["amenity"~"^(college|university)$"](around:{radius},{lat}, {lon});
        node["amenity"="school"](around:{radius},{lat}, {lon});
        node["amenity"~"^(kindergarten|childcare)$"](around:{radius},{lat}, {lon});
        node["office"="company"](around:{radius},{lat}, {lon});
        node["amenity"~"^(cafe|fast_food|bar|pub|canteen)$"](around:{radius},{lat}, {lon});
    );
    out body;
            """.format(radius=radius, lat=lat, lon=lon)

    last_attempt = max(retries, 0)
    for attempt in range(last_attempt + 1):
        try:
            return api.query(overpass_ql)
        except Exception:
            # Deliberately broad: any failure of the request is treated as
            # transient and retried after a pause; the final failure is
            # propagated to the caller unchanged.
            if attempt == last_attempt:
                raise
            time.sleep(retry_delay)

# res = query(api,10000,55.805827,37.515146)

def find_tag(node):
    """Map an OSM node to the human-readable name of the first matching filter.

    The entries of the module-level ``tag_list`` are checked in order. An
    entry matches when its (first) OSM key is present in ``node.tags`` and
    the node's value for that key is one of the entry's accepted values.
    The name at the same position in ``tag_names`` is returned.

    Raises
    ------
    ValueError
        If no entry of ``tag_list`` matches the node.
    """
    for position, spec in enumerate(tag_list):
        osm_key = list(spec)[0]
        if osm_key not in node.tags:
            continue
        if node.tags[osm_key] in spec[osm_key]:
            return tag_names[position]
    raise ValueError('tag not found ',node.tags)

def process_query(idx, api, lat, lon, radius=10000):
    """Build the neighbourhood feature row for one location.

    Queries Overpass for the nodes of interest within ``radius`` of
    ``(lat, lon)`` and, for every name in ``tag_names``, derives:

    * ``<name>_min_dist`` - geodesic distance (geopy ``.m``) to the nearest
      matching node, or ``radius`` when no such node was returned;
    * ``<name>_250m`` - number of matching nodes with distance <= 250;
    * ``<name>_500m`` - number of matching nodes with distance <= 500.

    Parameters
    ----------
    idx : identifier stored under the ``'id'`` key of the result.
    api : Overpass client forwarded to ``query``.
    lat, lon : coordinates of the location.
    radius : search radius passed to ``query``; it doubles as the
        "nothing found" value of the ``_min_dist`` fields so the two can
        never disagree. Defaults to 10000, the previously hard-coded value.

    Returns
    -------
    dict
        The feature fields for every tag name followed by ``'id'``.
    """
    res = query(api, radius, lat, lon)

    # Start from "nothing found": zero counts and min distance == radius.
    row = {}
    for name in tag_names:
        row[name + '_min_dist'] = radius
        row[name + '_250m'] = 0
        row[name + '_500m'] = 0
    row['id'] = idx

    origin = (lat, lon)  # loop-invariant, so built once
    for node in res.nodes:
        try:
            tag = find_tag(node)
        except ValueError:
            # Best effort: report the node that matches none of the known
            # tags and carry on with the remaining nodes.
            print(node.tags)
            continue

        distance = geopy.distance.geodesic(origin, (node.lat, node.lon)).m
        if distance <= 250:
            row[tag + '_250m'] += 1
        if distance <= 500:
            row[tag + '_500m'] += 1
        if distance < row[tag + '_min_dist']:
            row[tag + '_min_dist'] = distance

    return row

In [77]:
# Smoke test on a single location: the id is passed as a string here and the
# coordinates are given literally.
process_query(
    idx='8684',
    api=api,
    lat=55.805827,
    lon=37.515146,
)

{'subway_min_dist': 59.371791355662175,
 'subway_250m': 3,
 'subway_500m': 3,
 'bus_stop_min_dist': 60.155842572180674,
 'bus_stop_250m': 2,
 'bus_stop_500m': 12,
 'gov_building_min_dist': 3743.08564566892,
 'gov_building_250m': 0,
 'gov_building_500m': 0,
 'college_min_dist': 1280.980381544977,
 'college_250m': 0,
 'college_500m': 0,
 'school_min_dist': 841.1426736863608,
 'school_250m': 0,
 'school_500m': 0,
 'kgarden_min_dist': 105.7005905391319,
 'kgarden_250m': 2,
 'kgarden_500m': 3,
 'office_min_dist': 112.12425770079038,
 'office_250m': 2,
 'office_500m': 9,
 'food_min_dist': 5.6248526530663385,
 'food_250m': 12,
 'food_500m': 36,
 'id': '8684'}

In [104]:
# Accumulator of feature dicts, one per processed location. It is created only
# when missing, so the cell works on a fresh kernel and an interrupted run can
# still be resumed by re-executing the cell without losing earlier results.
if 'rows' not in globals():
    rows = []

# Ids already present in `rows`, kept as a set so the "already processed?"
# check does not rebuild and scan a list for every row of concat_df.
processed_ids = {processed['id'] for processed in rows}

for _index, atm in tqdm(concat_df.iterrows(), total=concat_df.shape[0]):
    atm_id = atm['id']
    if atm_id in processed_ids:
        continue
    # One Overpass request per location that has not been processed yet.
    rows.append(process_query(atm_id, api, atm['lat'], atm['long']))
    processed_ids.add(atm_id)

100%|███████████████████████████████████████| 8461/8461 [32:05<00:00,  4.39it/s]


In [105]:
# Turn the list of feature dicts into a DataFrame, one row per dict.
adj_df = pd.DataFrame(rows)
# Persist the features. The index is written too (to_csv default), so it comes
# back as an extra unnamed column unless the reader passes index_col.
adj_df.to_csv('adj_df.csv')
# Preview the first rows.
adj_df.head()

Unnamed: 0,subway_min_dist,subway_250m,subway_500m,bus_stop_min_dist,bus_stop_250m,bus_stop_500m,gov_building_min_dist,gov_building_250m,gov_building_500m,college_min_dist,...,kgarden_min_dist,kgarden_250m,kgarden_500m,office_min_dist,office_250m,office_500m,food_min_dist,food_250m,food_500m,id
0,10000.0,0,0,829.198383,0,0,10000.0,0,0,10000.0,...,1254.875056,0,0,1086.833836,0,0,688.469156,0,0,8526.0
1,10000.0,0,0,1376.218766,0,0,10000.0,0,0,10000.0,...,79.966472,1,1,732.490315,0,0,197.747969,1,1,8532.0
2,10000.0,0,0,1203.57463,0,0,10000.0,0,0,10000.0,...,850.505033,0,0,526.584497,0,0,174.983219,2,7,8533.0
3,59.371791,3,3,60.155843,2,12,3743.085646,0,0,1280.980382,...,105.700591,2,3,112.124258,2,9,5.624853,12,36,8684.0
4,10000.0,0,0,57.913245,7,14,10000.0,0,0,1279.238327,...,1093.540862,0,0,119.987527,4,5,103.082008,3,7,37.0


In [106]:
# Display the full frame; pandas abbreviates the middle rows and columns.
adj_df

Unnamed: 0,subway_min_dist,subway_250m,subway_500m,bus_stop_min_dist,bus_stop_250m,bus_stop_500m,gov_building_min_dist,gov_building_250m,gov_building_500m,college_min_dist,...,kgarden_min_dist,kgarden_250m,kgarden_500m,office_min_dist,office_250m,office_500m,food_min_dist,food_250m,food_500m,id
0,10000.000000,0,0,829.198383,0,0,10000.000000,0,0,10000.000000,...,1254.875056,0,0,1086.833836,0,0,688.469156,0,0,8526.0
1,10000.000000,0,0,1376.218766,0,0,10000.000000,0,0,10000.000000,...,79.966472,1,1,732.490315,0,0,197.747969,1,1,8532.0
2,10000.000000,0,0,1203.574630,0,0,10000.000000,0,0,10000.000000,...,850.505033,0,0,526.584497,0,0,174.983219,2,7,8533.0
3,59.371791,3,3,60.155843,2,12,3743.085646,0,0,1280.980382,...,105.700591,2,3,112.124258,2,9,5.624853,12,36,8684.0
4,10000.000000,0,0,57.913245,7,14,10000.000000,0,0,1279.238327,...,1093.540862,0,0,119.987527,4,5,103.082008,3,7,37.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8456,10000.000000,0,0,363.699181,0,3,6928.489571,0,0,1979.172678,...,2508.324696,0,0,662.235480,0,0,635.152036,0,0,3199.0
8457,10000.000000,0,0,666.129410,0,0,6504.445618,0,0,4452.938422,...,3101.956893,0,0,1127.837214,0,0,2899.964963,0,0,5354.0
8458,10000.000000,0,0,31.505167,8,9,3148.939027,0,0,803.748157,...,494.980690,0,1,273.810838,0,1,106.815903,4,7,3333.0
8459,10000.000000,0,0,822.801779,0,0,10000.000000,0,0,3539.252622,...,3486.211164,0,0,169.837228,1,2,600.168287,0,0,2247.0


In [102]:
# Ids that already have a feature row in adj_df although their latitude is
# missing in concat_df. The selection does not depend on the row being tested,
# so it is computed once up front instead of once per element of `rows`.
ids_without_coords = set(
    adj_df[adj_df.id.isin(concat_df[concat_df.lat.isna()].id)].id.to_list()
)

# Drop those entries from `rows`; the collection loop skips only ids that are
# still present in `rows`, so the dropped ones are queried again on its next run.
rows = [kept for kept in rows if kept['id'] not in ids_without_coords]



[{'subway_min_dist': 10000,
  'subway_250m': 0,
  'subway_500m': 0,
  'bus_stop_min_dist': 829.1983827178741,
  'bus_stop_250m': 0,
  'bus_stop_500m': 0,
  'gov_building_min_dist': 10000,
  'gov_building_250m': 0,
  'gov_building_500m': 0,
  'college_min_dist': 10000,
  'college_250m': 0,
  'college_500m': 0,
  'school_min_dist': 533.866601995004,
  'school_250m': 0,
  'school_500m': 0,
  'kgarden_min_dist': 1254.8750563032816,
  'kgarden_250m': 0,
  'kgarden_500m': 0,
  'office_min_dist': 1086.833836204933,
  'office_250m': 0,
  'office_500m': 0,
  'food_min_dist': 688.4691556573313,
  'food_250m': 0,
  'food_500m': 0,
  'id': 8526.0},
 {'subway_min_dist': 10000,
  'subway_250m': 0,
  'subway_500m': 0,
  'bus_stop_min_dist': 1376.2187659440651,
  'bus_stop_250m': 0,
  'bus_stop_500m': 0,
  'gov_building_min_dist': 10000,
  'gov_building_250m': 0,
  'gov_building_500m': 0,
  'college_min_dist': 10000,
  'college_250m': 0,
  'college_500m': 0,
  'school_min_dist': 1359.2987334757202,
  