In [None]:
import numpy as np
import pandas as pd
import json
import pickle
import category_encoders as ce

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Use matplotlib's built-in 'default' style sheet for all figures.
plt.style.use('default')

In [None]:
# Make the sibling ``shared_libs`` directory importable: os.path.abspath('')
# resolves to the current working directory, so the entry appended to
# sys.path is <cwd>/../shared_libs.
import sys, os
sys.path.append(os.path.join(os.path.abspath(''), '..', 'shared_libs'))
# presumably the project-local helper module that lives in shared_libs — confirm
import data_transform

In [None]:
# Read the input dataset (the path is relative to the notebook's working
# directory) and preview its first rows.
source_csv = 'data/data_target_cleared.csv'
df = pd.read_csv(source_csv)
df.head()

In [None]:
# Run the baseline cleaning step from the project-local data_transform module.
# NOTE(review): the meaning of the arguments is defined in data_transform;
# judging by their names, rows may be dropped and the cached default values
# stored at the given path are rebuilt — confirm against the helper.
baseline_options = {
    'can_drop_rows': True,
    'force_rebuild_cached_data': True,
}
df = data_transform.clear_data_base_line(
    df,
    '../shared_libs/data/default_values.pkl',
    **baseline_options
)

## Кэш-словарь с геоданными о городах США
Данные получены через сервис OpenStreetMap

Ключом словаря служит пара \<state\>, \<city\> (название города приводится к нижнему регистру).

По каждому городу сохраняется следующая информация:
* *type* - тип населенного пункта
* *importance* - важность (по мнению OSM)
* *boundingbox* - габаритный бокс населенного пункта
* *lat* - широта центра города
* *lng* - долгота центра города

In [None]:
from geopy.geocoders import Nominatim

# Geocoder client used by the city lookup cells below via
# geolocator.geocode(...).
# NOTE(review): 'myapplication' is a generic user agent; the Nominatim usage
# policy presumably expects an application-specific identifier — confirm.
geolocator = Nominatim(user_agent='myapplication')

In [None]:
# Normalise city names to lower case. The vectorised .str accessor keeps
# missing values as NaN, whereas str.lower(x) raises TypeError as soon as a
# single value is not a string (e.g. a NaN city).
df['city'] = df['city'].str.lower()

In [1]:
# In-memory geodata cache for cities; it is filled by the geocoding cells below.
cities_dict = dict()

In [None]:
# Geocode every distinct (state, city) pair of the dataset that is not yet in
# cities_dict, then persist the whole cache to disk.
cities_list = df.groupby(['state', 'city'])['target'].mean().index

for state_name, city_name in cities_list:
    key = data_transform.get_city_dict_key(state_name, city_name)
    if key in cities_dict:
        continue  # already geocoded

    try:
        location = geolocator.geocode(key)
        if location is None:
            # geocode() returns None when the service finds no match; report
            # that explicitly instead of failing on `None.raw`.
            print(state_name + ', {' + city_name + '}', 'location not found')
            continue

        raw = location.raw
        cities_dict[key] = {
            'type': raw['type'],
            'importance': raw['importance'],
            'boundingbox': raw['boundingbox'],
            'lat': raw['lat'],
            'lng': raw['lon']  # the response field is 'lon'; the cache uses 'lng'
        }
    except Exception as ex:
        # Best effort: log the failure and carry on with the next city.
        print(state_name + ', {' + city_name + '}', ex)

with open('../shared_libs/data/cities_dict.pkl', 'wb') as f:
    pickle.dump(cities_dict, f)

Для всех городов, которые не были найдены в OSM, названия населённых пунктов были уточнены по почтовому индексу.

Ниже приведены списки замен для ошибочных названий штатов и городов.

In [None]:
# Manual correction tables, each entry is [wrong value, corrected value].
# State codes are matched exactly as they appear in the data.
states_replace = [
    ['Fl', 'FL'],
    ['BA', 'FL']
]

# City names are matched against the lower-cased 'city' column. A NaN
# replacement marks a value that cannot be mapped to a real city.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): 'west ashville' -> 'Ashville' may be meant as 'Asheville' —
# confirm against the geocoding results before changing it.
cities_replaces = [
    ['cherryhillsvillage', 'Cherry Hills Village'],
    ['commercecity', 'Commerce City'],
    ['federalheights', 'Federal Heights'],
    ['bonita spgs', 'Bonita Springs'],
    ['doctor philips', 'Orlando'],
    ['ldhl', 'Lauderhill'],
    ['p c beach', 'Panama City Beach'],
    ['un-incorporated broward county', 'Fort Lauderdale'],
    ['unincorporated broward county', 'Fort Lauderdale'],
    ['atlaanta', 'Atlanta'],
    ['saranac vlg', 'Saranac'],
    ['uninc', 'Charlotte'],
    ['west ashville', 'Ashville'],
    ['city center', 'Las Vegas'],
    ['bellerose manor', 'Queens Village'],
    ['bellerose vlg', 'Bellerose Village'],
    ['jamaica est', 'Jamaica'],
    ['old mill basin', 'Brooklyn'],
    ['downtown pgh', 'Pittsburgh'],
    ['outside area (outside ca)', 'Nashville'],
    ['unicorp/memphis', 'Memphis'],
    ['botines', 'Laredo'],
    ['brookside vl', 'Brookside Village'],
    ['bville', 'Brownsville'],
    ['clear lk shrs', 'Clear Lake Shores'],
    ['hollywood pa', 'Hollywood Park'],
    ['la moca', 'Laredo'],
    ['longvi', 'Longview'],
    ['mc allen', 'Mcallen'],
    ['mc gregor', 'Mcgregor'],
    ['mc kinney', 'Mckinney'],
    ['romayor', 'Cleveland'],
    ['s.a.', 'San Antonio'],
    ['tarkington prairie', 'Cleveland'],
    ['belllingham', 'Bellingham'],
    ['china spring', np.nan],
    ['other city - in the state of florida', np.nan],
    ['other city not in the state of florida', np.nan],
    ['other city value - out of area', np.nan],
    ['other city value out of area', np.nan],
    ['unincorporated dade county', np.nan],
    ['foreign country', np.nan],
    ['other', np.nan],
    [' ', np.nan],
    ['--', np.nan]
]

In [None]:
# Overwrite every misspelled state code with its corrected value.
for wrong_state, fixed_state in states_replace:
    df.loc[df['state'] == wrong_state, 'state'] = fixed_state

In [None]:
# Apply the city corrections: a NaN replacement blanks the city, any other
# replacement is stored lower-cased like the rest of the 'city' column.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for wrong_city, fixed_city in cities_replaces:
    replacement = np.nan if pd.isna(fixed_city) else fixed_city.lower()
    df.loc[df['city'] == wrong_city, 'city'] = replacement

In [None]:
# Keep only the rows that still have a city value.
df = df.dropna(axis='index', subset=['city'])

По данным спискам производится корректировка названий штатов и городов. Не найденные объекты удаляются из датасета.

Производится уточняющий поиск геоданных по исправленным городам.

In [None]:
# Second geocoding pass over the corrected (state, city) pairs: only keys
# that are still missing from cities_dict are requested, then the whole cache
# is written to disk again.
cities_list = df.groupby(['state', 'city'])['target'].mean().index

for state_name, city_name in cities_list:
    key = data_transform.get_city_dict_key(state_name, city_name)
    if key in cities_dict:
        continue  # already geocoded

    try:
        location = geolocator.geocode(key)
        if location is None:
            # geocode() returns None when the service finds no match; report
            # that explicitly instead of failing on `None.raw`.
            print(state_name + ', {' + city_name + '}', 'location not found')
            continue

        raw = location.raw
        cities_dict[key] = {
            'type': raw['type'],
            'importance': raw['importance'],
            'boundingbox': raw['boundingbox'],
            'lat': raw['lat'],
            'lng': raw['lon']  # the response field is 'lon'; the cache uses 'lng'
        }
    except Exception as ex:
        # Best effort: log the failure and carry on with the next city.
        print(state_name + ', {' + city_name + '}', ex)

with open('../shared_libs/data/cities_dict.pkl', 'wb') as f:
    pickle.dump(cities_dict, f)

## Кэш-словарь с геоданными об объектах недвижимости
Данные получены через сервисы OpenStreetMap, Google Maps и Census

Ключом словаря служит тройка \<state\>, \<city\>, \<street\> (название города приводится к нижнему регистру).

По каждому объекту сохраняется следующая информация:
* *lat* - широта объекта
* *lng* - долгота объекта

In [2]:
# In-memory cache of resolved addresses and the set of lookup keys that
# could not be resolved; both are filled by the cells below.
address_dict = dict()
not_founded_adresses = set()

In [None]:
# by OSM
#
# Resolve every row's street address through the OSM helper of data_transform.
# Results go to address_dict; a None result is treated as a failed lookup and
# its key is remembered in not_founded_adresses so it is not requested again.
# The cache is written to disk after every 50 successful lookups and once
# more at the end.

total_count = 0
found_count = 0
error_count = 0

# Upper bound on the number of lookups attempted in one run of this cell.
max_iters_count = 250000

for index, rec in df.iterrows():
    address = data_transform.get_address_dict_key(rec['state'], rec['city'], rec['street'])
    
    if (address not in address_dict) and (address not in not_founded_adresses):
        total_count += 1
        
        loc_rec = data_transform.get_address_location_info_by_osm(address, print_error=True)
        
        if loc_rec is not None:
            address_dict[address] = loc_rec
            found_count += 1
            
            if found_count % 50 == 0:
                print('Processed:', total_count, 'Success:', found_count/total_count*100, 'Error:', error_count/total_count*100)
                
                # Periodic checkpoint so an interrupted run keeps its results.
                with open('../shared_libs/data/address_dict.pkl', 'wb') as f:
                    pickle.dump(address_dict, f)
        else:
            error_count += 1
            not_founded_adresses.add(address)
            
            print('Processed:', total_count, 'Success:', found_count/total_count*100, 'Error:', error_count/total_count*100)
            
    if total_count >= max_iters_count:
        break

with open('../shared_libs/data/address_dict.pkl', 'wb') as f:
    pickle.dump(address_dict, f)

# total_count stays 0 when every address was already cached (e.g. on a
# re-run); guard the percentages against ZeroDivisionError.
summary_base = max(total_count, 1)
print('Processed:', total_count, 'Success:', found_count/summary_base*100, 'Error:', error_count/summary_base*100)
print(list(not_founded_adresses))

In [None]:
# by Google Maps API
#
# Second pass over the addresses using the Google Maps helper of
# data_transform. Only addresses that are neither cached in address_dict nor
# already recorded in not_founded_adresses are requested. The cache is
# written to disk after every 50 successful lookups and once more at the end.

# Read the key from the environment so that it is not stored in the notebook;
# the original placeholder is kept as the fallback value.
gmaps_api_key = os.environ.get('GMAPS_API_KEY', 'GMap API key')

total_count = 0
found_count = 0
error_count = 0

# Upper bound on the number of lookups attempted in one run of this cell.
max_iters_count = 250000

for index, rec in df.iterrows():
    address = data_transform.get_address_dict_key(rec['state'], rec['city'], rec['street'])
    
    if (address not in address_dict) and (address not in not_founded_adresses):
        total_count += 1
        
        loc_rec = data_transform.get_address_location_info(address, gmaps_api_key, print_error=True)
        
        if loc_rec is not None:
            address_dict[address] = loc_rec
            found_count += 1
            
            if found_count % 50 == 0:
                print('Processed:', total_count, 'Success:', found_count/total_count*100, 'Error:', error_count/total_count*100)
                
                # Periodic checkpoint so an interrupted run keeps its results.
                with open('../shared_libs/data/address_dict.pkl', 'wb') as f:
                    pickle.dump(address_dict, f)
        else:
            error_count += 1
            not_founded_adresses.add(address)
            
            print('Processed:', total_count, 'Success:', found_count/total_count*100, 'Error:', error_count/total_count*100)
            
    if total_count >= max_iters_count:
        break

with open('../shared_libs/data/address_dict.pkl', 'wb') as f:
    pickle.dump(address_dict, f)

# total_count stays 0 when nothing was left to look up; guard the
# percentages against ZeroDivisionError.
summary_base = max(total_count, 1)
print('Processed:', total_count, 'Success:', found_count/summary_base*100, 'Error:', error_count/summary_base*100)
print(list(not_founded_adresses))

## Кэш-словарь с геоданными по почтовому индексу
Данные получены через сервисы OpenStreetMap и Census

Для объектов, по которым не были найдены геоданные (по какой-либо причине), производится поиск центра района (используется почтовый индекс).

Таким образом, если объект присутствует в кэш-словаре адресов, то берутся координаты из этого словаря, иначе берутся геоданные из словаря по почтовому индексу.

Ключом словаря служит тройка \<state\>, \<city\>, \<zipcode\> (название города приводится к нижнему регистру).

По каждому почтовому индексу сохраняется следующая информация:
* *lat* - широта объекта
* *lng* - долгота объекта

In [3]:
# In-memory cache of zip-code level geodata; filled by the cells below.
address_by_zip_dict = dict()

In [None]:
# by OSM (zipcode)
#
# Fallback for rows whose street address is not in address_dict: look up
# "<state>, <city>, <zipcode>" through the OSM helper of data_transform and
# cache the result in address_by_zip_dict. The cache is written to disk after
# every 50 successful lookups and once more at the end.

total_count = 0
found_count = 0
error_count = 0

# Upper bound on the number of lookups attempted in one run of this cell.
max_iters_count = 250000

# Zip keys whose lookup already failed in this run. Many rows share one zip
# code, so without this set the same failing request would be repeated for
# every such row (the address cells skip known failures in the same way).
failed_zip_keys = set()

for index, rec in df.iterrows():
    address = data_transform.get_address_dict_key(rec['state'], rec['city'], rec['street'])
    address_zip = data_transform.get_address_zip_dict_key(rec['state'], rec['city'], rec['zipcode'])
    
    if (address not in address_dict) and (address_zip not in address_by_zip_dict):
        if address_zip in failed_zip_keys:
            # Known failure: record the address as before, but do not send
            # the identical request again.
            not_founded_adresses.add(address)
        else:
            total_count += 1
            
            # str() keeps the query identical for string values and avoids a
            # TypeError if a zipcode was parsed as a number.
            zip_query = ', '.join(str(part) for part in (rec['state'], rec['city'], rec['zipcode']))
            loc_rec = data_transform.get_address_location_info_by_osm(zip_query, print_error=True)
            
            if loc_rec is not None:
                address_by_zip_dict[address_zip] = loc_rec
                found_count += 1
                
                if found_count % 50 == 0:
                    print('Processed:', total_count, 'Success:', found_count/total_count*100, 'Error:', error_count/total_count*100)
                    
                    # Periodic checkpoint so an interrupted run keeps its results.
                    with open('../shared_libs/data/address_by_zip_dict.pkl', 'wb') as f:
                        pickle.dump(address_by_zip_dict, f)
            else:
                error_count += 1
                failed_zip_keys.add(address_zip)
                not_founded_adresses.add(address)
                
                print('Processed:', total_count, 'Success:', found_count/total_count*100, 'Error:', error_count/total_count*100)
            
    if total_count >= max_iters_count:
        break

with open('../shared_libs/data/address_by_zip_dict.pkl', 'wb') as f:
    pickle.dump(address_by_zip_dict, f)

# total_count stays 0 when nothing was left to look up; guard the
# percentages against ZeroDivisionError.
summary_base = max(total_count, 1)
print('Processed:', total_count, 'Success:', found_count/summary_base*100, 'Error:', error_count/summary_base*100)
print(list(not_founded_adresses))

In [None]:
# (state, city, zipcode) index ordered by the per-group 'street' count,
# largest groups first.
zipcode_street_counts = df.groupby(['state', 'city', 'zipcode']).agg({'street': 'count'})
all_zipcodes_index = zipcode_street_counts.sort_values(by='street', ascending=False).index

In [None]:
# by US Census (zipcode)
#
# Last fallback: for every (state, city, zipcode) group that is still missing
# from address_by_zip_dict, query the US Census helper of data_transform.
# A None result is treated as a failed lookup. The cache is written to disk
# after every 50 successful lookups and once more at the end.

total_count = 0
found_count = 0
error_count = 0

for index in all_zipcodes_index:
    address_zip = data_transform.get_address_zip_dict_key(index[0], index[1], index[2])
    
    if address_zip not in address_by_zip_dict:
        total_count += 1
        
        loc_rec = data_transform.get_zipcode_location_info_by_us_census(index[0], index[1], index[2], print_error=True)
        
        if loc_rec is not None:
            address_by_zip_dict[address_zip] = loc_rec
            found_count += 1
            
            if found_count % 50 == 0:
                print('Processed:', total_count, 'Success:', found_count/total_count*100, 'Error:', error_count/total_count*100)
                
                # Periodic checkpoint so an interrupted run keeps its results.
                with open('../shared_libs/data/address_by_zip_dict.pkl', 'wb') as f:
                    pickle.dump(address_by_zip_dict, f)
        else:
            error_count += 1
            not_founded_adresses.add(address_zip)
            
            print('Processed:', total_count, 'Success:', found_count/total_count*100, 'Error:', error_count/total_count*100)
            
with open('../shared_libs/data/address_by_zip_dict.pkl', 'wb') as f:
    pickle.dump(address_by_zip_dict, f)

# total_count stays 0 when every zip code was already resolved; guard the
# percentages against ZeroDivisionError.
summary_base = max(total_count, 1)
print('Processed:', total_count, 'Success:', found_count/summary_base*100, 'Error:', error_count/summary_base*100)
print(list(not_founded_adresses))

## Формирование словаря кластеров недвижимости по цене объектов

Для каждого города формируются два кластера (наиболее дорогая недвижимость и наиболее дешёвая).

По каждому городу берется верхний и нижний перцентиль (20%) и для этих групп берется медиана широты и долготы. Эти координаты и принимаются за центр кластера, относительно которого и вычисляется нормированное расстояние до объекта недвижимости.

In [None]:
# Result container for the per-city cluster data computed in the next cell.
cities_clusters_dict = dict()

# (state, city) index ordered by the per-group 'city' count, largest first.
city_row_counts = (
    df.groupby(['state', 'city'])
    .agg({'city': 'count'})
    .rename({'city': 'city_count'}, axis=1)
)
all_cities = city_row_counts.sort_values(by=['city_count'], ascending=False).index

In [None]:
# Percentile size passed to the cluster helper (20 per the notebook text).
PERCENTILE_SIZE = 20

# Compute the cluster entry of every (state, city) pair with the
# data_transform helper, store it under the city's cache key, and pickle the
# resulting dictionary.
for state, city in all_cities:
    cluster_key = data_transform.get_city_dict_key(state, city)
    cluster_info = data_transform.get_subset_mean_location(
        df, state, city, PERCENTILE_SIZE,
        cities_dict, address_dict, address_by_zip_dict
    )
    cities_clusters_dict[cluster_key] = cluster_info

with open('../shared_libs/data/cities_clusters_dict.pkl', 'wb') as f:
    pickle.dump(cities_clusters_dict, f)