In [17]:
api_key = '91ac92fd-6089-4694-80ab-63b6086f3a38'

from decimal import Decimal
import pandas as pd
import numpy as np
from yandex_geocoder import Client, NothingFound
import json 
import spacy

In [2]:
coords = pd.read_csv('coords.csv', index_col=0)

In [74]:
coords

Unnamed: 0,Trip Name,Destination City,x,y,temp_addr
0,Turkey Antalya LED,,30.701659,36.885843,
1,Turkey Antalya LED,,30.701659,36.885843,
2,Turkey Antalya MOW,Боазкент,35.302075,39.056249,
3,Turkey Antalya MOW,Текирова,30.525903,36.501248,
4,Turkey Antalya MOW,Текирова,30.525903,36.501248,
...,...,...,...,...,...
209604,Turkey Antalya LED,,30.701659,36.885843,
209605,Turkey Antalya MOW,Лара,32.466821,37.883113,
209606,Turkey Antalya MOW,,30.701659,36.885843,
209607,TUI Premium Antalya MOW,Белек Центр,31.063114,36.864517,


In [72]:
_df = coords[coords['x'].isna() & coords['Destination City'].isna()]

In [73]:
_df

Unnamed: 0,Trip Name,Destination City,x,y,temp_addr
27,Байкал экскурсионные туры MOW-UUD,,,,
30,Абхазия MOW GDS,,,,
50,Россия Сочи LED GDS,,,,
51,Россия Сочи LED GDS,,,,
52,Байкал экскурсионные туры MOW-UUD,,,,
...,...,...,...,...,...
209584,Россия Сочи MOW,,,,
209588,UAE Dubai MOW DXB EK [БЛОКИ МЕСТ] part 3,,,,
209595,Россия Шерегеш SKI GDS MOW,,,,
209597,Infotour Байкал,,,,


In [71]:
_df = _df.drop_duplicates()

In [20]:
import re

def hand_clearing(s):
    s = re.sub(r" ?\([^)]+\)", '', s)
    s = re.sub(r" ?\[[^)]+\]", '', s)
    s = re.sub(r"-", ' ', s)
    s = re.sub(r"\+", ' ', s)
    s = re.sub(r"/", '', s)
    s = re.sub(r"[\d-]", '', s)
    s = re.sub(r"дней", '', s)
    s = re.sub(r"дня", '', s)
    s = re.sub(r'TOUR', '', s)
    s = re.sub(r'tour', '', s)
    s = re.sub(r'part', '', s)
    s = re.sub(r'OLD', '', s)
    s = re.sub(r'Fantasy', '', s)
    s = re.sub(r'Infotour', '', s)
    s = ' '.join(word for word in s.split() if len(word)>3)
    return s

In [70]:
_df

Unnamed: 0,Trip Name,Destination City,x,y,temp_addr
27,Байкал экскурсионные туры MOW-UUD,,,,
30,Абхазия MOW GDS,,,,
50,Россия Сочи LED GDS,,,,
82,UAE Dubai MOW DXB/RKT Ural Airlines [БЛОКИ МЕС...,,,,
91,UAE Dubai GDS MOW DXB part 3,,,,
...,...,...,...,...,...
208946,"Нетленная классика, 2 дня",,,,
209084,Россия Казань LED GDS,,,,
209414,Россия Экскурсионные туры (без перелета),,,,
209446,GR PR and Press BAIKAL,,,,


In [67]:
# _df['Trip Name clear'] = _df['Trip Name'].map(remove_trash)
# _df = _df.set_index('Trip Name', drop=True)
# main_d = _df[['x', 'y']].T.to_dict()

with open('TRAVELNAME_COORDS.json', 'w') as f:
    json.dump(main_d, f)

# with open('TRAVELNAME_COORDS.json', 'r') as f:
#     main_d = json.loads(f.read())

In [6]:
from decimal import Decimal
from typing import Tuple

import requests

class YandexGeocoderException(Exception):
    pass


class UnexpectedResponse(YandexGeocoderException):
    pass


class NothingFound(YandexGeocoderException):
    pass


class InvalidKey(YandexGeocoderException):
    pass


class Client:
    """Yandex geocoder API client.
    :Example:
        >>> from yandex_geocoder import Client
        >>> client = Client("your-api-key")
        >>> coordinates = client.coordinates("Москва Льва Толстого 16")
        >>> assert coordinates == (Decimal("37.587093"), Decimal("55.733969"))
        >>> address = client.address(Decimal("37.587093"), Decimal("55.733969"))
        >>> assert address == "Россия, Москва, улица Льва Толстого, 16"
    """

    __slots__ = ("api_key",)

    api_key: str

    def __init__(self, api_key: str):
        self.api_key = api_key

    def _request(self, address: str) -> dict:
        response = requests.get(
            "https://geocode-maps.yandex.ru/1.x/",
            params=dict(format="json", apikey=self.api_key, geocode=address),
        )

        if response.status_code == 200:
            return response.json()["response"]
        elif response.status_code == 403:
            raise InvalidKey()
        else:
            raise UnexpectedResponse(
                f"status_code={response.status_code}, body={response.content}"
            )

    def coordinates(self, address: str) -> Tuple[Decimal]:
        """Fetch coordinates (longitude, latitude) for passed address."""
        data = self._request(address)["GeoObjectCollection"]["featureMember"]

        if not data:
            raise NothingFound(f'Nothing found for "{address}" not found')

        coordinates = data[0]["GeoObject"]["Point"]["pos"]  # type: str
        longitude, latitude = tuple(coordinates.split(" "))
        
        # print(data)
        
        return float(longitude), float(latitude)

In [7]:
import geonamescache

gc = geonamescache.GeonamesCache()

countries = gc.get_countries()
cities = gc.get_cities()

In [31]:
nlp = spacy.load("en_core_web_md")

def nlp_clearing(s):
    doc = nlp(s)
    return ' '. join(x.text for x in doc.ents)

In [22]:
client = Client(api_key)

for s, data in main_d.items():
    if np.isnan(data['x']):
        try:
            if nlp_clearing(s) == '':
                s_clear = hand_clearing(s)
            else:
                s_clear = nlp_clearing(s)
                
            coordinates = client.coordinates(s)
            main_d[s] = dict(x=coordinates[0], y=coordinates[1])
            print('ok')
        except NothingFound:
            print(s, ': not ok')
    else:
        print('pass')

pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
pass
ok
ok
ok
UAE Ras-Al-Khaimah GDS MOW RKT part 3 : not ok
ok
ok
Продажа доп. услуг отдельно : not ok
Байкал экскурсионные туры SVX-UUD : not ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
UAE Dubai MOW DXB/RKT Ural Airlines [БЛОКИ МЕСТ] : not ok
ok
ok
ok
UAE Dubai GDS GRV DXB part 3 : not ok
ok
ok
UAE Dubai MOW DXB EK [БЛОКИ МЕСТ] part 3 : not ok
Калининград отели на побережье MOW GDS. : not ok
UAE Sharjah GDS MOW SHJ part 3 : not ok
ok
ok
ok
ok
UAE Dubai MOW DXB EK [БЛОКИ МЕСТ] : not ok
ok
Cyprus S21 (hotel only) RU : not ok
ok
ok
Домбай-Приэльбрусье GDS MOW : not ok
ok
SL TOUR Burgas/Varna MOW : not ok
ok
ok
Классическая Италия MOW : not ok
ok
ok
ok
Greece Rhodes S18 MOW part 2 : not ok
ok
ok
ok
SL TOUR Larnaca MOW : not ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
Greece Rhodes S18 MOW : not ok
Калабрия Тирренское побережье S18 MOW old : not ok
Cyprus S20 (hotel only) RU : not ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
Римские каникулы MOW : not ok
UAE Dubai DWC U

Большое золотое кольцо России 7 дней (Золотое кольцо России) : not ok
Italy Garda VRN-LED GDS : not ok
Солнечная Португалия MOW GDS (прямой) : not ok
TUI Premium Dalaman LED : not ok
ok
Англия-Шотландия-Уэльс MOW GDS : not ok
ok
Замки Беларуси 2 дня (Беларусь без перелета) : not ok
я_OLDCzech Republic- Prague (hotel only) RU : not ok
ok
ok
ok
UAE Dubai MOW GDS EK Dubai Parks + пляжный отдых : not ok
ok
TUI Premium UAE Dubai GDS SVX DXB : not ok
ok
Калининград экскурсионные туры GDS LED : not ok
ok
ok
Dynamic package: Portugal GDS : not ok
Сокровища Тосканы MOW : not ok
SL TOUR Bodrum MOW : not ok
Романтический Новый год в Италии из Вероны MOW : not ok
Капучино и Шоколад из Вероны MOW : not ok
ok
ok
ok
Италия + Австрия + Германия MOW : not ok
Римские каникулы + Римини MOW : not ok
ok
Париж и Замки Луары MOW GDS : not ok
Классическая Италия из Рима LED GDS : not ok
Из Италии с любовью из Вероны MOW GDS : not ok
Япония: Экскурсионные туры MOW GDS : not ok
ok
ok
Cyprus LED GDS WINTER 18/19

In [56]:
main_d = {k: {'x': float(v['x']), 'y': float(v['y'])} for k, v in main_d.items()}

In [57]:
_d = {k: v for k, v in main_d.items() if np.isnan(v['x'])}

len(_d)

298

### Руками

In [63]:
for k in main_d:
    if not np.isnan(main_d[k]['x']):
        continue
    elif 'Dubai' in k:
        s = 'Dubai'
    elif 'Калиниград' in k:
        s = 'Калиниград'
    elif 'Austria' in k:
        s = 'Austria'
    elif 'Сочи' in k:
        s = 'Сочи'
    else:
        continue
    
    coordinates = client.coordinates('Dubai')
    main_d[k] = {'x': float(coordinates[0]), 'y': float(coordinates[1])}

In [64]:
_ = coords[coords['Trip Name'].isin(_d) & coords['Destination City'].isna()]

In [65]:
_.groupby('Trip Name').agg('count')

Unnamed: 0_level_0,Destination City,x,y,temp_addr
Trip Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Austria W1819 (hotel only) RU,0,0,0,0
Combitour Albania MOW (TIA-TGD),0,0,0,0
Combitour Barcelona + Dorada Hotels S19 MOW,0,0,0,0
Combitour Jordan Dead Sea + Aqaba/Tala Bay MOW-AQJ,0,0,0,0
Combitour Port Aventura + Best Hotels S19 MOW,0,0,0,0
...,...,...,...,...
я_OLDПрага Классическая MOW GDS,0,0,0,0
я_OLDПрага Саксония MOW GDS,0,0,0,0
я_OLDПрага Эконом MOW GDS,0,0,0,0
я_OLDПрага Южная Чехия MOW GDS,0,0,0,0


In [66]:
_

Unnamed: 0,Trip Name,Destination City,x,y,temp_addr
1950,Cyprus S21 (hotel only) RU,,,,
2143,Домбай-Приэльбрусье GDS MOW,,,,
2144,Домбай-Приэльбрусье GDS MOW,,,,
2166,SL TOUR Burgas/Varna MOW,,,,
2182,Классическая Италия MOW,,,,
...,...,...,...,...,...
209272,UAE Hotel Only part 3,,,,
209294,UAE Hotel Only part 3,,,,
209297,Maldives MOW GDS (Аэрофлот) воскресенье,,,,
209393,TUI Premium Россия Сочи MOW,,,,


In [44]:
import time

for row in coords[coords['x'].isna()].iterrows():
    index = row[0]
    trip_name = row[1]['Trip Name']
    dest_city = row[1]['Destination City']

    if not type(dest_city) == str:
        continue

    print(dest_city)
    
    if type(dest_city) == str and type coords.loc[index]['x']):
        try:
            coordinates = client.coordinates(dest_city)
            coords.loc[coords['Destination City'] == dest_city, 'x'] = coordinates[0]
            coords.loc[coords['Destination City'] == dest_city, 'y'] = coordinates[1]
            print(coords.loc[coords['Destination City'] == dest_city])
            print('ok')
        except NothingFound as e:
            print(e)
            continue

Пляжные отели Фуджейры
Nothing found for "Пляжные отели Фуджейры" not found
Пляжные отели Фуджейры
Nothing found for "Пляжные отели Фуджейры" not found
Дубай Марина
                                   Trip Name Destination City          x  \
136                           UAE Hotel Only     Дубай Марина  55.147575   
137                           UAE Hotel Only     Дубай Марина  55.147575   
3713     UAE Dubai MOW DWC/RKT Ural Airlines     Дубай Марина  55.147575   
3714     UAE Dubai MOW DWC/RKT Ural Airlines     Дубай Марина  55.147575   
3725     UAE Dubai MOW DWC/RKT Ural Airlines     Дубай Марина  55.147575   
...                                      ...              ...        ...   
206583   UAE Dubai MOW DWC/RKT Ural Airlines     Дубай Марина  55.147575   
206736                 UAE Sharjah MOW 18/19     Дубай Марина  55.147575   
207285     UAE Dubai MOW DXB EK [БЛОКИ МЕСТ]     Дубай Марина  55.147575   
209267               UAE Sharjah GDS MOW SHJ     Дубай Марина  55.147575   

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [50]:
coords[~coords['x'].isna()]['x'][0]

float

In [20]:
coords.loc[27]

Trip Name           Байкал экскурсионные туры MOW-UUD
Destination City                                  NaN
x                                                 NaN
y                                                 NaN
temp_addr                                         NaN
Name: 27, dtype: object

In [22]:
coords.loc[coords['Destination City'] == dest_city, 'x']

58        NaN
59        NaN
2007      NaN
3783      NaN
4642      NaN
         ... 
207004    NaN
207056    NaN
207109    NaN
209465    NaN
209582    NaN
Name: x, Length: 378, dtype: object

In [23]:
dest_city

'Пляжные отели Фуджейры'

In [54]:
coords.dropna(how='any')

Unnamed: 0,Trip Name,Destination City,x,y,temp_addr


In [68]:
_df = coords[~coords['Destination City'].isna()].drop_duplicates()

In [100]:
d = _df.set_index('Destination City').drop(columns='Trip Name').fillna(0).T.to_dict()

  """Entry point for launching an IPython kernel.


In [72]:
d = d_0_clear
d

{'Dubai part': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Калининград отели побережье GDS.': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Sharjah part': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Cyprus': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Домбай Приэльбрусье': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Классическая Италия Рима': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Greece Heraklion part': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Montenegro': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Прага Эконом': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Калининград отели побережье': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Санкт Петербург легче легкого!': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Незабываемые берегах Невы, CANCEL': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Premium Hotel Only': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Калининград экскурсионные туры': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Weekend Израиле': {'x': 0.0, 'y': 0.0, 'hotel_only': Tru

In [73]:
for s, data in d.items():
    if data['x'] == 0:
        try:
            coordinates = client.coordinates(s)
            d[s] = dict(x=coordinates[0], y=coordinates[1])
            print('ok')
        except NothingFound:
            print(s, ': not ok')
    else:
        print('pass')

InvalidKey: 

In [16]:
len(d)

930

In [9]:
d_0 = {k: v for k, v in d.items() if v['x'] == 0}
print(len(d_0))

138


In [11]:
hotel_only = ['без перелета', 'без авиаперелета', 'hotel only', 'Hotel Only', 'только отель', 'отдельно', 'GDS']
i = 0

for k in d_0:
    for t in hotel_only:
        if t in k:
            d_0[k]['hotel_only'] = True
            i += 1
    if 'GDS' in k:
        d_0[k]['GDK'] = True

In [66]:
import re

def remove_trash(s):
    s = re.sub(r" ?\([^)]+\)", '', s)
    s = re.sub(r" ?\[[^)]+\]", '', s)
    s = re.sub(r"-", ' ', s)
    s = re.sub(r"\+", ' ', s)
    s = re.sub(r"/", '', s)
    s = re.sub(r"[\d-]", '', s)
    s = re.sub(r"дней", '', s)
    s = re.sub(r"дня", '', s)
    s = re.sub(r'TOUR', '', s)
    s = ' '.join(word for word in s.split() if len(word)>3)
    return s

In [67]:
remove_trash('Infotour Greece SKG+CFU S18 MOW')

'Infotour Greece'

In [68]:
d_0_clear = {remove_trash(k): v for k, v in d_0.items()}

In [69]:
d_0_clear

{'Dubai part': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Калининград отели побережье GDS.': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Sharjah part': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Cyprus': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Домбай Приэльбрусье': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Классическая Италия Рима': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Greece Heraklion part': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Montenegro': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Прага Эконом': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Калининград отели побережье': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Санкт Петербург легче легкого!': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Незабываемые берегах Невы, CANCEL': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Premium Hotel Only': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Калининград экскурсионные туры': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Weekend Израиле': {'x': 0.0, 'y': 0.0, 'hotel_only': Tru

In [6]:
d_0_ho = d_0 = {k: v for k, v in d_0_clear.items() if v.get('hotel_only') == True}
d_0_not_ho = {k: v for k, v in d_0_clear.items() if v.get('hotel_only') != True}

NameError: name 'd_0_clear' is not defined

In [16]:
d

{'Dubai part': {'x': 0.0, 'y': 0.0},
 'Калининград отели побережье GDS.': {'x': 0.0,
  'y': 0.0,
  'hotel_only': True,
  'GDK': True},
 'Sharjah part': {'x': 0.0, 'y': 0.0},
 'Cyprus': {'x': 0.0, 'y': 0.0},
 'Домбай Приэльбрусье': {'x': 0.0, 'y': 0.0},
 'Классическая Италия Рима': {'x': 0.0, 'y': 0.0},
 'Greece Heraklion part': {'x': 0.0, 'y': 0.0},
 'Montenegro': {'x': 0.0, 'y': 0.0},
 'Прага Эконом': {'x': 0.0, 'y': 0.0},
 'Калининград отели побережье': {'x': 0.0, 'y': 0.0},
 'Санкт Петербург легче легкого!': {'x': 0.0, 'y': 0.0},
 'Незабываемые берегах Невы, CANCEL': {'x': 0.0, 'y': 0.0},
 'Premium Hotel Only': {'x': 0.0, 'y': 0.0, 'hotel_only': True},
 'Калининград экскурсионные туры': {'x': 0.0, 'y': 0.0},
 'Weekend Израиле': {'x': 0.0, 'y': 0.0},
 'Spain Costa': {'x': 0.0, 'y': 0.0},
 'отели Финляндии Куусамо': {'x': 0.0, 'y': 0.0},
 'Israel DEAD': {'x': 0.0, 'y': 0.0},
 'Italy Verona': {'x': 0.0, 'y': 0.0},
 'Деду морозу Великий Устюг LIGHT': {'x': 0.0, 'y': 0.0},
 'Калининград 

In [22]:
air = pd.read_csv('airport_code.csv')

In [27]:
air[air['IATA airport code'] == 'КМВ']

Unnamed: 0.1,Unnamed: 0,City,Country,IATA airport code


In [76]:
dd = {k: {'x': float(v['x']), 'y': float(v['y'])} for k, v in d.items()}

In [26]:
dd

NameError: name 'dd' is not defined

In [8]:
with open('TRAVELNAME_COORDS.json', 'r') as f:
    d = json.loads(f.read())

In [9]:
d

{'Turkey Antalya LED': {'x': 30.701659, 'y': 36.885843},
 'Turkey Antalya MOW': {'x': 30.701659, 'y': 36.885843},
 'Байкал экскурсионные туры MOW-UUD': {'x': 0.0, 'y': 0.0},
 'Абхазия MOW GDS': {'x': 0.0, 'y': 0.0},
 'TUI Premium Antalya MOW': {'x': 30.701659, 'y': 36.885843},
 'Turkey Antalya KZN': {'x': 30.701659, 'y': 36.885843},
 'Россия Сочи LED GDS': {'x': 0.0, 'y': 0.0},
 'Turkey Antalya KUF': {'x': 30.701659, 'y': 36.885843},
 'TUI Premium Turkey (hotel only) RU Antalya': {'x': 30.701659,
  'y': 36.885843},
 'UAE Dubai MOW DXB/RKT Ural Airlines [БЛОКИ МЕСТ] part 3': {'x': 0.0,
  'y': 0.0},
 'TUI Premium Antalya LED': {'x': 30.701659, 'y': 36.885843},
 'UAE Dubai GDS MOW DXB part 3': {'x': 0.0, 'y': 0.0},
 'Россия (только отель)': {'x': 0.0, 'y': 0.0},
 'Maldives (hotel+transfer) RU': {'x': 0.0, 'y': 0.0},
 'Россия Анапа SVX': {'x': 0.0, 'y': 0.0},
 'Maldives MOW GDS': {'x': 0.0, 'y': 0.0},
 'Россия Шерегеш SKI GDS MOW': {'x': 0.0, 'y': 0.0},
 'TUI Premium Antalya KZN': {'x': 30

In [17]:
dir()

['Client',
 'Decimal',
 'In',
 'NothingFound',
 'Out',
 '_',
 '_13',
 '_14',
 '_16',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_i',
 '_i1',
 '_i10',
 '_i11',
 '_i12',
 '_i13',
 '_i14',
 '_i15',
 '_i16',
 '_i17',
 '_i2',
 '_i3',
 '_i4',
 '_i5',
 '_i6',
 '_i7',
 '_i8',
 '_i9',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 'api_key',
 'client',
 'd',
 'd_0',
 'exit',
 'f',
 'get_ipython',
 'hotel_only',
 'i',
 'json',
 'k',
 'np',
 'pd',
 'quit',
 't']

'GDS'

In [132]:
_df = coords[~coords['Trip Name'].isna() & coords['Destination City'].isna()].drop_duplicates()[['Trip Name', 'x', 'y']].fillna(0)