In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import folium
from sklearn.preprocessing import LabelEncoder
from geopy.geocoders import Yandex
from geopy.distance import vincenty
from time import time
from tqdm import tqdm

In [2]:
train = pd.read_csv('data/train_set.csv')
test = pd.read_csv('data/test_set.csv')

train.rename(columns={ 'pos_adress_lat' : 'pos_address_lat', 
                       'pos_adress_lon' : 'pos_address_lon'}, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Удалить транзакции, где нет адреса координат atm, pos и terminal_id (в трейне это все строчки, где нет terminal_id)
train.drop(train[train['terminal_id'].isnull() == True].index, axis=0, inplace=True)

In [4]:
# Подсчет частот адресов
atm_address_count = train['atm_address'].value_counts()

# Удаление транзакций с atm_address = "RADNAYA\             RUS,445" (по нему координат нет)
# train.drop(train[train['atm_address'] == atm_address_count.index[45]].index, axis=0, inplace=True)

# Замена 'Подольск, ул. Полевановская, д. 9, (Заезд с ул. Орджоникидзе д. 25)' на 'Подольск, ул. Полевановская, д. 9'
train['atm_address'].replace(to_replace=atm_address_count.index[46], value='Подольск, ул. Полевановская, д. 9', inplace=True)

In [5]:
# Адреса с '\' в названии (частоты)
backslash_atm_address_count = train['atm_address'][train['atm_address'].str.contains("\\\\") == True].value_counts()

# Замены до частоты 100 более-менее читаемых адресов
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[1], value='улица Маросейка, 3/13с1, Москва', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[9], value='улица Савушкина, 141, Санкт-Петербург', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[12], value='Новомытищинский проспект, 34, Москва', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[14], value='проспект Большевиков, 18, Санкт-Петербург', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[15], value='Марксистская улица, 1, Москва', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[16], value='Гражданский проспект, 41к2, Санкт-Петербург', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[18], value='улица Грекова, 8, Москва', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[23], value='Ракетный бульвар, 16, Москва', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[27], value='Новочеркасский проспект, 43/17, Санкт-Петербург', inplace=True)

# Остальные удалить
train.drop(train[train['atm_address'].str.contains("\\\\") == True].index, inplace=True)

In [6]:
# Для удобства анализа сделаю labelencoding на customer_id и terminal_id
label_encoder = LabelEncoder()
train['customer_id'] = label_encoder.fit_transform(train['customer_id'])
train['terminal_id'] = label_encoder.fit_transform(train['terminal_id'])

In [7]:
# Обнуление NaN адресов atm и pos
train[['atm_address', 'atm_address_lat', 'atm_address_lon',
       'pos_address', 'pos_address_lat', 'pos_address_lon']] = train[['atm_address', 'atm_address_lat', 'atm_address_lon', 
                                                                      'pos_address', 'pos_address_lat', 'pos_address_lon']].fillna(0)


In [8]:
# у 13 pos адресов координаты разные - исправляем
wrong_pos_address = []
with open('pos_address.txt', 'r') as file:
    for line in file:
        wrong_pos_address.append(line.strip())
        
geolocator = Yandex()

#0
location = geolocator.geocode('B. KAMENSCHIKI, 4MOSCOW')
#pos = train[train['pos_address'] == wrong_pos_address[0]]
#pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})

train['pos_address_lat'][train['pos_address'] == wrong_pos_address[0]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[0]] = location.longitude

#1
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[1]] = 59.2243102
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[1]] = 39.8657153

#2 под вопросом, потому что обе точки имеют место быть
#train['pos_address_lat'][train['pos_address'] == wrong_pos_address[2]] = 55.7955715
#train['pos_address_lon'][train['pos_address'] == wrong_pos_address[2]] = 37.5937033

#3 
# тоже обе точки могут быть

#4
location = geolocator.geocode("5-1A, SUSCHEVSKIY VALMOSKVA")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[4]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[4]] = location.longitude

#5
location = geolocator.geocode("'KOMMUNISTICHESKAYA STR 1MYTISCHY")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[5]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[5]] = location.longitude

#6
location = geolocator.geocode("MIRA STR 51MYTISCHY")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[6]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[6]] = location.longitude

#7
location = geolocator.geocode("42, SCHUKINSKAYA MOSKVA")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[7]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[7]] = location.longitude

#8
#нормальная, отличие в 8 незначящем знаке

#9
location = geolocator.geocode("2, VETERANOV MYTISCHI")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[9]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[9]] = location.longitude

#10
location = geolocator.geocode("55 BOLSHOY SAMPSONIEVSKIY PR SANKT-PETERBURG")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[10]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[10]] = location.longitude

#11
location = geolocator.geocode("20 BOLSHOY SAMPSONIEVSKIY PRSANKT-PETERBURG")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[11]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[11]] = location.longitude

#12
location = geolocator.geocode("Счастливая улица, 11, Усады")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[12]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[12]] = location.longitude

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is 

### Посмотрим, где нет ни atm ни pos

In [166]:
# Замена нулями координат atm и pos
train[['atm_address_lat', 'atm_address_lon', 'pos_address_lat', 'pos_address_lon']] = train[['atm_address_lat', 'atm_address_lon', 'pos_address_lat', 'pos_address_lon']].fillna(0)

In [167]:
no_atm_pos = train[(train['atm_address_lat'] == 0) & (train['pos_address_lat'] == 0)]
no_atm_pos.shape[0]

97411

In [168]:
no_atm_pos[:10]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
302067,3.836768,,0.0,0.0,"""Caf&#233","-bar Campus""",,4926,55.901,37.401,7000,,0.0,0.0,172692,,55.533,37.618
555784,2.366405,,0.0,0.0,"""OSN.PR","ZZ""",,1810,55.639,37.611,235,,0.0,0.0,172692,,,
555788,2.243972,,0.0,0.0,"""OSN.PR","ZZ""",,1810,55.639,37.611,169,,0.0,0.0,172692,,,
555795,2.487648,,0.0,0.0,"""OSN.PR","ZZ""",,1810,55.639,37.611,304,,0.0,0.0,172692,,,
555797,2.260354,,0.0,0.0,"""OSN.PR","ZZ""",,1810,55.639,37.611,183,,0.0,0.0,172692,,,
555805,2.287494,,0.0,0.0,"""OSN.PR","ZZ""",,1810,55.639,37.611,200,,0.0,0.0,172692,,,
555808,2.225642,,0.0,0.0,"""OSN.PR","ZZ""",,1810,55.639,37.611,168,,0.0,0.0,172692,,,
555816,2.364249,,0.0,0.0,"""OSN.PR","ZZ""",,1810,55.639,37.611,227,,0.0,0.0,172692,,,
555821,2.49655,,0.0,0.0,"""OSN.PR","ZZ""",,1810,55.639,37.611,324,,0.0,0.0,172692,,,
555824,2.330614,,0.0,0.0,"""OSN.PR","ZZ""",,1810,55.639,37.611,217,,0.0,0.0,172692,,,


In [175]:
len(no_atm_pos['terminal_id'].unique())

17648

In [179]:
no_atm_pos[no_atm_pos['terminal_id'] == 172692].shape[0]

44

In [178]:
train[train['terminal_id'] == 172692].shape[0]

44

In [183]:
# Число несовпадающих количеств terminal_id
count = 0
for i in tqdm(range(len(no_atm_pos['terminal_id'].unique()))):
    if (no_atm_pos[no_atm_pos['terminal_id'] == no_atm_pos['terminal_id'].unique()[i]].shape[0] != train[train['terminal_id'] == no_atm_pos['terminal_id'].unique()[i]].shape[0]):
        count += 1
print(count)

100%|███████████████████████████████████████████████████████████████████████████| 17648/17648 [02:34<00:00, 114.01it/s]


25


In [185]:
terminal_id = []
for i in tqdm(range(len(no_atm_pos['terminal_id'].unique()))):
    if (no_atm_pos[no_atm_pos['terminal_id'] == no_atm_pos['terminal_id'].unique()[i]].shape[0] != train[train['terminal_id'] == no_atm_pos['terminal_id'].unique()[i]].shape[0]):
        terminal_id.append(i)

100%|███████████████████████████████████████████████████████████████████████████| 17648/17648 [02:11<00:00, 134.03it/s]


In [189]:
print(terminal_id)

[275, 742, 770, 1606, 1650, 2183, 2802, 4026, 4140, 4316, 5180, 6147, 6386, 6560, 6946, 7122, 7274, 7351, 8083, 10484, 12307, 12354, 14490, 14877, 16627]


In [190]:
no_atm_pos['terminal_id'].unique()[275]

6492

In [192]:
no_atm_pos[no_atm_pos['terminal_id'] == 6492].shape[0]

12

In [193]:
train[train['terminal_id'] == 6492].shape[0]

13

In [194]:
train[train['terminal_id'] == 6492]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
557595,2.36052,,0.0,0.0,MOSCOW,RUS,643.0,6383,54.933,83.123,4814,33 KOTELNICHESKAYA EMBMOSCOW157000 RUSRUS,55.74064,37.647094,6492,2017-09-23,54.98,83.022
948285,4.466613,ROEV REVOLUCII 35 K 1\NVSIBR\630012 RUSRUS,0.0,0.0,NVSIBR,RUS,643.0,894,54.96,83.179,6011,,0.0,0.0,6492,2017-02-10,54.945,82.85
948286,4.485451,ROEV REVOLUCII 35 K 1\NVSIBR\630012 RUSRUS,0.0,0.0,NVSIBR,RUS,643.0,894,54.96,83.179,6011,,0.0,0.0,6492,2017-09-10,54.945,82.85
948287,4.467291,ROEV REVOLUCII 35 K 1\NVSIBR\630012 RUSRUS,0.0,0.0,NVSIBR,RUS,643.0,894,54.96,83.179,6011,,0.0,0.0,6492,2017-10-09,54.945,82.85
948288,4.468131,ROEV REVOLUCII 35 K 1\NVSIBR\630012 RUSRUS,0.0,0.0,NVSIBR,RUS,643.0,894,54.96,83.179,6011,,0.0,0.0,6492,2017-07-21,54.945,82.85
948289,3.564609,ROEV REVOLUCII 35 K 1\NVSIBR\630012 RUSRUS,0.0,0.0,NVSIBR,RUS,643.0,1304,54.991,83.005,6011,,0.0,0.0,6492,2017-06-27,,
948290,3.988495,ROEV REVOLUCII 35 K 1\NVSIBR\630012 RUSRUS,0.0,0.0,NVSIBR,RUS,643.0,894,54.96,83.179,6011,,0.0,0.0,6492,2017-04-17,54.945,82.85
948291,3.659052,ROEV REVOLUCII 35 K 1\NVSIBR\630012 RUSRUS,0.0,0.0,NVSIBR,RUS,643.0,1304,54.991,83.005,6011,,0.0,0.0,6492,2017-07-24,,
948292,2.713506,ROEV REVOLUCII 35 K 1\NVSIBR\630012 RUSRUS,0.0,0.0,NVSIBR,RUS,643.0,8808,54.77,83.08,6011,,0.0,0.0,6492,2017-06-29,54.967,82.934
948293,4.355365,ROEV REVOLUCII 35 K 1\NVSIBR\630012 RUSRUS,0.0,0.0,NVSIBR,RUS,643.0,1304,54.991,83.005,6011,,0.0,0.0,6492,2017-06-25,,


In [196]:
print(no_atm_pos[no_atm_pos['terminal_id'] == no_atm_pos['terminal_id'].unique()[terminal_id[1]]].shape[0])
print(train[train['terminal_id'] == no_atm_pos['terminal_id'].unique()[terminal_id[1]]].shape[0])

9
11


In [197]:
train[train['terminal_id'] == no_atm_pos['terminal_id'].unique()[terminal_id[1]]]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
82305,2.641953,,0.0,0.0,STPETE,RUS,643.0,1905,59.878,30.458,5732,3 NARODNAYA STSTPETE190000 RUSRUS,50.515609,36.609686,17469,2017-05-26,,
312623,2.322353,,0.0,0.0,STPETE,RUS,643.0,8909,59.983,30.516,5732,3 NARODNAYA STSTPETE190000 RUSRUS,50.515609,36.609686,17469,2017-04-20,59.912,30.29
956537,2.998182,"ONALNAYA, 26\NIZVRT\628600 RUSRUS",0.0,0.0,NIZVRT,RUS,643.0,4275,60.934,76.613,6011,,0.0,0.0,17469,2017-03-29,,
956538,3.289432,"ONALNAYA, 26\NIZVRT\628600 RUSRUS",0.0,0.0,NIZVRT,RUS,643.0,4275,60.934,76.613,6011,,0.0,0.0,17469,2017-10-18,,
956539,3.789248,"ONALNAYA, 26\NIZVRT\628600 RUSRUS",0.0,0.0,NIZVRT,RUS,643.0,4275,60.934,76.613,6011,,0.0,0.0,17469,2017-08-29,,
956540,3.769091,"ONALNAYA, 26\NIZVRT\628600 RUSRUS",0.0,0.0,NIZVRT,RUS,643.0,4275,60.934,76.613,6011,,0.0,0.0,17469,2017-08-29,,
956541,3.798301,"ONALNAYA, 26\NIZVRT\628600 RUSRUS",0.0,0.0,NIZVRT,RUS,643.0,4275,60.934,76.613,6011,,0.0,0.0,17469,2017-08-29,,
956542,3.773197,"ONALNAYA, 26\NIZVRT\628600 RUSRUS",0.0,0.0,NIZVRT,RUS,643.0,4275,60.934,76.613,6011,,0.0,0.0,17469,2017-08-29,,
956543,3.796158,"ONALNAYA, 26\NIZVRT\628600 RUSRUS",0.0,0.0,NIZVRT,RUS,643.0,4275,60.934,76.613,6011,,0.0,0.0,17469,2017-08-29,,
956544,3.788632,"ONALNAYA, 26\NIZVRT\628600 RUSRUS",0.0,0.0,NIZVRT,RUS,643.0,4275,60.934,76.613,6011,,0.0,0.0,17469,2017-08-29,,


### Когда вообще ни адресов ни координат atm, pos

In [200]:
# Замена адресов atm и pos
train[['atm_address', 'pos_address']] = train[['atm_address', 'pos_address']].fillna(0)

In [202]:
no_atm_pos = train[(train['atm_address_lat'] == 0) & (train['atm_address'] == 0) & (train['pos_address_lat'] == 0) & (train['pos_address'] == 0)]
no_atm_pos.shape[0]

10165

In [208]:
len(no_atm_pos['terminal_id'].unique())

2331

In [209]:
terminal_id_train_yes = []
terminal_id_train_no = []
for i in tqdm(range(len(no_atm_pos['terminal_id'].unique()))):
    if (no_atm_pos[no_atm_pos['terminal_id'] == no_atm_pos['terminal_id'].unique()[i]].shape[0] != train[train['terminal_id'] == no_atm_pos['terminal_id'].unique()[i]].shape[0]):
        terminal_id_train_yes.append(i)
    else:
        terminal_id_train_no.append(i)
print(len(terminal_id))

100%|█████████████████████████████████████████████████████████████████████████████| 2331/2331 [00:09<00:00, 253.12it/s]


1228


In [216]:
train[train['terminal_id'] == no_atm_pos['terminal_id'].unique()[terminal_id_train_yes[1]]]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
943536,4.371419,"A, 141 > Sankt\Sankt-Peterbu\00000 ...",0.0,0.0,Sankt-Peterbu,RUS,643.0,270,60.002,30.196,6011,0,0.0,0.0,897,2017-02-22,60.04,30.004
943537,2.594132,"A, 141 > Sankt\Sankt-Peterbu\00000 ...",0.0,0.0,Sankt-Peterbu,RUS,643.0,2824,60.094,29.97,6011,0,0.0,0.0,897,2017-09-08,,
943538,3.099744,"A, 141 > Sankt\Sankt-Peterbu\00000 ...",0.0,0.0,Sankt-Peterbu,RUS,643.0,4219,59.988,30.281,6011,0,0.0,0.0,897,2017-07-30,,
943539,2.983098,"A, 141 > Sankt\Sankt-Peterbu\00000 ...",0.0,0.0,Sankt-Peterbu,RUS,643.0,1299,59.899,30.446,6011,0,0.0,0.0,897,2017-08-15,,
943540,4.067995,"A, 141 > Sankt\Sankt-Peterbu\00000 ...",0.0,0.0,Sankt-Peterbu,RUS,643.0,8938,59.992,29.786,6011,0,0.0,0.0,897,2017-09-10,60.035,30.002
943541,3.486681,0,0.0,0.0,Sankt-Peterbu,RU,643.0,4315,60.017,30.306,6011,0,0.0,0.0,897,2017-06-14,60.092,29.971
943542,2.697423,"A, 141 > Sankt\Sankt-Peterbu\00000 ...",0.0,0.0,Sankt-Peterbu,RUS,643.0,1299,59.899,30.446,6011,0,0.0,0.0,897,2017-08-07,,
943543,3.690906,"A, 141 > Sankt\Sankt-Peterbu\00000 ...",0.0,0.0,Sankt-Peterbu,RUS,643.0,1299,59.899,30.446,6011,0,0.0,0.0,897,2017-11-15,,
943544,3.386616,"A, 141 > Sankt\Sankt-Peterbu\00000 ...",0.0,0.0,Sankt-Peterbu,RUS,643.0,8571,59.947,30.412,6011,0,0.0,0.0,897,2017-10-19,,
943545,3.28883,"A, 141 > Sankt\Sankt-Peterbu\00000 ...",0.0,0.0,Sankt-Peterbu,RUS,643.0,1299,59.899,30.446,6011,0,0.0,0.0,897,2017-10-11,,


In [215]:
t1 = train[train['terminal_id'] == no_atm_pos['terminal_id'].unique()[terminal_id_train_yes[0]]]
t1['atm_address'].values

array(['OE HIGHWAY\\MOSKVA\\630004    RUSRUS',
       'OE HIGHWAY\\MOSKVA\\630004    RUSRUS', 0], dtype=object)

адреса хер пойми какие, поэтому адреса надо чистить

### Чистка адресов

In [217]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1224705 entries, 0 to 1224733
Data columns (total 18 columns):
amount              1224705 non-null float64
atm_address         1224705 non-null object
atm_address_lat     1224705 non-null float64
atm_address_lon     1224705 non-null float64
city                1224696 non-null object
country             1224705 non-null object
currency            1224661 non-null float64
customer_id         1224705 non-null int64
home_add_lat        1224689 non-null float64
home_add_lon        1224689 non-null float64
mcc                 1224705 non-null int64
pos_address         1224705 non-null object
pos_address_lat     1224705 non-null float64
pos_address_lon     1224705 non-null float64
terminal_id         1224705 non-null int64
transaction_date    1224661 non-null object
work_add_lat        664659 non-null float64
work_add_lon        664659 non-null float64
dtypes: float64(10), int64(3), object(5)
memory usage: 177.5+ MB


In [225]:
atm_address = train['atm_address'].unique()
len(atm_address)

17686

In [227]:
atm_address[:10]

array([0, 'R\\MINERALNYE VO\\357202    RUSRUS',
       'YA STR  3\\MOSCOW\\109341    RUSRUS',
       'Москва, ул. Таганская, д. 17-23', 'OKSARY\\             RUS',
       '8-LUBYANKA\\MOSCOW\\123456    RUSRUS',
       'Екатеринбург, ул. Металлургов, д. 87',
       'YA ST.\\KHIMKI\\141400    RUSRUS', 'EREPOVETS\\162608    RUSRUS',
       'IAN KUTA VILL\\RENON\\UNKNOWN   IDNIDN'], dtype=object)

In [None]:
train.to_csv

In [229]:
pd.DataFrame(atm_address).to_csv('atm_address.csv')

In [5]:
atm_address_count = train['atm_address'].value_counts()

In [8]:
atm_address_count.to_csv('atm_address_count.csv')

In [None]:
train[train['atm_address'] == '']

In [32]:
train['atm_address'][train['atm_address'].str.contains('Москва') == True][:5]

942660    Москва, ул. Таганская, д. 17-23
942661    Москва, ул. Таганская, д. 17-23
942662    Москва, ул. Таганская, д. 17-23
942663    Москва, ул. Таганская, д. 17-23
942664    Москва, ул. Таганская, д. 17-23
Name: atm_address, dtype: object

In [14]:
city = train['city'][:20]
city

0     ST PETERSBURG
1     ST PETERSBURG
2     St Petersburg
3     ST PETERSBURG
4     ST PETERSBURG
5     ST PETERSBURG
6     ST PETERSBURG
7     ST PETERSBURG
8     ST PETERSBURG
9     ST PETERSBURG
10    SANKT-PETERBU
11      NOVOSIBIRSK
12      NOVOSIBIRSK
13      NOVOSIBIRSK
14      NOVOSIBIRSK
15      NOVOSIBIRSK
16      NOVOSIBIRSK
17      NOVOSIBIRSK
18      NOVOSIBIRSK
19      NOVOSIBIRSK
Name: city, dtype: object

In [23]:
st = city.str.contains('ST')
st

0      True
1      True
2     False
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
Name: city, dtype: bool

In [27]:
st[st == True].index

Int64Index([0, 1, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [28]:
city[city.str.contains('ST') == True]

0    ST PETERSBURG
1    ST PETERSBURG
3    ST PETERSBURG
4    ST PETERSBURG
5    ST PETERSBURG
6    ST PETERSBURG
7    ST PETERSBURG
8    ST PETERSBURG
9    ST PETERSBURG
Name: city, dtype: object

In [40]:
# непонятный адрес "RADNAYA\RUS,445" есть только у 5912 и 1498 клиентов
train['customer_id'][train['atm_address'].str.contains('RADNAYA') == True].unique()

array([5912, 1498], dtype=int64)

In [44]:
c5912_RADNAYA = train[(train['customer_id'] == 5912) & (train['atm_address'].str.contains('RADNAYA') == True)]
c5912_RADNAYA

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
1038989,4.605478,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-10-09,44.369,41.527
1038990,4.595060,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-10-09,44.369,41.527
1038991,4.310938,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-06-19,44.369,41.527
1038992,4.302670,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-06-19,44.369,41.527
1038993,4.286561,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-06-19,44.369,41.527
1038994,4.597065,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-10-09,44.369,41.527
1165574,4.463042,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,117773,2017-06-17,44.369,41.527
1165575,4.489924,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,117773,2017-08-22,44.369,41.527
1165576,4.600828,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,117773,2017-05-03,44.369,41.527
1165577,4.607250,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,117773,2017-08-16,44.369,41.527


посмотрим можно ли определить адрес по terminal_id

In [45]:
# уникальные терминалы у клиента 5912 с адресом RADNAYA
c5912_RADNAYA['terminal_id'].unique()

array([144922, 117773], dtype=int64)

In [54]:
# терминал 144922 только у этого клиента
print(train['customer_id'][train['terminal_id'] == 144922].value_counts())
train[train['terminal_id'] == 144922]

5912    6
Name: customer_id, dtype: int64


Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
1038989,4.605478,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-10-09,44.369,41.527
1038990,4.59506,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-10-09,44.369,41.527
1038991,4.310938,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-06-19,44.369,41.527
1038992,4.30267,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-06-19,44.369,41.527
1038993,4.286561,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-06-19,44.369,41.527
1038994,4.597065,RADNAYA\ RUS,,,OTRADNAYA,RUS,643.0,5912,44.369,41.527,6011,,,,144922,2017-10-09,44.369,41.527


In [55]:
# терминал 117773 тоже только у этого клиента
train['customer_id'][train['terminal_id'] == 117773].value_counts()

5912    439
Name: customer_id, dtype: int64

In [127]:
train1 = train

In [86]:
train1[train1['atm_address'] == atm_address_count.index[45]].index

Int64Index([1038989, 1038990, 1038991, 1038992, 1038993, 1038994, 1165574,
            1165575, 1165576, 1165577,
            ...
            1166003, 1166004, 1166005, 1166006, 1166007, 1166008, 1166009,
            1166010, 1166011, 1166012],
           dtype='int64', length=445)

In [92]:
train1.drop(train1[train1['atm_address'] == atm_address_count.index[45]].index, axis=0, inplace=True)

In [69]:
atm_address_count.index[45]

'RADNAYA\\             RUS'

In [72]:
print(atm_address_count.index[43:48])

Index(['Москва, ул. Сергия Радонежского, д. 4',
       'Москва, ул. 1-я Тверская-Ямская, д. 15', 'RADNAYA\             RUS',
       'Подольск, ул. Полевановская, д. 9, (Заезд с ул. Орджоникидзе д. 25)',
       'Анапа, ул. Шевченко, д. 73Б'],
      dtype='object')


In [93]:
atm_address_count1 = train1['atm_address'].value_counts()
atm_address_count1.index[43:48]

Index(['Москва, ул. Сергия Радонежского, д. 4',
       'Москва, ул. 1-я Тверская-Ямская, д. 15',
       'Подольск, ул. Полевановская, д. 9, (Заезд с ул. Орджоникидзе д. 25)',
       'Анапа, ул. Шевченко, д. 73Б', 'Москва, ш. Симферопольское, д. 3'],
      dtype='object')

In [95]:
atm_address_count.index[46]

'Подольск, ул. Полевановская, д. 9, (Заезд с ул. Орджоникидзе д. 25)'

In [96]:
train['customer_id'][train['atm_address'] == atm_address_count.index[46]].unique()

array([1177, 8625, 7463,  116, 7998, 5088, 1035, 8376, 2234, 6301, 8499,
       1632,  281, 5505, 2520, 5195, 2690, 6784, 4403, 7899, 4141, 9656,
       2453, 5711, 8387, 3212, 9290, 3269, 6152, 3321, 3238, 8279, 4051], dtype=int64)

In [101]:
train1['atm_address'][train1['atm_address'] == atm_address_count.index[46]].value = 'Подольск, ул. Полевановская, д. 9'

In [108]:
train1['atm_address'].fillna(0, inplace=True)

In [110]:
train1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1224260 entries, 0 to 1224733
Data columns (total 18 columns):
amount              1224260 non-null float64
atm_address         1224260 non-null object
atm_address_lat     184680 non-null float64
atm_address_lon     184680 non-null float64
city                1224251 non-null object
country             1224260 non-null object
currency            1224216 non-null float64
customer_id         1224260 non-null int64
home_add_lat        1224244 non-null float64
home_add_lon        1224244 non-null float64
mcc                 1224260 non-null int64
pos_address         924991 non-null object
pos_address_lat     942614 non-null float64
pos_address_lon     942614 non-null float64
terminal_id         1224260 non-null int64
transaction_date    1224216 non-null object
work_add_lat        664214 non-null float64
work_add_lon        664214 non-null float64
dtypes: float64(10), int64(3), object(5)
memory usage: 217.5+ MB


In [128]:
train1[train1['atm_address'] == atm_address_count.index[46]]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
1173134,2.987312,"Подольск, ул. Полевановская, д. 9, (Заезд с ул...",,,PODOLSK,RUS,643.0,1177,55.398,37.812,6011,,,,129694,2017-03-02,55.425,37.576
1173135,4.006400,"Подольск, ул. Полевановская, д. 9, (Заезд с ул...",,,PODOLSK,RUS,643.0,1177,55.398,37.812,6011,,,,129694,2017-11-24,55.425,37.576
1173136,3.606935,"Подольск, ул. Полевановская, д. 9, (Заезд с ул...",,,PODOLSK,RUS,643.0,1177,55.398,37.812,6011,,,,129694,2017-03-24,55.425,37.576
1173137,3.014655,"Подольск, ул. Полевановская, д. 9, (Заезд с ул...",,,PODOLSK,RUS,643.0,1177,55.398,37.812,6011,,,,129694,2017-02-14,55.425,37.576
1173138,3.854688,"Подольск, ул. Полевановская, д. 9, (Заезд с ул...",,,PODOLSK,RUS,643.0,1177,55.398,37.812,6011,,,,129694,2017-06-23,55.425,37.576
1173139,3.463546,"Подольск, ул. Полевановская, д. 9, (Заезд с ул...",,,PODOLSK,RUS,643.0,1177,55.398,37.812,6011,,,,129694,2017-02-14,55.425,37.576
1173140,2.998107,"Подольск, ул. Полевановская, д. 9, (Заезд с ул...",,,PODOLSK,RUS,643.0,1177,55.398,37.812,6011,,,,129694,2017-07-21,55.425,37.576
1173141,4.492525,"Подольск, ул. Полевановская, д. 9, (Заезд с ул...",,,PODOLSK,RUS,643.0,1177,55.398,37.812,6011,,,,129694,2017-08-23,55.425,37.576
1173142,2.690201,"Подольск, ул. Полевановская, д. 9, (Заезд с ул...",,,PODOLSK,RUS,643.0,8625,55.420,37.522,6011,,,,129694,2017-02-23,55.425,37.576
1173143,2.694462,"Подольск, ул. Полевановская, д. 9, (Заезд с ул...",,,PODOLSK,RUS,643.0,8625,55.420,37.522,6011,,,,129694,2017-06-25,55.425,37.576


In [118]:
atm_address_count.index[46]

'Подольск, ул. Полевановская, д. 9, (Заезд с ул. Орджоникидзе д. 25)'

In [131]:
train1['atm_address'].replace(to_replace=atm_address_count.index[46], value='Подольск, ул. Полевановская, д. 9', inplace=True)

In [132]:
train1[train1['atm_address'] == atm_address_count.index[46]]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon


In [133]:
atm_address_count.index[146]

'>  Moskv\\Moskva\\00000     RUSRUS'

In [178]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1224705 entries, 0 to 1224733
Data columns (total 18 columns):
amount              1224705 non-null float64
atm_address         1224705 non-null object
atm_address_lat     184680 non-null float64
atm_address_lon     184680 non-null float64
city                1224696 non-null object
country             1224705 non-null object
currency            1224661 non-null float64
customer_id         1224705 non-null int64
home_add_lat        1224689 non-null float64
home_add_lon        1224689 non-null float64
mcc                 1224705 non-null int64
pos_address         924991 non-null object
pos_address_lat     942614 non-null float64
pos_address_lon     942614 non-null float64
terminal_id         1224705 non-null int64
transaction_date    1224661 non-null object
work_add_lat        664659 non-null float64
work_add_lon        664659 non-null float64
dtypes: float64(10), int64(3), object(5)
memory usage: 217.5+ MB


In [192]:
string = 'dfgd\dgd'

In [198]:
'\\' in string

True

In [206]:
ser = pd.Series(data = ['asd', 'as\df', 'wew\\\ewr'])

In [207]:
ser

0         asd
1       as\df
2    wew\\ewr
dtype: object

In [218]:
ser.str.contains('\\\\')

0    False
1     True
2     True
dtype: bool

In [223]:
train[train['atm_address'].str.contains("\\\\") == True]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
942658,3.541696,R\MINERALNYE VO\357202 RUSRUS,,,MINERALNYE VO,RUS,643.0,6544,55.598,37.045,6011,,,,0,2017-03-05,55.713,37.722
942659,3.304900,YA STR 3\MOSCOW\109341 RUSRUS,,,MOSCOW,RUS,643.0,2821,55.600,38.116,6011,,,,1,2017-02-18,55.609,38.074
942731,2.678472,OKSARY\ RUS,,,CHEBOKSARY,RUS,643.0,9492,56.134,47.255,6011,,,,21,2017-04-11,56.129,47.259
942732,3.321838,8-LUBYANKA\MOSCOW\123456 RUSRUS,,,MOSCOW,RUS,643.0,7677,55.800,37.687,6011,,,,26,2017-09-19,,
942807,2.306179,YA ST.\KHIMKI\141400 RUSRUS,,,KHIMKI,RUS,643.0,472,55.853,37.475,6011,,,,136,2017-11-21,,
942808,4.191508,YA ST.\KHIMKI\141400 RUSRUS,,,KHIMKI,RUS,643.0,2351,55.956,37.301,6011,,,,136,2017-12-05,55.785,37.391
942809,3.692520,YA ST.\KHIMKI\141400 RUSRUS,,,KHIMKI,RUS,643.0,4066,55.883,37.431,6011,,,,136,2017-06-19,55.796,37.577
942810,2.689679,EREPOVETS\162608 RUSRUS,,,CHEREPOVETS,RUS,643.0,6188,59.103,37.935,6011,,,,152,2017-10-18,,
942811,3.337487,IAN KUTA VILL\RENON\UNKNOWN IDNIDN,,,RENON,IDN,360.0,2162,55.753,37.633,6011,,,,181,2017-02-15,55.795,37.528
942812,3.742778,"RIDIEN,\RENON\UNKNOWN IDNIDN",,,RENON,IDN,360.0,5942,55.843,37.362,6011,,,,181,2017-09-04,,


In [226]:
train['atm_address'][train['atm_address'].str.contains("\\\\") == True].value_counts()[:5]

RADNAYA\             RUS                                   445
, ul. Marosejka, d. 3/13\Moskva\107045    45 RUS           403
>  Moskv\Moskva\00000     RUSRUS                           297
, 13        >  Krons\Kronshtadt\00000     RUSRUS           272
ITER B 13-YA V.O. LINIYA\SANKT-PETERBU\199034    RUSRUS    245
Name: atm_address, dtype: int64

In [228]:
len(train['atm_address'][train['atm_address'].str.contains("\\\\") == True].unique())

16009

In [271]:
backslash_atm_address_count = train['atm_address'][train['atm_address'].str.contains("\\\\") == True].value_counts()

In [272]:
backslash_atm_address_count.to_csv('backslash_atm_address_count.csv')

In [275]:
backslash_atm_address_count.index[12]

'VOMYTISHENSKY PROSPEKT 34/2 OTDELENYE UNICREDIT\\MOSCOW\\0722         RUS'

In [232]:
train['atm_address'][train['atm_address'].str.contains("0000") == True].value_counts()[:5]

>  Moskv\Moskva\00000     RUSRUS                             297
, 13        >  Krons\Kronshtadt\00000     RUSRUS             272
>  Krasn\Krasnoyarsk\00000     RUSRUS                        219
LOVSKIY PR,11.32  >  Sankt\Sankt-Peterbu\00000     RUSRUS    155
A, 32,34      >  Podol\Podolsk\00000     RUSRUS              121
Name: atm_address, dtype: int64

In [270]:
train['atm_address'][train['atm_address'].str.contains("RUSRUS") == True].value_counts()[:5]

>  Moskv\Moskva\00000     RUSRUS                             297
, 13        >  Krons\Kronshtadt\00000     RUSRUS             272
ITER B 13-YA V.O. LINIYA\SANKT-PETERBU\199034    RUSRUS      245
>  Krasn\Krasnoyarsk\00000     RUSRUS                        219
LOVSKIY PR,11.32  >  Sankt\Sankt-Peterbu\00000     RUSRUS    155
Name: atm_address, dtype: int64

In [239]:
wrong_atm_address_galka = train['atm_address'][train['atm_address'].str.contains(">") == True].value_counts()

In [240]:
wrong_atm_address_galka.to_csv('wrong_atm_address_galka.csv')

In [255]:
wrong_atm_address_galka.index[1]

'>  Moskv\\Moskva\\00000     RUSRUS'

In [261]:
train['terminal_id'][(train['atm_address'] == wrong_atm_address_galka.index[0])].unique()

array([ 26952,  34649,  49344,  74366, 153404, 179763, 186385, 187732,
        27409,  38207,  47594,  55945,  61264, 115069, 145240, 150054,
       165192, 176380], dtype=int64)

In [268]:
train[(train['terminal_id'] == 176380) & (train['pos_address_lat'] != 0)]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon


In [269]:
train[train['atm_address'] == ', ul. Marosejka, d. 3/13\Moskva\107045    45 RUS,403']

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon


In [291]:
train1 = train

In [294]:
train1.drop(train1[train1['atm_address'].str.contains("\\\\") == True].index, inplace=True)

In [295]:
train1['atm_address'][train1['atm_address'].str.contains("\\\\") == True]

Series([], Name: atm_address, dtype: object)

#### pos_address

In [303]:
train['atm_address_lat']

0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
5             NaN
6             NaN
7             NaN
8             NaN
9             NaN
10            NaN
11            NaN
12            NaN
13            NaN
14            NaN
15            NaN
16            NaN
17            NaN
18            NaN
19            NaN
20            NaN
21            NaN
22            NaN
23            NaN
24            NaN
25            NaN
26            NaN
27            NaN
28            NaN
29            NaN
            ...  
1224704    51.556
1224705    51.561
1224706    51.559
1224707    51.560
1224708    51.557
1224709    51.557
1224710    51.560
1224711    51.560
1224712    51.556
1224713    51.555
1224714       NaN
1224715       NaN
1224716       NaN
1224717       NaN
1224718       NaN
1224719       NaN
1224720       NaN
1224721       NaN
1224722       NaN
1224723       NaN
1224724       NaN
1224725       NaN
1224726       NaN
1224727       NaN
1224728   

In [308]:
train.iloc[1000000]

amount                                          3.06293
atm_address         Санкт-Петербург, ул. Ефимова, д. 4А
atm_address_lat                                  59.922
atm_address_lon                                  30.322
city                                      ST PETERSBURG
country                                             RUS
currency                                            643
customer_id                                        5486
home_add_lat                                     59.962
home_add_lon                                     30.286
mcc                                                6011
pos_address                                         NaN
pos_address_lat                                     NaN
pos_address_lon                                     NaN
terminal_id                                      109579
transaction_date                             2017-11-15
work_add_lat                                     59.999
work_add_lon                                    

In [313]:
train['atm_address'][train['customer_id'] == 5486].unique()

array([nan, 'Санкт-Петербург, пр. Комендантский, д. 13, корп.1',
       'Санкт-Петербург, ул. Ефимова, д. 4А',
       'Санкт-Петербург, пр. Комендантский, д. 3',
       'Санкт-Петербург, пр. Каменоостровский, д. 13'], dtype=object)

In [314]:
train[train['atm_address'] == 'Санкт-Петербург, пр. Комендантский, д. 13, корп.1']

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
997303,3.186045,"Санкт-Петербург, пр. Комендантский, д. 13, корп.1",60.006,30.260,ST-PETERSBURG,RUS,643.0,5931,60.039,30.301,6011,,,,72272,2017-10-26,59.954,30.288
997304,4.283389,"Санкт-Петербург, пр. Комендантский, д. 13, корп.1",60.011,30.256,ST-PETERSBURG,RUS,643.0,5931,60.039,30.301,6011,,,,72272,2017-08-15,59.954,30.288
997305,4.864826,"Санкт-Петербург, пр. Комендантский, д. 13, корп.1",60.014,30.261,ST-PETERSBURG,RUS,643.0,5323,60.045,30.319,6011,,,,72272,2017-05-01,59.994,30.281
997306,5.155668,"Санкт-Петербург, пр. Комендантский, д. 13, корп.1",60.014,30.254,ST-PETERSBURG,RUS,643.0,5323,60.045,30.319,6011,,,,72272,2017-04-30,59.994,30.281
997307,3.758354,"Санкт-Петербург, пр. Комендантский, д. 13, корп.1",60.008,30.257,ST-PETERSBURG,RUS,643.0,7220,60.015,30.265,6011,,,,72272,2017-06-26,,
997308,3.680902,"Санкт-Петербург, пр. Комендантский, д. 13, корп.1",60.008,30.258,ST-PETERSBURG,RUS,643.0,1681,55.748,37.605,6011,,,,72272,2017-04-20,55.655,37.539
997309,4.187703,"Санкт-Петербург, пр. Комендантский, д. 13, корп.1",60.012,30.260,ST-PETERSBURG,RUS,643.0,1681,55.748,37.605,6011,,,,72272,2017-05-04,55.655,37.539
997310,3.596002,"Санкт-Петербург, пр. Комендантский, д. 13, корп.1",60.011,30.262,ST-PETERSBURG,RUS,643.0,7118,60.008,30.223,6011,,,,72272,2017-09-11,60.008,30.223
997311,5.196073,"Санкт-Петербург, пр. Комендантский, д. 13, корп.1",60.009,30.262,ST-PETERSBURG,RUS,643.0,7118,60.008,30.223,6011,,,,72272,2017-09-01,60.008,30.223
997312,3.912952,"Санкт-Петербург, пр. Комендантский, д. 13, корп.1",60.007,30.260,ST-PETERSBURG,RUS,643.0,1681,55.748,37.605,6011,,,,72272,2017-07-24,55.655,37.539


In [315]:
train['pos_address'][train['customer_id'] == 5486].unique()

array(['13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000    RUSRUS',
       '13 LEVASHOVSKIY PR-TSANKT-PETERBU197110    RUSRUS',
       '3A, KOMENDANTSKIY PR.SANKT-PETERBU197227    RUSRUS',
       'LEVASHOVSKIY PR., D. 21, ASANKT-PETERBU190000    RUSRUS',
       '14-18 B.ZELENINA STRSANKT-PETERBU190000    RUSRUS',
       '34 GAKKELEVSKAYA STRSANKT-PETERBU190000    RUSRUS',
       '15 CHKALOVSKIY PR-T BLD 13,SANKT-PETERBU190000    RUSRUS',
       '2A VOLODARSKOGO STRSESTRORETSK197706    RUSRUS',
       '16,B.ZELENINAS.PETERBURG197101    RUSRUS',
       '16 B.ZELENINA STRSANKT-PETERBU190000    RUSRUS',
       '7 A GEROYA SOVETSKOGO SOYUZAN.NOVGOROD603000    RUSRUS',
       '25 A MICHURINA STRLYSKOVO606210    RUSRUS',
       '1 KAZANSKAYA STR BLD G POM.PLYSKOVO606210    RUSRUS',
       '(197110), Sankt-Peterburg g, Bolshaya ZSpb197110    40 RUS',
       '1 KAZANSKAYA STRLYSKOVO606210    RUSRUS',
       '42 KAMENNOOSTROVSKIY PR-T BLSANKT-PETERBU197022    RUSRUS',
       '13 BOLSHAYA ZELENINA ST

In [316]:
train[train['pos_address'] == '13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000    RUSRUS']

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
25879,2.794183,,,,SANKT-PETERBU,RUS,643.0,4361,55.717,37.615,5411,13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000 ...,59.966687,30.287019,128427,2017-07-07,55.768,37.591
50970,3.298309,,,,SANKT-PETERBU,RUS,643.0,4536,59.969,30.310,5411,13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000 ...,59.966687,30.287019,93759,2017-09-03,59.937,30.323
88547,2.271136,,,,SANKT-PETERBU,RUS,643.0,8424,60.101,30.223,5411,13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000 ...,59.966687,30.287019,116436,2017-10-10,60.220,30.276
103348,2.592405,,,,SANKT-PETERBU,RUS,643.0,3522,59.966,30.284,5411,13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000 ...,59.966687,30.287019,93759,2017-05-22,59.941,30.300
150063,1.834357,,,,SANKT-PETERBU,RUS,643.0,3658,59.950,30.365,5411,13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000 ...,59.966687,30.287019,155795,2017-07-15,,
163546,2.037888,,,,SANKT-PETERBU,RUS,643.0,1016,52.717,56.621,5411,13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000 ...,59.966687,30.287019,157748,2017-06-13,,
163547,2.115034,,,,SANKT-PETERBU,RUS,643.0,1016,52.717,56.621,5411,13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000 ...,59.966687,30.287019,157748,2017-06-09,,
167126,3.257722,,,,SANKT-PETERBU,RUS,643.0,80,60.139,30.218,5411,13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000 ...,59.966687,30.287019,157748,2017-10-22,,
284588,1.617268,,,,SANKT-PETERBU,RUS,643.0,8571,59.947,30.412,5411,13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000 ...,59.966687,30.287019,128427,2017-08-09,,
284594,1.998302,,,,SANKT-PETERBU,RUS,643.0,8571,59.947,30.412,5411,13 LEVASHOVSKIY PR-T BLDSANKT-PETERBU190000 ...,59.966687,30.287019,128427,2017-08-18,,


In [319]:
train[(train['atm_address_lat'] != 0) & (train['pos_address_lat'] != 0)]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon


#### проверить, что координаты pos нормальные

In [8]:
pos_address = train['pos_address'].unique()
len(pos_address)

88958

In [25]:
count = 0
position = []
k = 0
for address in tqdm(pos_address):
#for address in pos_address:
    if address == 0: continue
    k += 1
    pos1 = train[train['pos_address']==address]  
    lat0 = pos1['pos_address_lat'].values[0]
    lon0 = pos1['pos_address_lon'].values[0]
    for i in range(pos1.shape[0]):
        if (lat0 != pos1['pos_address_lat'].values[i]) | (lon0 != pos1['pos_address_lon'].values[i]):
            count += 1
            position.append(address)
            with open('pos_address.txt', 'w') as file:
                for item in position:
                    file.write("%s\n" % item)
            break
    if (k%1000 == 0): print(k,' ',count)
print('count = ',count)


  0%|                                                                                        | 0/88958 [00:00<?, ?it/s]
  0%|                                                                              | 2/88958 [00:00<1:25:41, 17.30it/s]
  0%|                                                                              | 3/88958 [00:00<1:51:31, 13.29it/s]
  0%|                                                                              | 4/88958 [00:00<2:02:13, 12.13it/s]
  0%|                                                                              | 5/88958 [00:00<2:11:54, 11.24it/s]
  0%|                                                                              | 6/88958 [00:00<2:21:04, 10.51it/s]
  0%|                                                                              | 7/88958 [00:00<2:27:43, 10.04it/s]
  0%|                                                                              | 8/88958 [00:00<2:29:56,  9.89it/s]
  0%|                                  

1000   0


  2%|█▋                                                                         | 2000/88958 [03:17<2:23:03, 10.13it/s]

2000   0


  3%|██▌                                                                        | 2999/88958 [04:59<2:23:09, 10.01it/s]

3000   0


  4%|███▎                                                                       | 3999/88958 [06:25<2:16:25, 10.38it/s]

4000   1


  6%|████▏                                                                      | 5000/88958 [08:12<2:17:46, 10.16it/s]

5000   1


  7%|█████                                                                      | 5999/88958 [10:17<2:22:20,  9.71it/s]

6000   1


  8%|█████▉                                                                     | 7000/88958 [12:07<2:21:57,  9.62it/s]

7000   3


  9%|██████▋                                                                    | 8000/88958 [14:14<2:24:11,  9.36it/s]

8000   3


 10%|███████▌                                                                   | 8999/88958 [15:50<2:20:46,  9.47it/s]

9000   3


 11%|████████▎                                                                 | 10000/88958 [17:46<2:20:18,  9.38it/s]

10000   4


 12%|█████████▏                                                                | 10999/88958 [19:35<2:18:49,  9.36it/s]

11000   4


 13%|█████████▉                                                                | 11999/88958 [21:03<2:15:01,  9.50it/s]

12000   4


 15%|██████████▊                                                               | 13000/88958 [22:45<2:12:55,  9.52it/s]

13000   4


 16%|███████████▋                                                              | 14000/88958 [24:29<2:11:07,  9.53it/s]

14000   4


 17%|████████████▍                                                             | 15000/88958 [26:25<2:10:16,  9.46it/s]

15000   4


 18%|█████████████▎                                                            | 16000/88958 [28:21<2:09:18,  9.40it/s]

16000   4


 19%|██████████████▏                                                           | 16999/88958 [30:17<2:08:12,  9.35it/s]

17000   5


 20%|██████████████▉                                                           | 18000/88958 [31:52<2:05:39,  9.41it/s]

18000   5


 21%|███████████████▊                                                          | 19000/88958 [34:33<2:07:16,  9.16it/s]

19000   5


 22%|████████████████▋                                                         | 19999/88958 [36:18<2:05:12,  9.18it/s]

20000   6


 24%|█████████████████▍                                                        | 20999/88958 [37:38<2:01:49,  9.30it/s]

21000   6


 25%|██████████████████▎                                                       | 21999/88958 [39:00<1:58:43,  9.40it/s]

22000   6


 26%|███████████████████▏                                                      | 22999/88958 [40:21<1:55:43,  9.50it/s]

23000   6


 27%|███████████████████▉                                                      | 23999/88958 [41:40<1:52:49,  9.60it/s]

24000   6


 28%|████████████████████▊                                                     | 24999/88958 [43:00<1:50:03,  9.69it/s]

25000   6


 29%|█████████████████████▋                                                    | 25999/88958 [44:20<1:47:22,  9.77it/s]

26000   6


 30%|██████████████████████▍                                                   | 26999/88958 [45:46<1:45:03,  9.83it/s]

27000   6


 31%|███████████████████████▎                                                  | 28000/88958 [47:20<1:43:04,  9.86it/s]

28000   6


 33%|████████████████████████                                                  | 28999/88958 [48:58<1:41:15,  9.87it/s]

29000   6


 34%|████████████████████████▉                                                 | 30000/88958 [50:25<1:39:06,  9.92it/s]

30000   6


 35%|█████████████████████████▊                                                | 30999/88958 [51:46<1:36:49,  9.98it/s]

31000   6


 36%|██████████████████████████▌                                               | 32000/88958 [53:17<1:34:51, 10.01it/s]

32000   6


 37%|███████████████████████████▍                                              | 32999/88958 [54:47<1:32:55, 10.04it/s]

33000   6


 38%|████████████████████████████▎                                             | 34000/88958 [56:39<1:31:35, 10.00it/s]

34000   6


 39%|█████████████████████████████                                             | 34999/88958 [58:24<1:30:02,  9.99it/s]

35000   6


 40%|█████████████████████████████▉                                            | 35999/88958 [59:47<1:27:57, 10.04it/s]

36000   6


 42%|█████████████████████████████▉                                          | 36999/88958 [1:01:15<1:26:01, 10.07it/s]

37000   7


 43%|██████████████████████████████▊                                         | 38000/88958 [1:02:37<1:23:59, 10.11it/s]

38000   7


 44%|███████████████████████████████▌                                        | 39000/88958 [1:04:06<1:22:07, 10.14it/s]

39000   8


 45%|████████████████████████████████▎                                       | 39999/88958 [1:05:35<1:20:17, 10.16it/s]

40000   9


 46%|█████████████████████████████████▏                                      | 40999/88958 [1:07:05<1:18:29, 10.18it/s]

41000   9


 47%|█████████████████████████████████▉                                      | 42000/88958 [1:08:32<1:16:38, 10.21it/s]

42000   9


 48%|██████████████████████████████████▊                                     | 42999/88958 [1:09:53<1:14:42, 10.25it/s]

43000   9


 49%|███████████████████████████████████▌                                    | 44000/88958 [1:11:18<1:12:51, 10.29it/s]

44000   9


 51%|████████████████████████████████████▍                                   | 45000/88958 [1:12:44<1:11:03, 10.31it/s]

45000   9


 52%|█████████████████████████████████████▏                                  | 46000/88958 [1:14:10<1:09:16, 10.34it/s]

46000   9


 53%|██████████████████████████████████████                                  | 47000/88958 [1:15:34<1:07:27, 10.37it/s]

47000   9


 54%|██████████████████████████████████████▊                                 | 48000/88958 [1:17:00<1:05:42, 10.39it/s]

48000   9


 55%|███████████████████████████████████████▋                                | 48999/88958 [1:18:31<1:04:01, 10.40it/s]

49000   9


 56%|████████████████████████████████████████▍                               | 50000/88958 [1:20:01<1:02:21, 10.41it/s]

50000   9


 57%|█████████████████████████████████████████▎                              | 51000/88958 [1:21:35<1:00:43, 10.42it/s]

51000   9


 58%|███████████████████████████████████████████▎                              | 52000/88958 [1:23:02<59:01, 10.44it/s]

52000   10


 60%|████████████████████████████████████████████                              | 53000/88958 [1:24:43<57:28, 10.43it/s]

53000   10


 61%|████████████████████████████████████████████▉                             | 53999/88958 [1:26:09<55:46, 10.45it/s]

54000   10


 62%|█████████████████████████████████████████████▊                            | 55000/88958 [1:27:34<54:04, 10.47it/s]

55000   11


 63%|██████████████████████████████████████████████▌                           | 56000/88958 [1:29:02<52:24, 10.48it/s]

56000   11


 64%|███████████████████████████████████████████████▍                          | 56999/88958 [1:30:34<50:46, 10.49it/s]

57000   11


 65%|████████████████████████████████████████████████▏                         | 57999/88958 [1:32:12<49:12, 10.48it/s]

58000   11


 66%|█████████████████████████████████████████████████                         | 59000/88958 [1:33:40<47:33, 10.50it/s]

59000   11


 67%|█████████████████████████████████████████████████▉                        | 59999/88958 [1:35:12<45:57, 10.50it/s]

60000   11


 69%|██████████████████████████████████████████████████▋                       | 60999/88958 [1:36:35<44:16, 10.52it/s]

61000   11


 70%|███████████████████████████████████████████████████▌                      | 62000/88958 [1:38:08<42:40, 10.53it/s]

62000   11


 71%|████████████████████████████████████████████████████▍                     | 63000/88958 [1:39:34<41:01, 10.54it/s]

63000   11


 72%|█████████████████████████████████████████████████████▏                    | 63999/88958 [1:41:00<39:23, 10.56it/s]

64000   11


 73%|██████████████████████████████████████████████████████                    | 65000/88958 [1:42:27<37:45, 10.57it/s]

65000   11


 74%|██████████████████████████████████████████████████████▉                   | 66000/88958 [1:44:06<36:12, 10.57it/s]

66000   11


 75%|███████████████████████████████████████████████████████▋                  | 67000/88958 [1:45:41<34:38, 10.57it/s]

67000   12


 76%|████████████████████████████████████████████████████████▌                 | 67999/88958 [1:47:07<33:01, 10.58it/s]

68000   12


 78%|█████████████████████████████████████████████████████████▍                | 68999/88958 [1:48:33<31:24, 10.59it/s]

69000   12


 79%|██████████████████████████████████████████████████████████▏               | 69999/88958 [1:50:07<29:49, 10.59it/s]

70000   13


 80%|███████████████████████████████████████████████████████████               | 71000/88958 [1:51:34<28:13, 10.61it/s]

71000   13


 81%|███████████████████████████████████████████████████████████▉              | 71999/88958 [1:53:15<26:40, 10.60it/s]

72000   13


 82%|████████████████████████████████████████████████████████████▋             | 72999/88958 [1:54:47<25:05, 10.60it/s]

73000   13


 83%|█████████████████████████████████████████████████████████████▌            | 74000/88958 [1:56:15<23:30, 10.61it/s]

74000   13


 84%|██████████████████████████████████████████████████████████████▍           | 74999/88958 [1:58:24<22:02, 10.56it/s]

75000   13


 85%|███████████████████████████████████████████████████████████████▏          | 75999/88958 [1:59:54<20:26, 10.56it/s]

76000   13


 87%|████████████████████████████████████████████████████████████████          | 77000/88958 [2:01:33<18:52, 10.56it/s]

77000   13


 88%|████████████████████████████████████████████████████████████████▉         | 78000/88958 [2:03:08<17:17, 10.56it/s]

78000   13


 89%|█████████████████████████████████████████████████████████████████▋        | 79000/88958 [2:04:42<15:43, 10.56it/s]

79000   13


 90%|██████████████████████████████████████████████████████████████████▌       | 79999/88958 [2:06:45<14:11, 10.52it/s]

80000   13


 91%|███████████████████████████████████████████████████████████████████▍      | 81000/88958 [2:08:09<12:35, 10.53it/s]

81000   13


 92%|████████████████████████████████████████████████████████████████████▏     | 82000/88958 [2:09:37<10:59, 10.54it/s]

82000   13


 93%|█████████████████████████████████████████████████████████████████████     | 82999/88958 [2:11:04<09:24, 10.55it/s]

83000   13


 94%|█████████████████████████████████████████████████████████████████████▊    | 83999/88958 [2:12:26<07:49, 10.57it/s]

84000   13


 96%|██████████████████████████████████████████████████████████████████████▋   | 84999/88958 [2:13:46<06:13, 10.59it/s]

85000   13


 97%|███████████████████████████████████████████████████████████████████████▌  | 85999/88958 [2:15:05<04:38, 10.61it/s]

86000   13


 98%|████████████████████████████████████████████████████████████████████████▎ | 86999/88958 [2:16:24<03:04, 10.63it/s]

87000   13


 99%|█████████████████████████████████████████████████████████████████████████▏| 87999/88958 [2:17:43<01:30, 10.65it/s]

88000   13


100%|██████████████████████████████████████████████████████████████████████████| 88958/88958 [2:19:00<00:00, 10.67it/s]


count =  13


In [26]:
position

['B. KAMENSCHIKI, 4MOSCOW123456    RUSRUS',
 'BLAGOVESCHENSKAYAVOLOGDA160001    RUSRUS',
 'SUSCHEVSKIJ VAL 15MOSKVA127018    RUSRUS',
 ', KOMSOMOLSKAYA SQ.MOSCOW107140    RUSRUS"',
 '5-1A, SUSCHEVSKIY VALMOSKVA127018    RUSRUS',
 'KOMMUNISTICHESKAYA STR 1MYTISCHY141011    RUSRUS',
 'MIRA STR 51MYTISCHY141008    RUSRUS',
 '42, SCHUKINSKAYAMOSKVA123182    RUSRUS',
 '3 KIEVSKAYA STRSANKT-PETERBU190000    RUSRUS',
 '-2, VETERANOVMYTISCHI141021    RUSRUS',
 '55 BOLSHOY SAMPSONIEVSKIY PRSANKT-PETERBU194044    RUSRUS',
 '20 BOLSHOY SAMPSONIEVSKIY PRSANKT-PETERBU190000    RUSRUS',
 '11 SCHASTLIVAYA STRUSADY422624    RUSRUS']

In [27]:
geolocator = Yandex()

In [97]:
location = geolocator.geocode('B. KAMENSCHIKI, 4MOSCOW')
print(location.latitude, location.longitude)

55.738176 37.653173


In [99]:
# уникальные значения координат
pos = train[train['pos_address'] == 'B. KAMENSCHIKI, 4MOSCOW123456    RUSRUS']
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,55.737567,37.553126,8
1,55.737567,37.653087,11


In [96]:
#'B. KAMENSCHIKI, 4MOSCOW123456    RUSRUS'
#location = geolocator.geocode("BLAGOVESCHENSKAYAVOLOGDA")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

In [39]:
train1['pos_address_lat'][train1['pos_address'] == 'B. KAMENSCHIKI, 4MOSCOW123456    RUSRUS'] = pos['pos_address_lat'][0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [101]:
train1['pos_address_lat'][train1['pos_address'] == 'B. KAMENSCHIKI, 4MOSCOW123456    RUSRUS']

10358     55.737567
10408     55.737567
17195     55.737567
17206     55.737567
110529    55.737567
129038    55.737567
258367    55.737567
323851    55.737567
470530    55.737567
470667    55.737567
543508    55.737567
543522    55.737567
543594    55.737567
543610    55.737567
543644    55.737567
745777    55.737567
759081    55.737567
854729    55.737567
854756    55.737567
Name: pos_address_lat, dtype: float64

In [107]:
train['pos_address_lat'][train['pos_address'] == 'B. KAMENSCHIKI, 4MOSCOW123456    RUSRUS']

10358     55.737567
10408     55.737567
17195     55.737567
17206     55.737567
110529    55.737567
129038    55.737567
258367    55.737567
323851    55.737567
470530    55.737567
470667    55.737567
543508    55.737567
543522    55.737567
543594    55.737567
543610    55.737567
543644    55.737567
745777    55.737567
759081    55.737567
854729    55.737567
854756    55.737567
Name: pos_address_lat, dtype: float64

In [103]:
train2 = train.copy()

In [105]:
train2['pos_address_lat'][train2['pos_address'] == 'B. KAMENSCHIKI, 4MOSCOW123456    RUSRUS'] = 999

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [106]:
train2['pos_address_lat'][train2['pos_address'] == 'B. KAMENSCHIKI, 4MOSCOW123456    RUSRUS']

10358     999.0
10408     999.0
17195     999.0
17206     999.0
110529    999.0
129038    999.0
258367    999.0
323851    999.0
470530    999.0
470667    999.0
543508    999.0
543522    999.0
543594    999.0
543610    999.0
543644    999.0
745777    999.0
759081    999.0
854729    999.0
854756    999.0
Name: pos_address_lat, dtype: float64

'BLAGOVESCHENSKAYAVOLOGDA160001    RUSRUS'

In [126]:
# уникальные значения координат
pos = train[train['pos_address'] == position[1]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,59.22431,39.865715,94
1,59.25589,39.8482,126


In [128]:
pos['pos_address_lat'][0], pos['pos_address_lon'][0]

(59.224310299999999, 39.865715399999999)

In [127]:
location = geolocator.geocode("BLAGOVESCHENSKAYAVOLOGDA")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
#folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

SUSCHEVSKIJ VAL 15MOSKVA127018    RUSRUS

In [110]:
# уникальные значения координат
pos = train[train['pos_address'] == position[2]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,55.778224,37.584323,93
1,55.795572,37.593703,32


In [115]:
pos['pos_address_lat'][1], pos['pos_address_lon'][1]

(55.795571500000001, 37.593703300000001)

In [112]:
location = geolocator.geocode("SUSCHEVSKIJ VAL 15MOSKVA")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

', KOMSOMOLSKAYA SQ.MOSCOW107140    RUSRUS"'

In [117]:
# уникальные значения координат
pos = train[train['pos_address'] == position[3]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,55.774048,37.654861,2
1,55.776802,37.657352,7


In [118]:
location = geolocator.geocode("KOMSOMOLSKAYA SQ.MOSCOW")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
#folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

'5-1A, SUSCHEVSKIY VALMOSKVA127018    RUSRUS'

In [120]:
# уникальные значения координат
pos = train[train['pos_address'] == position[4]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,55.699624,37.623173,1
1,55.792592,37.590429,1


In [121]:
location = geolocator.geocode("5-1A, SUSCHEVSKIY VALMOSKVA")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

In [122]:
train[train['pos_address'] == position[4]]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
71229,2.589563,0,0.0,0.0,MOSKVA,RUS,643.0,171,53.981,50.175,7622,"5-1A, SUSCHEVSKIY VALMOSKVA127018 RUSRUS",55.792592,37.590429,112035,2017-07-18,,
754041,2.668986,0,0.0,0.0,MOSKVA,RUS,643.0,5382,55.877,37.651,5732,"5-1A, SUSCHEVSKIY VALMOSKVA127018 RUSRUS",55.699624,37.623173,161986,2017-06-04,55.726,37.642


'KOMMUNISTICHESKAYA STR 1MYTISCHY141011    RUSRUS'

In [132]:
# уникальные значения координат
pos = train[train['pos_address'] == position[5]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,52.593779,103.865708,8
1,55.88763,37.73774,2


In [133]:
location = geolocator.geocode("'KOMMUNISTICHESKAYA STR 1MYTISCHY")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

MIRA STR 51MYTISCHY141008    RUSRUS

In [135]:
# уникальные значения координат
pos = train[train['pos_address'] == position[6]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,43.06342,44.65267,3
1,55.920456,37.708613,1


In [136]:
location = geolocator.geocode("MIRA STR 51MYTISCHY")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

'42, SCHUKINSKAYAMOSKVA123182    RUSRUS'

In [139]:
# уникальные значения координат
pos = train[train['pos_address'] == position[7]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,55.669151,37.43685,10
1,55.8093,37.465638,1


In [141]:
location = geolocator.geocode("42, SCHUKINSKAYA MOSKVA")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

8

In [147]:
pos['pos_address_lat'][0], pos['pos_address_lat'][1]

(59.901675750000003, 59.901675750000003)

In [148]:
pos['pos_address_lon'][0], pos['pos_address_lon'][1]

(30.320608365300004, 30.320608365399998)

In [149]:
pos = train[train['pos_address'] == position[8]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,59.901676,30.320608,1
1,59.901676,30.320608,10


In [143]:
location = geolocator.geocode("Санкт-Петербург киевская 3")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

9

In [150]:
pos = train[train['pos_address'] == position[9]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,44.610649,40.139082,12
1,55.976357,37.607856,10


In [152]:
location = geolocator.geocode("2, VETERANOV MYTISCHI")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

10

In [153]:
pos = train[train['pos_address'] == position[10]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,59.972041,30.340491,10
1,59.972215,30.340456,1


In [155]:
location = geolocator.geocode("55 BOLSHOY SAMPSONIEVSKIY PR SANKT-PETERBURG")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

11

In [156]:
pos = train[train['pos_address'] == position[11]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,59.960381,30.344936,1
1,59.993598,30.332357,6


In [157]:
location = geolocator.geocode("20 BOLSHOY SAMPSONIEVSKIY PRSANKT-PETERBURG")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m

12

In [158]:
pos = train[train['pos_address'] == position[12]]
pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})
pos

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,55.673302,49.22246,4
1,55.68362,49.212692,22


In [164]:
location = geolocator.geocode("Счастливая улица, 11, Усады")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
for i in range(pos.shape[0]):
    pos_lat = pos['pos_address_lat'][i]
    pos_lon = pos['pos_address_lon'][i]

    folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)

m