In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import folium
from sklearn.preprocessing import LabelEncoder
from geopy.geocoders import Yandex
from geopy.distance import vincenty
import time
from tqdm import tqdm

In [2]:
train = pd.read_csv('data/train_set.csv')
test = pd.read_csv('data/test_set.csv')

train.rename(columns={ 'pos_adress_lat' : 'pos_address_lat', 
                       'pos_adress_lon' : 'pos_address_lon'}, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Заполнение пропуска координат дома
customer = train['customer_id'][train['home_add_lon'].isnull() == True].unique()[0]
train['home_add_lon'][train['customer_id'] == customer] = train['home_add_lon'][train['customer_id'] == customer].fillna(method='bfill').fillna(method='ffill')
train['home_add_lat'][train['customer_id'] == customer] = train['home_add_lat'][train['customer_id'] == customer].fillna(method='bfill').fillna(method='ffill')

#Удалить клиентов, у которых нет работы
train.drop(train[train['work_add_lon'].isnull() == True].index,inplace=True)

# Удалить транзакции, где нет адреса координат atm, pos и terminal_id (в трейне это все строчки, где нет terminal_id)
train.drop(train[train['terminal_id'].isnull() == True].index, axis=0, inplace=True)

###########################################################################################
# Подсчет частот адресов
atm_address_count = train['atm_address'].value_counts()

# Удаление транзакций с atm_address = "RADNAYA\             RUS,445" (по нему координат нет)
# train.drop(train[train['atm_address'] == atm_address_count.index[45]].index, axis=0, inplace=True)

# Замена 'Подольск, ул. Полевановская, д. 9, (Заезд с ул. Орджоникидзе д. 25)' на 'Подольск, ул. Полевановская, д. 9'
train['atm_address'].replace(to_replace=atm_address_count.index[46], value='Подольск, ул. Полевановская, д. 9', inplace=True)

############################################################################################
# Адреса с '\' в названии (частоты)
backslash_atm_address_count = train['atm_address'][train['atm_address'].str.contains("\\\\") == True].value_counts()

# Замены до частоты 100 более-менее читаемых адресов
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[1], value='улица Маросейка, 3/13с1, Москва', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[9], value='улица Савушкина, 141, Санкт-Петербург', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[12], value='Новомытищинский проспект, 34, Москва', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[14], value='проспект Большевиков, 18, Санкт-Петербург', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[15], value='Марксистская улица, 1, Москва', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[16], value='Гражданский проспект, 41к2, Санкт-Петербург', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[18], value='улица Грекова, 8, Москва', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[23], value='Ракетный бульвар, 16, Москва', inplace=True)
train['atm_address'].replace(to_replace=backslash_atm_address_count.index[27], value='Новочеркасский проспект, 43/17, Санкт-Петербург', inplace=True)

# Остальные удалить
train.drop(train[train['atm_address'].str.contains("\\\\") == True].index, inplace=True)

############################################################################################
# Для удобства анализа сделаю labelencoding на customer_id и terminal_id
label_encoder = LabelEncoder()
train['customer_id'] = label_encoder.fit_transform(train['customer_id'])
train['terminal_id'] = label_encoder.fit_transform(train['terminal_id'])

#############################################################################################
# Обнуление NaN адресов atm и pos
train[['atm_address', 'atm_address_lat', 'atm_address_lon',
       'pos_address', 'pos_address_lat', 'pos_address_lon']] = train[['atm_address', 'atm_address_lat', 'atm_address_lon', 
                                                                      'pos_address', 'pos_address_lat', 'pos_address_lon']].fillna(0)
#######################################################################################################

# у 13 pos адресов координаты разные - исправляем
wrong_pos_address = []
with open('pos_address.txt', 'r') as file:
    for line in file:
        wrong_pos_address.append(line.strip())
        
geolocator = Yandex()

#0
location = geolocator.geocode('B. KAMENSCHIKI, 4MOSCOW')
#pos = train[train['pos_address'] == wrong_pos_address[0]]
#pos = pos.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})

train['pos_address_lat'][train['pos_address'] == wrong_pos_address[0]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[0]] = location.longitude

#1
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[1]] = 59.2243102
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[1]] = 39.8657153

#2 под вопросом, потому что обе точки имеют место быть
#train['pos_address_lat'][train['pos_address'] == wrong_pos_address[2]] = 55.7955715
#train['pos_address_lon'][train['pos_address'] == wrong_pos_address[2]] = 37.5937033

#3 
# тоже обе точки могут быть

#4
time.sleep(0.5)
location = geolocator.geocode("5-1A, SUSCHEVSKIY VALMOSKVA")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[4]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[4]] = location.longitude

#5
time.sleep(0.5)
location = geolocator.geocode("'KOMMUNISTICHESKAYA STR 1MYTISCHY")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[5]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[5]] = location.longitude

#6
time.sleep(0.5)
location = geolocator.geocode("MIRA STR 51MYTISCHY")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[6]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[6]] = location.longitude

#7
time.sleep(0.5)
location = geolocator.geocode("42, SCHUKINSKAYA MOSKVA")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[7]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[7]] = location.longitude

#8
#нормальная, отличие в 8 незначящем знаке

#9
time.sleep(0.5)
location = geolocator.geocode("2, VETERANOV MYTISCHI")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[9]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[9]] = location.longitude

#10
time.sleep(0.5)
location = geolocator.geocode("55 BOLSHOY SAMPSONIEVSKIY PR SANKT-PETERBURG")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[10]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[10]] = location.longitude

#11
time.sleep(0.5)
location = geolocator.geocode("20 BOLSHOY SAMPSONIEVSKIY PRSANKT-PETERBURG")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[11]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[11]] = location.longitude

#12
time.sleep(0.5)
location = geolocator.geocode("Счастливая улица, 11, Усады")
train['pos_address_lat'][train['pos_address'] == wrong_pos_address[12]] = location.latitude
train['pos_address_lon'][train['pos_address'] == wrong_pos_address[12]] = location.longitude

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pyd

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 625504 entries, 0 to 1224733
Data columns (total 18 columns):
amount              625504 non-null float64
atm_address         625504 non-null object
atm_address_lat     625504 non-null float64
atm_address_lon     625504 non-null float64
city                625501 non-null object
country             625504 non-null object
currency            625503 non-null float64
customer_id         625504 non-null int64
home_add_lat        625504 non-null float64
home_add_lon        625504 non-null float64
mcc                 625504 non-null int64
pos_address         625504 non-null object
pos_address_lat     625504 non-null float64
pos_address_lon     625504 non-null float64
terminal_id         625504 non-null int64
transaction_date    625503 non-null object
work_add_lat        625504 non-null float64
work_add_lon        625504 non-null float64
dtypes: float64(10), int64(3), object(5)
memory usage: 90.7+ MB


In [9]:
len(train['customer_id'].unique())

5158

In [4]:
train0 = train[train['customer_id'] == 0]

In [5]:
train0.shape

(99, 18)

In [17]:
print(len(train0['atm_address'].unique()))
train0['atm_address'].unique()

5


array([0, 'Новороссийск, ш. Сухумское, д. 17-А',
       'Новороссийск, ш. Анапское, д. 2',
       'Новороссийск, ул. Горького/Спортивная, д. 1, пос. Цемдолина',
       'Новороссийск, ул. Энгельса, д. 50'], dtype=object)

In [7]:
atm_lat0 = []
atm_lon0 = []
for address in train0['atm_address'].unique()[1:]:
    location = geolocator.geocode(address)
    location = geolocator.geocode(address)
    atm_lat0.append(location.latitude)
    atm_lon0.append(location.longitude)
    time.sleep(0.2)

In [8]:
atm_lat0, atm_lon0

([44.727645, 44.72541, 44.744235, 44.713593],
 [37.816064, 37.76292, 37.724148, 37.776781])

In [9]:
# записать координаты atm по адресу
i=0
for address in train0['atm_address'].unique()[1:]:
    train0['atm_address_lat'][train0['atm_address'] == address] = atm_lat0[i]
    train0['atm_address_lon'][train0['atm_address'] == address] = atm_lon0[i]
    i+=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [32]:
# до
train0[train0['atm_address'] == train0['atm_address'].unique()[1]]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
951882,2.76448,"Новороссийск, ш. Сухумское, д. 17-А",44.724,37.812,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-09-07,44.735,37.798
951903,4.467714,"Новороссийск, ш. Сухумское, д. 17-А",44.727,37.817,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-04-16,44.735,37.798
952027,4.328397,"Новороссийск, ш. Сухумское, д. 17-А",44.73,37.819,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-08-08,44.735,37.798
952029,4.399353,"Новороссийск, ш. Сухумское, д. 17-А",44.725,37.816,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-05-28,44.735,37.798
952030,4.441549,"Новороссийск, ш. Сухумское, д. 17-А",44.726,37.812,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-06-20,44.735,37.798
952031,4.449585,"Новороссийск, ш. Сухумское, д. 17-А",44.728,37.815,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-02-07,44.735,37.798
952032,4.096335,"Новороссийск, ш. Сухумское, д. 17-А",44.724,37.817,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-03-10,44.735,37.798
952034,4.332524,"Новороссийск, ш. Сухумское, д. 17-А",44.731,37.815,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-08-27,44.735,37.798
952035,3.603569,"Новороссийск, ш. Сухумское, д. 17-А",44.73,37.814,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-09-07,44.735,37.798
952039,3.304543,"Новороссийск, ш. Сухумское, д. 17-А",44.731,37.818,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-03-09,44.735,37.798


In [10]:
# после
train0[train0['atm_address'] == train0['atm_address'].unique()[1]]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
951882,2.76448,"Новороссийск, ш. Сухумское, д. 17-А",44.727645,37.816064,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-09-07,44.735,37.798
951903,4.467714,"Новороссийск, ш. Сухумское, д. 17-А",44.727645,37.816064,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-04-16,44.735,37.798
952027,4.328397,"Новороссийск, ш. Сухумское, д. 17-А",44.727645,37.816064,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-08-08,44.735,37.798
952029,4.399353,"Новороссийск, ш. Сухумское, д. 17-А",44.727645,37.816064,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-05-28,44.735,37.798
952030,4.441549,"Новороссийск, ш. Сухумское, д. 17-А",44.727645,37.816064,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-06-20,44.735,37.798
952031,4.449585,"Новороссийск, ш. Сухумское, д. 17-А",44.727645,37.816064,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-02-07,44.735,37.798
952032,4.096335,"Новороссийск, ш. Сухумское, д. 17-А",44.727645,37.816064,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-03-10,44.735,37.798
952034,4.332524,"Новороссийск, ш. Сухумское, д. 17-А",44.727645,37.816064,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-08-27,44.735,37.798
952035,3.603569,"Новороссийск, ш. Сухумское, д. 17-А",44.727645,37.816064,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-09-07,44.735,37.798
952039,3.304543,"Новороссийск, ш. Сухумское, д. 17-А",44.727645,37.816064,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,7962,2017-03-09,44.735,37.798


In [23]:
train0.groupby('terminal_id')

<pandas.core.groupby.DataFrameGroupBy object at 0x000001D7DC7BF550>

In [46]:
train0[(train0['atm_address'] ==0)]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
404048,2.771442,0,0.0,0.0,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51632,2017-05-17,44.735,37.798
404049,3.015341,0,0.0,0.0,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51632,2017-06-14,44.735,37.798
404050,2.361669,0,0.0,0.0,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48984,2017-08-18,44.735,37.798
404051,2.973381,0,0.0,0.0,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48984,2017-10-15,44.735,37.798
404052,3.182323,0,0.0,0.0,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5411,"TSEMDOLINA S., LENINA UL., D. 7 ZHNOVOROSSIYSK...",44.745536,37.726277,54342,2017-10-20,44.735,37.798
404053,3.143099,0,0.0,0.0,TSEMDOLINA,RUS,643.0,0,44.708,37.775,5211,22 A ZOLOTAYA RYBKA STRTSEMDOLINA353900 RUSRUS,44.769578,37.698697,85415,2017-09-09,44.735,37.798
404054,2.477034,0,0.0,0.0,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,31315,2017-10-19,44.735,37.798
404055,3.012767,0,0.0,0.0,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51632,2017-04-11,44.735,37.798
404056,2.760114,0,0.0,0.0,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48984,2017-07-20,44.735,37.798
404057,2.320913,0,0.0,0.0,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5411,SUHUMSKOE SHOSSE 16NOVOROSSIYSK353902 RUSRUS,44.711199,37.846624,110987,2017-06-11,44.735,37.798


In [11]:
location = geolocator.geocode("NOVOROSIYSK")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=3)
#folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
pos = train0
for i in range(pos.shape[0]):
    atm_lat = pos['atm_address_lat'].values[i]
    atm_lon = pos['atm_address_lon'].values[i]
    pos_lat = pos['pos_address_lat'].values[i]
    pos_lon = pos['pos_address_lon'].values[i]
    
    if (atm_lat or atm_lon) != 0: 
        folium.Marker([atm_lat, atm_lon], popup='atm' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)
    if (pos_lat or pos_lon) != 0:
        folium.Marker([pos_lat, pos_lon], popup='pos' + str(i), icon=folium.Icon(color='orange', icon='shopping-cart')).add_to(m)
    
h_lat = pos['home_add_lat'].values[0]
h_lon = pos['home_add_lon'].values[0]
w_lat = pos['work_add_lat'].values[0]
w_lon = pos['work_add_lon'].values[0]
folium.Marker([h_lat, h_lon], popup='<i>Home</i>', icon=folium.Icon(color='green', icon='home')).add_to(m)
folium.Marker([w_lat, w_lon], popup='<i>Work</i>', icon=folium.Icon(color='red', icon='briefcase')).add_to(m)
m

In [53]:
train0.iloc[27]

amount                                                2.40464
atm_address                                                 0
atm_address_lat                                             0
atm_address_lon                                             0
city                                                BORISOVKA
country                                                   RUS
currency                                                  643
customer_id                                                 0
home_add_lat                                           44.708
home_add_lon                                           37.775
mcc                                                      5411
pos_address         B-N CHAPAEVA STRBORISOVKA353960    RUSRUS
pos_address_lat                                       56.2513
pos_address_lon                                       43.4463
terminal_id                                             97913
transaction_date                                   2017-10-15
work_add

In [56]:
location = geolocator.geocode("NOVOROSIYSK")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=3)
pos = train0
pos_lat = pos['pos_address_lat'].values[27]
pos_lon = pos['pos_address_lon'].values[27]
folium.Marker([pos_lat, pos_lon], popup='atm' + str(27), icon=folium.Icon(color='blue', icon='usd')).add_to(m)
m

In [58]:
train0.groupby(['pos_address_lat', 'pos_address_lon']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,pos_address_lat,pos_address_lon,count
0,0.0,0.0,67
1,44.711199,37.846624,7
2,44.725683,37.762735,1
3,44.726727,37.755863,1
4,44.73772,37.793006,3
5,44.74386,37.724392,5
6,44.743984,37.725888,4
7,44.745536,37.726277,2
8,44.746243,37.718207,1
9,44.769578,37.698697,1


In [12]:
train0['terminal_id'].value_counts()

30783     49
7962      15
110994     7
51635      4
82615      4
127519     3
48987      3
31317      2
126489     2
97918      2
54345      1
120132     1
80065      1
94291      1
17642      1
85419      1
95902      1
34841      1
Name: terminal_id, dtype: int64

In [75]:
train0[train0['terminal_id'] == 30782]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
1113922,4.463446,"Новороссийск, ул. Горького/Спортивная, д. 1, п...",44.744235,37.724148,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,30782,2017-07-28,44.735,37.798
1113933,2.497193,"Новороссийск, ул. Горького/Спортивная, д. 1, п...",44.744235,37.724148,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,30782,2017-08-09,44.735,37.798
1113958,3.035609,"Новороссийск, ул. Горького/Спортивная, д. 1, п...",44.744235,37.724148,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,30782,2017-05-20,44.735,37.798
1113969,4.356204,"Новороссийск, ул. Горького/Спортивная, д. 1, п...",44.744235,37.724148,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,30782,2017-10-06,44.735,37.798
1113976,3.914914,"Новороссийск, ул. Горького/Спортивная, д. 1, п...",44.744235,37.724148,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,30782,2017-10-16,44.735,37.798
1113981,2.612532,"Новороссийск, ул. Горького/Спортивная, д. 1, п...",44.744235,37.724148,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,30782,2017-10-24,44.735,37.798
1113984,4.205333,"Новороссийск, ул. Горького/Спортивная, д. 1, п...",44.744235,37.724148,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,30782,2017-09-26,44.735,37.798
1113995,3.01542,"Новороссийск, ул. Горького/Спортивная, д. 1, п...",44.744235,37.724148,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,30782,2017-07-02,44.735,37.798
1113998,3.101558,"Новороссийск, ул. Горького/Спортивная, д. 1, п...",44.744235,37.724148,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,30782,2017-10-16,44.735,37.798
1114004,4.481342,"Новороссийск, ул. Горького/Спортивная, д. 1, п...",44.744235,37.724148,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,6011,0,0.0,0.0,30782,2017-04-07,44.735,37.798


In [80]:
train0['terminal_id'].value_counts().index.values

array([ 30782,   7962, 110987,  82611,  51632,  48984, 127512,  31315,
       126482,  97913,  54342,  94287,  80061,  17642,  85415,  95897,
        34839, 120125], dtype=int64)

In [81]:
train0[train0['terminal_id'].isin(train0['terminal_id'].value_counts().index.values)]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
404048,2.771442,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51632,2017-05-17,44.735,37.798
404049,3.015341,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51632,2017-06-14,44.735,37.798
404050,2.361669,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48984,2017-08-18,44.735,37.798
404051,2.973381,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48984,2017-10-15,44.735,37.798
404052,3.182323,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5411,"TSEMDOLINA S., LENINA UL., D. 7 ZHNOVOROSSIYSK...",44.745536,37.726277,54342,2017-10-20,44.735,37.798
404053,3.143099,0,0.000000,0.000000,TSEMDOLINA,RUS,643.0,0,44.708,37.775,5211,22 A ZOLOTAYA RYBKA STRTSEMDOLINA353900 RUSRUS,44.769578,37.698697,85415,2017-09-09,44.735,37.798
404054,2.477034,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,31315,2017-10-19,44.735,37.798
404055,3.012767,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51632,2017-04-11,44.735,37.798
404056,2.760114,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48984,2017-07-20,44.735,37.798
404057,2.320913,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5411,SUHUMSKOE SHOSSE 16NOVOROSSIYSK353902 RUSRUS,44.711199,37.846624,110987,2017-06-11,44.735,37.798


In [13]:
len(train0['terminal_id'].value_counts())

18

In [16]:
train0['terminal_id'].value_counts().index.values

array([ 30783,   7962, 110994,  51635,  82615, 127519,  48987,  31317,
       126489,  97918,  54345, 120132,  80065,  94291,  17642,  85419,
        95902,  34841], dtype=int64)

In [25]:
train0[train0['terminal_id'] == train0['terminal_id'].value_counts().index.values[0]]['atm_address_lat'].values[0]

44.744235000000003

In [27]:
lat0 = []
lon0 = []
for terminal in train0['terminal_id'].value_counts().index.values:
    atm_lat = train0[train0['terminal_id'] == terminal]['atm_address_lat'].values[0]
    atm_lon = train0[train0['terminal_id'] == terminal]['atm_address_lon'].values[0]
    pos_lat = train0[train0['terminal_id'] == terminal]['pos_address_lat'].values[0]
    pos_lon = train0[train0['terminal_id'] == terminal]['pos_address_lon'].values[0]
    if atm_lat !=0: 
        lat0.append(atm_lat)
        lon0.append(atm_lon)
    else:
        lat0.append(pos_lat)
        lon0.append(pos_lon)

In [43]:
print(range(18))

range(0, 18)


In [39]:
len(train0['terminal_id'].value_counts())

18

In [47]:
train0['terminal_id'].value_counts().values

array([49, 15,  7,  4,  4,  3,  3,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,
        1], dtype=int64)

In [58]:
d = {'terminal': train0['terminal_id'].value_counts().index.values, 
     'frequency':train0['terminal_id'].value_counts().values,
      'latitude':lat0, 'longitude':lon0}
freq = pd.DataFrame(data=d)
freq

Unnamed: 0,frequency,latitude,longitude,terminal
0,49,44.744235,37.724148,30783
1,15,44.727645,37.816064,7962
2,7,44.711199,37.846624,110994
3,4,44.743984,37.725888,51635
4,4,44.74386,37.724392,82615
5,3,44.73772,37.793006,127519
6,3,56.251347,43.446254,48987
7,2,56.251347,43.446254,31317
8,2,44.713593,37.776781,126489
9,2,56.251347,43.446254,97918


In [59]:
location = geolocator.geocode("NOVOROSIYSK")

m = folium.Map(location=[location.latitude, location.longitude], zoom_start=3)
#folium.Marker([location.latitude, location.longitude], popup='<i>True</i>', icon=folium.Icon(color='red', icon='font')).add_to(m)
pos = freq
for i in range(pos.shape[0]):
    lat = pos['latitude'].values[i]
    lon = pos['longitude'].values[i]
    
    folium.Marker([lat, lon], popup='atm' + str(i), icon=folium.Icon(color='blue', icon='usd')).add_to(m)
    
m

In [65]:
freq0.iloc[0][0]

49.0

In [60]:
freq0 = freq[['frequency', 'latitude', 'longitude']]
freq0

Unnamed: 0,frequency,latitude,longitude
0,49,44.744235,37.724148
1,15,44.727645,37.816064
2,7,44.711199,37.846624
3,4,44.743984,37.725888
4,4,44.74386,37.724392
5,3,44.73772,37.793006
6,3,56.251347,43.446254
7,2,56.251347,43.446254
8,2,44.713593,37.776781
9,2,56.251347,43.446254


In [61]:
freq0.shape

(18, 3)

In [63]:
freq0.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
frequency,49.0,15.0,7.0,4.0,4.0,3.0,3.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
latitude,44.744235,44.727645,44.711199,44.743984,44.74386,44.73772,56.251347,56.251347,44.713593,56.251347,44.745536,44.725683,44.726727,44.72541,44.74386,44.769578,44.745536,44.746243
longitude,37.724148,37.816064,37.846624,37.725888,37.724392,37.793006,43.446254,43.446254,37.776781,43.446254,37.726277,37.762735,37.755863,37.76292,37.724392,37.698697,37.726277,37.718207


In [67]:
freq_row = []
for i in range(freq0.shape[0]):
    for j in range(freq0.shape[1]):
        freq_row.append(freq0.iloc[i][j])

In [68]:
freq_row

[49.0,
 44.744235000000003,
 37.724148,
 15.0,
 44.727645000000003,
 37.816063999999997,
 7.0,
 44.711199399999998,
 37.846623800000003,
 4.0,
 44.743984099999999,
 37.725887999999998,
 4.0,
 44.743859800000003,
 37.724391670500005,
 3.0,
 44.737720099999997,
 37.793005999999998,
 3.0,
 56.2513468,
 43.446254145099999,
 2.0,
 56.2513468,
 43.446254145099999,
 2.0,
 44.713593000000003,
 37.776781,
 2.0,
 56.2513468,
 43.446254145099999,
 1.0,
 44.74553555,
 37.726277109499996,
 1.0,
 44.7256833,
 37.762734899999998,
 1.0,
 44.726726599999999,
 37.755863499999997,
 1.0,
 44.725409999999997,
 37.762920000000001,
 1.0,
 44.743859800000003,
 37.724391670500005,
 1.0,
 44.769578199999998,
 37.698697000000003,
 1.0,
 44.74553555,
 37.726277109499996,
 1.0,
 44.746242649999999,
 37.718207399999997]

In [69]:
h_lat = train0['home_add_lat'].values[0]
h_lon = train0['home_add_lon'].values[0]
w_lat = train0['work_add_lat'].values[0]
w_lon = train0['work_add_lon'].values[0]

In [75]:
h_lat, h_lon, w_lat, w_lon

(44.707999999999998,
 37.774999999999999,
 44.734999999999999,
 37.798000000000002)

In [76]:
from sklearn.tree import DecisionTreeRegressor

In [77]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 625504 entries, 0 to 1224733
Data columns (total 18 columns):
amount              625504 non-null float64
atm_address         625504 non-null object
atm_address_lat     625504 non-null float64
atm_address_lon     625504 non-null float64
city                625501 non-null object
country             625504 non-null object
currency            625503 non-null float64
customer_id         625504 non-null int64
home_add_lat        625504 non-null float64
home_add_lon        625504 non-null float64
mcc                 625504 non-null int64
pos_address         625504 non-null object
pos_address_lat     625504 non-null float64
pos_address_lon     625504 non-null float64
terminal_id         625504 non-null int64
transaction_date    625503 non-null object
work_add_lat        625504 non-null float64
work_add_lon        625504 non-null float64
dtypes: float64(10), int64(3), object(5)
memory usage: 90.7+ MB


In [86]:
uniq_term = train.groupby('customer_id')['terminal_id'].unique()

In [83]:
train.groupby('customer_id')['terminal_id'].unique()[0]

array([ 51635,  48987,  54345,  85419,  31317, 110994,  34841, 127519,
        82615,  17642,  95902,  97918,  80065, 120132,   7962,  94291,
        30783, 126489], dtype=int64)

In [96]:
l = []
for i in tqdm(range(len(uniq_term))):
    l.append(len(uniq_term[i]))


  0%|                                                                                         | 0/5158 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████| 5158/5158 [00:00<00:00, 75236.38it/s]

In [98]:
np.max(l)

313

In [78]:
train.groupby('customer_id')['terminal_id'].value_counts()

customer_id  terminal_id
0            30783          49
             7962           15
             110994          7
             51635           4
             82615           4
             48987           3
             127519          3
             31317           2
             97918           2
             126489          2
             17642           1
             34841           1
             54345           1
             80065           1
             85419           1
             94291           1
             95902           1
             120132          1
1            82380           3
             133592          3
             9347            2
             35630           2
             47210           2
             103243          2
             133033          2
             235             1
             15141           1
             15280           1
             34367           1
             42491           1
                            ..
5157         6

In [99]:
train0

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
404048,2.771442,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51635,2017-05-17,44.735,37.798
404049,3.015341,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51635,2017-06-14,44.735,37.798
404050,2.361669,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48987,2017-08-18,44.735,37.798
404051,2.973381,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48987,2017-10-15,44.735,37.798
404052,3.182323,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5411,"TSEMDOLINA S., LENINA UL., D. 7 ZHNOVOROSSIYSK...",44.745536,37.726277,54345,2017-10-20,44.735,37.798
404053,3.143099,0,0.000000,0.000000,TSEMDOLINA,RUS,643.0,0,44.708,37.775,5211,22 A ZOLOTAYA RYBKA STRTSEMDOLINA353900 RUSRUS,44.769578,37.698697,85419,2017-09-09,44.735,37.798
404054,2.477034,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,31317,2017-10-19,44.735,37.798
404055,3.012767,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51635,2017-04-11,44.735,37.798
404056,2.760114,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48987,2017-07-20,44.735,37.798
404057,2.320913,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5411,SUHUMSKOE SHOSSE 16NOVOROSSIYSK353902 RUSRUS,44.711199,37.846624,110994,2017-06-11,44.735,37.798


In [100]:
train0

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
404048,2.771442,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51635,2017-05-17,44.735,37.798
404049,3.015341,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51635,2017-06-14,44.735,37.798
404050,2.361669,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48987,2017-08-18,44.735,37.798
404051,2.973381,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48987,2017-10-15,44.735,37.798
404052,3.182323,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5411,"TSEMDOLINA S., LENINA UL., D. 7 ZHNOVOROSSIYSK...",44.745536,37.726277,54345,2017-10-20,44.735,37.798
404053,3.143099,0,0.000000,0.000000,TSEMDOLINA,RUS,643.0,0,44.708,37.775,5211,22 A ZOLOTAYA RYBKA STRTSEMDOLINA353900 RUSRUS,44.769578,37.698697,85419,2017-09-09,44.735,37.798
404054,2.477034,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,31317,2017-10-19,44.735,37.798
404055,3.012767,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5541,"POS. CEMDOLINA, ST. LENINA, 2NOVOROSSIYSK35396...",44.743984,37.725888,51635,2017-04-11,44.735,37.798
404056,2.760114,0,0.000000,0.000000,BORISOVKA,RUS,643.0,0,44.708,37.775,5411,B-N CHAPAEVA STRBORISOVKA353960 RUSRUS,56.251347,43.446254,48987,2017-07-20,44.735,37.798
404057,2.320913,0,0.000000,0.000000,NOVOROSSIYSK,RUS,643.0,0,44.708,37.775,5411,SUHUMSKOE SHOSSE 16NOVOROSSIYSK353902 RUSRUS,44.711199,37.846624,110994,2017-06-11,44.735,37.798
