In [None]:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

In [None]:
# Training set; the first CSV column becomes the frame index.
train = pd.read_csv(
    'train.csv',
    index_col=0,
)

In [None]:
# Test set; the first CSV column becomes the frame index.
test = pd.read_csv(
    'test.csv',
    index_col=0,
)

In [None]:
# Tag every row with its origin: True for training rows, False for test rows.
for frame, is_train in ((train, True), (test, False)):
    frame['isTrain'] = is_train

In [None]:
# Stack train and test rows into one frame so features are built on both at once.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` produces the same
# result (original index labels kept, column order not sorted).
X = pd.concat([train, test], sort=False)

In [None]:
# Summary statistics of the numeric columns, one row per column.
X.describe().transpose()

In [None]:
# Summary statistics of the object (string) columns, one row per column.
X.describe(include='object').transpose()

### Количество банкоматов, стоящих по одному адресу

In [None]:
# Number of non-null `id` values for each address, reshaped into a
# two-column frame: `address_rus`, `count`.
ids_per_address = X.groupby('address_rus')['id'].count()
counts = ids_per_address.reset_index().rename(columns={'id': 'count'})

In [None]:
# Attach the per-address count to every row; rows whose address has no match
# in `counts` keep NaN in `count` (left join).
X = X.merge(counts, how='left', on='address_rus')

In [None]:
# Peek at the first five rows of the combined frame.
X.head(5)

### Расстояния до ближайших банкоматов

In [None]:
R = 6373.0  # Earth radius in kilometres used by the haversine formula


def distance(x, y):
    """
    Great-circle (haversine) distance between two geographic points.

    Parameters
    ----------
    x : pair (latitude, longitude) of the first point, in degrees
    y : pair (latitude, longitude) of the second point, in degrees

    Returns
    -------
    result : float, distance between the two points in kilometres
    """
    lat_a, long_a, lat_b, long_b = map(radians, [*x, *y])
    dlon = long_b - long_a
    dlat = lat_b - lat_a
    a = sin(dlat / 2) ** 2 + cos(lat_a) * cos(lat_b) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

In [None]:
# k-NN model that measures neighbour distance with the custom `distance` metric.
knc = KNeighborsClassifier(
    metric=distance,
)

In [None]:
# Coordinates of every row that has both latitude and longitude present.
coordinate_columns = ['lat', 'long']
dots = X[coordinate_columns].dropna()

In [None]:
# The classifier API requires labels; a constant label per point is supplied
# because only the neighbour search is used afterwards.
dummy_labels = np.ones(len(dots))
knc.fit(X=dots, y=dummy_labels)

In [None]:
# For every point: distances to, and positions of, its 6 nearest fitted points.
distances, indexes = knc.kneighbors(
    X=dots,
    n_neighbors=6,
)

In [None]:
# Store neighbours 1..5 as columns; neighbour 0 is skipped.
for rank in range(1, 6):
    dots['distance_{}'.format(rank)] = distances[:, rank]
    dots['indexes_{}'.format(rank)] = indexes[:, rank]

In [None]:
# Row-wise mean over every column whose name contains 'distance'.
is_distance_column = dots.columns.str.contains('distance')
dots['mean'] = dots.loc[:, is_distance_column].mean(axis=1)

In [None]:
# Attach the neighbour features to X, aligned on the row index.
# `dots` still carries its own `lat`/`long` columns; concatenating them as-is
# would leave X with duplicated `lat`/`long` column names, so that selecting
# 'lat' or 'long' later returns two columns instead of one. Drop them first.
X = pd.concat([X, dots.drop(columns=['lat', 'long'])], axis=1)

### Город размещения

In [None]:
# City = third comma-separated component of the address; rows without an
# address are left as NaN by index alignment.
known_addresses = X.address_rus[~X.address_rus.isnull()]
X['city'] = known_addresses.apply(lambda address: address.split(',')[2])

In [None]:
# Cities that occur in fewer than 20 rows.
city_sizes = X.city.value_counts()
rare_cities = city_sizes[city_sizes < 20].index

In [None]:
def _collapse_rare(name):
    """Map a city listed in `rare_cities` to 'RARE'; return any other value unchanged."""
    return 'RARE' if name in rare_cities else name


X['city'] = X['city'].apply(_collapse_rare)

In [None]:
# Encode the city as its rank; rows with no city get -1.
city_rank = X['city'].rank()
X['city'] = city_rank.fillna(-1)

In [None]:
# Model inputs and target, restricted to the training rows.
feature_columns = [
    'atm_group', 'lat', 'long', 'count',
    'distance_1', 'distance_2', 'distance_3', 'distance_4', 'distance_5',
    'indexes_5', 'mean', 'city',
]
train_rows = X[X.isTrain]
X_ = train_rows[feature_columns]
Y_ = train_rows['target']

In [None]:
# Hold out 25% of the training rows for validation (fixed seed).
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_, Y_,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Gradient-boosted regressor.
# - `colsample_bytree` was misspelled as `colsample_bytre`, so the column
#   subsampling setting was passed through as an unknown parameter.
# - `subsample` only takes effect when `subsample_freq` is non-zero, so row
#   subsampling is enabled on every iteration here.
gbm = lgb.LGBMRegressor(
    objective='regression',
    max_depth=3,
    colsample_bytree=0.8,
    subsample=0.8,
    subsample_freq=1,
    learning_rate=0.1,
    n_estimators=300,
)

In [None]:
# Fit with early stopping on the validation RMSE (patience of 5 rounds).
# The `early_stopping_rounds` keyword of `fit` was removed in LightGBM 4.0;
# the `lgb.early_stopping` callback is accepted by both old and new versions.
gbm.fit(
    X_train, Y_train,
    eval_set=[(X_valid, Y_valid)],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot the feature importances of the fitted LightGBM model.
lgb.plot_importance(gbm)

In [None]:
# RMSE of the model on the held-out validation rows.
valid_predictions = gbm.predict(X_valid)
rmse(Y_valid, valid_predictions)

In [None]:
# Baseline for comparison: RMSE of predicting zero for every validation row.
zero_predictions = np.zeros(len(Y_valid))
rmse(Y_valid, zero_predictions)

In [None]:
# Model inputs for the test rows (same columns as the training inputs).
test_feature_columns = [
    'atm_group', 'lat', 'long', 'count',
    'distance_1', 'distance_2', 'distance_3', 'distance_4', 'distance_5',
    'indexes_5', 'mean', 'city',
]
X_test = X.loc[~X.isTrain, test_feature_columns]

In [None]:
# Predictions for the test rows, indexed like the original test file.
test_predictions = gbm.predict(X_test)
submit = pd.DataFrame({'target': test_predictions}, index=test.index)

In [None]:
# Write the submission file (index column included).
submit.to_csv(path_or_buf='submit.csv')