In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
!pip install catboost geopandas shapely -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

## Импорт библиотек

In [55]:
import pandas as pd
import numpy as np

import geopandas as gpd
from shapely.geometry import Point

from catboost import CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import pickle
import json

from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [None]:
!unzip '/content/gdrive/MyDrive/data/train_dataset_train_data_Mediawise.zip'

Archive:  /content/gdrive/MyDrive/data/train_dataset_train_data_Mediawise.zip
   creating: train_data_Mediawise/
  inflating: train_data_Mediawise/readme.md  
  inflating: __MACOSX/train_data_Mediawise/._readme.md  
  inflating: train_data_Mediawise/baseline.ipynb  
  inflating: __MACOSX/train_data_Mediawise/._baseline.ipynb  
  inflating: train_data_Mediawise/train_data.json  


## Предобработка данных

In [56]:
df=pd.read_json('/content/train_data_Mediawise/train_data.json')
df=pd.concat([df,pd.json_normalize(df['targetAudience'])], axis=1)
df=df.drop(['targetAudience','id'], axis=1)

In [57]:
# Функция для перевода из азимута в стороны света
def get_direction(angle):
    directions = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW', 'N']
    direction_index = round(angle / 45) % 8
    return directions[direction_index]

In [58]:
# функция для корректного переименовывания групп
def rename_name(row):
    gender = ''
    age_from = row['ageFrom']
    age_to = row['ageTo']
    income = row['income'].upper()

    if row['gender'] == 'all':
        gender = 'All'
    else:
        gender = row['gender'][0].upper()

    if gender == 'All' and age_from == 18 and age_to == 100 and income == 'ABC':
        return f'All 18+'
    elif age_to == 100:
        return f'{gender} {age_from}+ {income}'
    else:
        return f'{gender} {age_from}-{age_to} {income}'

In [59]:
df['name'] = df.apply(rename_name, axis=1)

In [60]:
gdf = gpd.read_file('moscow.geojson')

In [None]:
gdf

Unnamed: 0,district,geometry
0,район Богородское,"POLYGON ((37.67021 55.83874, 37.67023 55.83865..."
1,район Вешняки,"POLYGON ((37.76730 55.73427, 37.76752 55.73415..."
2,район Восточное Измайлово,"POLYGON ((37.79943 55.79324, 37.79943 55.79322..."
3,район Восточный,"MULTIPOLYGON (((37.84533 55.81356, 37.84981 55..."
4,район Гольяново,"POLYGON ((37.74593 55.80903, 37.74604 55.80698..."
...,...,...
122,район Чертаново Центральное,"POLYGON ((37.56980 55.60807, 37.56988 55.60772..."
123,район Чертаново Южное,"POLYGON ((37.57294 55.58097, 37.57814 55.57973..."
124,район Старое Крюково,"POLYGON ((37.17327 55.98044, 37.17374 55.98014..."
125,Новомосковский административный округ,"POLYGON ((37.08720 55.59048, 37.08750 55.59055..."


In [61]:
# Функция для определения полигона
def check_point_in_polygon(lat, lon, polygons):
    point = Point(lon, lat)
    for poly_name, polygon in zip(polygons['district'], polygons['geometry']):
        if polygon.contains(point):
            return poly_name
    return None

In [62]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

for poly_name in gdf['district']:
    df[poly_name] = 0

for idx, row in df.iterrows():
    points = row['points']
    for point in points:
        lat, lon, azimut = point['lat'], point['lon'], point['azimuth']
        poly_name = check_point_in_polygon(lat, lon, gdf)
        if poly_name:
            df.at[idx, poly_name] += 1
            df['dir'] = get_direction(azimut)

In [None]:
df.sample(1)

Unnamed: 0,hash,points,value,name,gender,ageFrom,ageTo,income,район Богородское,район Вешняки,...,район Орехово-Борисово Северное,район Орехово-Борисово Южное,район Царицыно,район Чертаново Северное,район Чертаново Центральное,район Чертаново Южное,район Старое Крюково,Новомосковский административный округ,Троицкий административный округ,dir
943,0bd3d0dbf6a441d1,"[{'lat': '55.763692787422', 'lon': '37.6853880...",51.51,All 18-55 ABC,all,18,55,abc,1,2,...,0,2,1,2,2,0,1,0,0,NE


## Обучение и валидация модели

In [63]:
y = df.value
X = df.drop(['value', 'hash', 'points', 'name'], axis=1)
cat_features = ['gender','income', 'dir']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [129]:
model = CatBoostRegressor(iterations=3000,
                          depth=9,
                          learning_rate=0.04,
                          l2_leaf_reg=7,
                          colsample_bylevel=0.06,
                          loss_function='RMSE')

model.fit(X_train,
          y_train,
          eval_set=(X_test, y_test),
          cat_features=cat_features,
          verbose=100,
          early_stopping_rounds=100)

0:	learn: 23.7816690	test: 21.8924062	best: 21.8924062 (0)	total: 11.2ms	remaining: 33.6s
100:	learn: 12.4903563	test: 11.5963888	best: 11.5963888 (100)	total: 935ms	remaining: 26.8s
200:	learn: 10.0745623	test: 10.0834558	best: 10.0834558 (200)	total: 1.3s	remaining: 18.2s
300:	learn: 8.5398751	test: 9.3903405	best: 9.3903405 (300)	total: 1.54s	remaining: 13.8s
400:	learn: 7.5847736	test: 9.1066668	best: 9.1066668 (400)	total: 1.78s	remaining: 11.5s
500:	learn: 6.9475505	test: 8.9120215	best: 8.9120215 (500)	total: 2s	remaining: 9.99s
600:	learn: 6.5201936	test: 8.7970932	best: 8.7970932 (600)	total: 2.23s	remaining: 8.92s
700:	learn: 6.1938503	test: 8.7115170	best: 8.7106495 (695)	total: 2.48s	remaining: 8.14s
800:	learn: 5.9123001	test: 8.6351187	best: 8.6351187 (800)	total: 2.72s	remaining: 7.46s
900:	learn: 5.6694048	test: 8.5862707	best: 8.5861232 (899)	total: 2.96s	remaining: 6.88s
1000:	learn: 5.4777202	test: 8.5450788	best: 8.5442372 (997)	total: 3.19s	remaining: 6.38s
1100:	l

<catboost.core.CatBoostRegressor at 0x7a0301a71f90>

In [130]:
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
custom = max(1 - rmse/30, 0) ** 4.

print(f'RMSE: {rmse:.4f}')
print(f'R²: {r2:.4f}')
print(f'MAE: {mae:.4f}')
print(f'Custom: {custom:.4f}')

RMSE: 8.3795
R²: 0.8566
MAE: 5.4750
Custom: 0.2698


In [131]:
filename = 'coords_model.cb'
pickle.dump(model, open(filename, 'wb'))

## Предикты для тестового датасета

In [134]:
filepath = '/content/test_data.json'
geojson_path = '/content/moscow.geojson'
sample_submission_path = '/content/sample_submission.csv'

def get_preds(filepath, geojson_path, sample_submission_path):
    test = pd.read_json(filepath)
    gdf = gpd.read_file(geojson_path)
    test=pd.concat([test,pd.json_normalize(test['targetAudience'])], axis=1)
    test=test.drop(['targetAudience','id'], axis=1)

    for poly_name in gdf['district']:
            test[poly_name] = 0

    for idx, row in test.iterrows():
        points = row['points']
        for point in points:
            lat, lon, azimut = point['lat'], point['lon'], point['azimuth']
            poly_name = check_point_in_polygon(lat, lon, gdf)
            if poly_name:
                test.at[idx, poly_name] += 1
                test['dir'] = get_direction(azimut)

    test.drop(['points', 'name', 'hash'], axis=1, inplace=True)
    loaded_model = pickle.load(open('/content/coords_model.cb', 'rb'))
    test_pred = loaded_model.predict(test)
    for i in range(test_pred):
        if test_pred[i] < 0:
            test_pred[i] = 0
    submission = pd.read_csv(sample_submission_path)
    submission.drop(['value'], axis=1, inplace=True)
    submission['value'] = test_pred
    submission.to_csv('submission1.csv', sep =',', lineterminator='\n', index=False)

In [135]:
get_preds(filepath, geojson_path, sample_submission_path)