In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from feature_engine.outliers import Winsorizer

In [2]:
# Read the three CSV inputs into DataFrames. `sample` is loaded here but is not
# referenced by any of the visible later cells.
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

In [3]:
# Stack the train rows on top of the test rows so that the capping and feature
# engineering below are applied to both in one pass. The original row labels
# of each frame are kept (no ignore_index).
frames = [train, test]
df = pd.concat(frames, axis=0)

In [4]:
# Columns whose right (upper) tail is capped.
right_tail_vars = [
    'ghsl_water_surface',
    'ghsl_built_1975_to_1990',
    'ghsl_built_1990_to_2000',
    'ghsl_built_2000_to_2014',
    'ghsl_built_pre_1975',
    'ghsl_pop_density',
    'landcover_crops_fraction',
    'landcover_urban_fraction',
    'landcover_water_permanent_10km_fraction',
    'landcover_water_seasonal_10km_fraction',
    'nighttime_lights',
    'dist_to_capital',
    'dist_to_shoreline',
]

# IQR-based capping of the right tail with fold=1.5.
# NOTE(review): `df` is the concatenated train+test frame, so the caps are
# fitted on test rows as well as train rows.
win0 = Winsorizer(
    capping_method='iqr',
    tail='right',
    fold=1.5,
    variables=right_tail_vars,
)
df = win0.fit_transform(df)

In [5]:
# IQR-based capping (fold=1.5) of the left tail of `ghsl_not_built_up` only.
left_tail_vars = ['ghsl_not_built_up']
win1 = Winsorizer(
    capping_method='iqr',
    tail='left',
    fold=1.5,
    variables=left_tail_vars,
)
df = win1.fit_transform(df)

In [6]:
# Complement of the water-surface value: 1 minus `ghsl_water_surface`.
# NOTE(review): presumably the column is a fraction in [0, 1] - confirm.
water_surface = df['ghsl_water_surface']
df['ghsl_earth_area'] = 1 - water_surface

In [7]:
# Sum of the four built-up period columns (same addition order as before) ...
built_up_sum = (
    df['ghsl_built_1975_to_1990']
    + df['ghsl_built_1990_to_2000']
    + df['ghsl_built_2000_to_2014']
    + df['ghsl_built_pre_1975']
)
# ... subtracted from the non-water area. Despite the column name, the value is
# what remains of `ghsl_earth_area` after removing the built-up sum.
df['built_on_land'] = df['ghsl_earth_area'] - built_up_sum

In [8]:
# Count of non-null `ID` values within each row's `country` group, broadcast
# back to every row of that group (computed over the combined train+test frame).
ids_by_country = df.groupby('country')['ID']
df['id_by_country_cnt'] = ids_by_country.transform('count')

In [9]:
# Relative frequency of each country in `df`, as a {country: share} dict ...
country_share = df['country'].value_counts(normalize=True)
country_freq = country_share.to_dict()
# ... mapped back onto every row as a numeric feature.
df['country_freq'] = df['country'].map(country_freq)

In [10]:
# `ghsl_water_surface` doubled.
# NOTE(review): this is a constant rescaling of an existing column - confirm
# that it is intended as a separate feature.
df['ghsl_water_surface_per_10km'] = df['ghsl_water_surface'] * 2

# Permanent plus seasonal water fraction.
permanent_water = df['landcover_water_permanent_10km_fraction']
seasonal_water = df['landcover_water_seasonal_10km_fraction']
df['water'] = permanent_water + seasonal_water

In [11]:
# Sum of the four built-up period columns, added in the same order as before.
df['total_built_up'] = (
    df['ghsl_built_pre_1975']
    + df['ghsl_built_1975_to_1990']
    + df['ghsl_built_1990_to_2000']
    + df['ghsl_built_2000_to_2014']
)

In [12]:
# Sum of the crops, urban, permanent-water and seasonal-water landcover
# fractions, added in the same order as before.
df['land_dividation'] = (
    df['landcover_crops_fraction']
    + df['landcover_urban_fraction']
    + df['landcover_water_permanent_10km_fraction']
    + df['landcover_water_seasonal_10km_fraction']
)

In [13]:
# Distance to the capital plus distance to the shoreline.
capital_dist = df['dist_to_capital']
shoreline_dist = df['dist_to_shoreline']
df['Total_distance'] = capital_dist + shoreline_dist

In [14]:
# One-hot encode `urban_or_rural`, dropping the first level. The prefix already
# ends in '_' and get_dummies adds its own separator, hence the double
# underscore in the resulting column name (e.g. 'urban_or_rural__U').
urban = pd.get_dummies(
    df['urban_or_rural'],
    prefix='urban_or_rural_',
    drop_first=True,
)
# Swap the original categorical column for its dummy column(s). This cell is
# not re-runnable on its own: after one run `urban_or_rural` no longer exists.
df = pd.concat([df.drop(columns='urban_or_rural'), urban], axis=1)

In [15]:
# Half of each 10km water fraction, stored under a "5km" name.
# NOTE(review): this is a plain division by 2 of the 10km columns, not an
# independently measured 5km value.
df['landcover_water_permanent_5km_fraction'] = df['landcover_water_permanent_10km_fraction'].div(2)
df['landcover_water_seasonal_5km_fraction'] = df['landcover_water_seasonal_10km_fraction'].div(2)

In [16]:
# Scale factor applied to both distance columns.
# NOTE(review): 0.62137119 is the km -> mile factor, so the source columns are
# presumably in kilometres - confirm against the data description.
MILES_FACTOR = 0.62137119

for dist_col in ('dist_to_capital', 'dist_to_shoreline'):
    df[dist_col + '_in_miles'] = df[dist_col] * MILES_FACTOR

In [17]:
# Split the combined frame back into its train and test parts.
# The boundary is taken from the number of rows in `train` (loaded from
# Train.csv and placed first in the concat) instead of a hard-coded literal,
# so the split stays correct if the input file changes. Re-running this cell is
# safe: after the first run `train` is already a slice of that same length.
n_train_rows = len(train)

# Guard against `train`/`test` being out of sync with `df`, e.g. because cells
# were executed out of order.
assert n_train_rows + len(test) == len(df), (
    'train/test row counts do not add up to len(df)'
)

train = df[:n_train_rows]
test = df[n_train_rows:]

In [18]:
# Display the columns of `train` whose dtype is not object; the value of this
# expression is the cell's output.
[
    col
    for col in train.columns
    if train[col].dtypes != 'O'
]

['year',
 'ghsl_water_surface',
 'ghsl_built_pre_1975',
 'ghsl_built_1975_to_1990',
 'ghsl_built_1990_to_2000',
 'ghsl_built_2000_to_2014',
 'ghsl_not_built_up',
 'ghsl_pop_density',
 'landcover_crops_fraction',
 'landcover_urban_fraction',
 'landcover_water_permanent_10km_fraction',
 'landcover_water_seasonal_10km_fraction',
 'nighttime_lights',
 'dist_to_capital',
 'dist_to_shoreline',
 'Target',
 'ghsl_earth_area',
 'built_on_land',
 'id_by_country_cnt',
 'country_freq',
 'ghsl_water_surface_per_10km',
 'water',
 'total_built_up',
 'land_dividation',
 'Total_distance',
 'urban_or_rural__U',
 'landcover_water_permanent_5km_fraction',
 'landcover_water_seasonal_5km_fraction',
 'dist_to_capital_in_miles',
 'dist_to_shoreline_in_miles']

In [19]:
# Columns taken as-is from the input files.
raw_feature_cols = [
    'year',
    'ghsl_water_surface',
    'ghsl_built_pre_1975',
    'ghsl_built_1975_to_1990',
    'ghsl_built_1990_to_2000',
    'ghsl_built_2000_to_2014',
    'ghsl_not_built_up',
    'ghsl_pop_density',
    'landcover_crops_fraction',
    'landcover_urban_fraction',
    'landcover_water_permanent_10km_fraction',
    'landcover_water_seasonal_10km_fraction',
    'nighttime_lights',
    'dist_to_capital',
    'dist_to_shoreline',
]

# Columns created by the feature-engineering cells.
engineered_feature_cols = [
    'ghsl_earth_area',
    'built_on_land',
    'id_by_country_cnt',
    'country_freq',
    'ghsl_water_surface_per_10km',
    'water',
    'total_built_up',
    'land_dividation',
    'Total_distance',
    'urban_or_rural__U',
    'landcover_water_permanent_5km_fraction',
    'landcover_water_seasonal_5km_fraction',
    'dist_to_capital_in_miles',
    'dist_to_shoreline_in_miles',
]

# Model input columns, in the same order as before; 'Target' is deliberately
# absent because it is the label.
cont_cols = raw_feature_cols + engineered_feature_cols

In [20]:
# Feature matrices for the labelled and unlabelled rows, restricted to the
# model input columns, plus the training label.
X_train, X_test = train[cont_cols], test[cont_cols]
y_train = train['Target']

In [None]:
# K-fold cross-validation of a CatBoost regressor.
#
# For every fold a fresh model is trained on the remaining folds, scored with
# RMSE on the held-out fold, and used to predict X_test. `pred` ends up as the
# mean of the per-fold test predictions, so every fold's model contributes to
# the submission (previously `pred` was overwritten on each iteration and only
# the last fold's model was used).
n_folds = 10
subbed = []      # held-out RMSE of each fold
fold_preds = []  # X_test predictions of each fold's model
kf = KFold(n_splits=n_folds, shuffle=True, random_state=10)


for fold, (train_idx, test_idx) in enumerate(kf.split(X_train, y_train)):
    print('=============== Fold No:',fold+1,'===============')
    X_tr, X_tst = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_tr, y_tst = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model = CatBoostRegressor(n_estimators=2000,eval_metric= 'RMSE', random_state=10)
    # NOTE(review): the held-out fold doubles as the early-stopping eval set,
    # so the fold scores below are somewhat optimistic.
    model.fit(X_tr, y_tr,eval_set=[(X_tst, y_tst)], early_stopping_rounds=50, verbose=500)

    # Predict the held-out fold once and reuse it for both metrics.
    val_pred = model.predict(X_tst)

    # RMSE is computed as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    # Printed score uses predictions rounded the same way as the submission.
    print(np.sqrt(mean_squared_error(y_tst, np.round(val_pred, decimals=6))))
    subbed.append(np.sqrt(mean_squared_error(y_tst, val_pred)))

    fold_preds.append(model.predict(X_test))

# Average the fold models' test predictions into the final prediction vector.
pred = np.mean(fold_preds, axis=0)
print(np.mean(subbed))

Learning rate set to 0.052354
0:	learn: 0.1878009	test: 0.1870534	best: 0.1870534 (0)	total: 150ms	remaining: 4m 59s
500:	learn: 0.0794573	test: 0.0869724	best: 0.0869647 (499)	total: 6.6s	remaining: 19.8s
1000:	learn: 0.0729981	test: 0.0855013	best: 0.0855013 (1000)	total: 13.3s	remaining: 13.3s
1500:	learn: 0.0683178	test: 0.0849207	best: 0.0849177 (1480)	total: 19.6s	remaining: 6.51s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.08488554826
bestIteration = 1532

Shrink model to first 1533 iterations.
0.08488555739223161
Learning rate set to 0.052354
0:	learn: 0.1877660	test: 0.1866419	best: 0.1866419 (0)	total: 19ms	remaining: 37.9s
500:	learn: 0.0792812	test: 0.0861641	best: 0.0861525 (498)	total: 6.44s	remaining: 19.3s
1000:	learn: 0.0727999	test: 0.0853138	best: 0.0853138 (1000)	total: 13s	remaining: 12.9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.08527334835
bestIteration = 1052

Shrink model to first 1053 iterations.
0.08527336018

In [None]:
# Round the test predictions to six decimal places for the submission file.
SUBMISSION_DECIMALS = 6
predict = np.round(pred, SUBMISSION_DECIMALS)

In [None]:
# Build the submission frame: test IDs alongside the rounded predictions
# (assigned positionally), then write it out without the index column.
sub = pd.DataFrame({'ID': test['ID']}).assign(Target=predict)
sub.to_csv('cat_output.csv', index=False)