In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [2]:
# --- Inputs ---------------------------------------------------------------
TEST_PATH = 'new_submit.csv'    # rows that get scored by the models
ATTR_PATH = 'data/attr.csv'     # attribute table merged onto those rows

# --- Serialized models ----------------------------------------------------
LGBM_PATH = 'lgbm_model_1.pkl'      # loaded with pickle
CATBOOST_PATH = 'cboost_model_1'    # loaded with CatBoostRegressor.load_model

# --- Outputs (gzip-compressed CSV, one file per prediction source) ---------
SAVE_TO_L = 'submit_lgbm.csv.gz'
SAVE_TO_C = 'submit_cat.csv.gz'
SAVE_TO_E = 'submit_ensemble.csv.gz'

# NOTE(review): not referenced by any cell visible in this notebook.
RANDOM_STATE = 42

In [3]:
# Global plotting style and tqdm's pandas integration.
sns.set()
tqdm.pandas()

# Load the pickled model (presumably an LGBMRegressor, judging by the path name).
# The context manager closes the file handle as soon as unpickling finishes
# instead of leaving it open until garbage collection.
# NOTE(security): pickle.load can execute arbitrary code during unpickling;
# only load model files that come from a trusted source.
with open(LGBM_PATH, 'rb') as model_file:
    model_L = pickle.load(model_file)

# CatBoost is restored by loading the saved model into a fresh estimator.
model_C = CatBoostRegressor()
model_C.load_model(CATBOOST_PATH)

<catboost.core.CatBoostRegressor at 0x105bee250>

In [4]:
def make_predicts(model, joined_v, df, save_to):
    """Predict ``x1`` with ``model`` and write a gzip-compressed submission CSV.

    Parameters
    ----------
    model
        Any object exposing ``predict(features)``.
    joined_v : pandas.DataFrame
        Feature frame; its ``x1`` column is dropped before it is passed to
        ``model.predict``. Must have one row per row of ``df``, in the same order.
    df : pandas.DataFrame
        Frame providing the ``ego_id``, ``u`` and ``v`` columns of the
        submission. It is NOT modified by this function.
    save_to
        Destination passed to ``DataFrame.to_csv`` (written with gzip compression).

    Returns
    -------
    The raw predictions exactly as returned by ``model.predict``.

    Raises
    ------
    ValueError
        If the number of predictions does not match the number of rows in ``df``.
    """
    predicts = model.predict(joined_v.drop(columns='x1'))

    # Build the submission on a fresh frame. Writing ``x1`` into the caller's
    # ``df`` was a hidden side effect: every call silently overwrote the
    # predictions left there by the previous model.
    submission = df[['ego_id', 'u', 'v']].assign(x1=predicts)
    submission.to_csv(save_to, index=False, compression='gzip')
    return predicts

In [5]:
# Attribute table; merged onto the rows below by the following cells.
attrs = pd.read_csv(ATTR_PATH)
# Rows to score; also supplies the id columns of every submission file.
df = pd.read_csv(TEST_PATH)

In [6]:
# Attach the attributes of node `u` to every row. The key columns have the same
# names on both sides, so `on=` is equivalent to identical left_on/right_on.
# validate='many_to_one' makes pandas raise MergeError right here if `attrs`
# contains a repeated (ego_id, u) key. Without it the left join would silently
# duplicate rows, and the failure would only surface later as a length mismatch
# when predictions are aligned with `df`.
joined_u = df.merge(attrs, on=['ego_id', 'u'], how='left', validate='many_to_one')
joined_u

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age,city_id,sex,school,university
0,8,0,93,359.6,,0.000000,0.0,36.0,979281502.0,2.0,213987831.0,845825535.0
1,8,0,143,6.1,,0.000000,0.0,36.0,979281502.0,2.0,213987831.0,845825535.0
2,8,0,151,0.2,,1.386294,0.0,36.0,979281502.0,2.0,213987831.0,845825535.0
3,8,1,24,594.5,,0.000000,0.0,120.0,56833659.0,1.0,370230497.0,779615128.0
4,8,5,4,461.5,,0.000000,0.0,37.0,979281502.0,1.0,814552332.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
810971,1709396984676,73,23,20.3,,0.000000,0.0,122.0,-1.0,1.0,405077490.0,-1.0
810972,1709396984676,74,68,46.7,,0.000000,0.0,20.0,-1.0,2.0,-1.0,-1.0
810973,1709396984676,77,28,43.4,,0.000000,0.0,16.0,926522633.0,1.0,43512792.0,112540362.0
810974,1709396984676,79,38,50.2,,0.000000,0.0,16.0,269576388.0,2.0,566558003.0,-1.0


In [7]:
# Attach the attributes of node `v`: the left key is (ego_id, v), looked up
# against (ego_id, u) in `attrs`. Overlapping column names get `_u` (left) and
# `_v` (right) suffixes, so the key column `u` shows up as `u_u` and `u_v`.
# validate='many_to_one' fails fast with MergeError if `attrs` has a repeated
# (ego_id, u) key instead of silently duplicating rows.
joined_v = pd.merge(
    joined_u,
    attrs,
    left_on=['ego_id', 'v'],
    right_on=['ego_id', 'u'],
    how='left',
    suffixes=('_u', '_v'),
    validate='many_to_one',
)
joined_v

Unnamed: 0,ego_id,u_u,v,t,x1,x2,x3,age_u,city_id_u,sex_u,school_u,university_u,u_v,age_v,city_id_v,sex_v,school_v,university_v
0,8,0,93,359.6,,0.000000,0.0,36.0,979281502.0,2.0,213987831.0,845825535.0,93.0,36.0,979281502.0,2.0,734952557.0,566091832.0
1,8,0,143,6.1,,0.000000,0.0,36.0,979281502.0,2.0,213987831.0,845825535.0,143.0,43.0,979281502.0,2.0,-1.0,-1.0
2,8,0,151,0.2,,1.386294,0.0,36.0,979281502.0,2.0,213987831.0,845825535.0,151.0,18.0,979281502.0,2.0,-1.0,-1.0
3,8,1,24,594.5,,0.000000,0.0,120.0,56833659.0,1.0,370230497.0,779615128.0,24.0,36.0,104874069.0,2.0,213987831.0,562436811.0
4,8,5,4,461.5,,0.000000,0.0,37.0,979281502.0,1.0,814552332.0,-1.0,4.0,37.0,-1.0,1.0,213987831.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
810971,1709396984676,73,23,20.3,,0.000000,0.0,122.0,-1.0,1.0,405077490.0,-1.0,23.0,18.0,-1.0,1.0,-1.0,-1.0
810972,1709396984676,74,68,46.7,,0.000000,0.0,20.0,-1.0,2.0,-1.0,-1.0,68.0,17.0,338248086.0,2.0,485661706.0,-1.0
810973,1709396984676,77,28,43.4,,0.000000,0.0,16.0,926522633.0,1.0,43512792.0,112540362.0,28.0,14.0,338248086.0,1.0,253082810.0,-1.0
810974,1709396984676,79,38,50.2,,0.000000,0.0,16.0,269576388.0,2.0,566558003.0,-1.0,38.0,112.0,-1.0,1.0,253082810.0,-1.0


In [8]:
# Final column layout: the suffixed key `u_u` goes back to `u`, and every
# column not listed here (e.g. `u_v`) is dropped.
kept_columns = [
    'ego_id', 'u', 'v', 't', 'x1', 'x2', 'x3',
    'age_u', 'city_id_u', 'sex_u', 'school_u', 'university_u',
    'age_v', 'city_id_v', 'sex_v', 'school_v', 'university_v',
]

# Rename first, then select. `rename` ignores labels that are absent, so the
# cell can be re-run safely. The previous select-then-overwrite-`.columns`
# form raised KeyError on a second run because `u_u` no longer existed.
joined_v = joined_v.rename(columns={'u_u': 'u'})[kept_columns]

In [9]:
# Score the same feature frame with each model; every call also writes that
# model's own submission file.
# NOTE(review): the second result name appears to end in a non-ASCII look-alike
# of "C"; it is kept exactly as-is because the ensemble cell refers to it.
predicts_L = make_predicts(model=model_L, joined_v=joined_v, df=df, save_to=SAVE_TO_L)
predicts_С = make_predicts(model=model_C, joined_v=joined_v, df=df, save_to=SAVE_TO_C)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [10]:
# Equal-weight blend: element-wise mean of the two models' predictions.
predicts_E = (np.asarray(predicts_L) + np.asarray(predicts_С)) / 2
pred_series_ENSEMBLE = pd.Series(predicts_E, index=df.index)

# Attach the blended prediction, keep only the submission columns, and write
# the result as a gzip-compressed CSV.
df = df.assign(x1=pred_series_ENSEMBLE)[['ego_id', 'u', 'v', 'x1']]
df.to_csv(SAVE_TO_E, compression='gzip', index=False)