In [1]:
import pandas as pd
import numpy as np
import xgboost
import utils
import pickle

In [32]:
# Module versions in effect when the model was trained and saved:
#   pandas  0.23.4
#   numpy   1.14.0
#   xgboost 0.7.post4
#   pickle  4.0
# Print the versions of the currently loaded modules, one per line, in the
# same order as the list above so they can be compared at a glance.
for module_version in (pd.__version__, np.__version__,
                       xgboost.__version__, pickle.format_version):
    print(module_version)

0.23.4
1.14.0
0.7.post4
4.0


In [2]:
def open_data(train_paths=('train_part_1_v2.csv.gz', 'train_part_2_v2.csv.gz'),
              test_path='test_public_v2.csv.gz'):
    """Read the gzip-compressed CSV inputs and return ``(train, test)``.

    Every file in ``train_paths`` is read and the resulting frames are
    concatenated row-wise, in order, with a fresh 0..n-1 index. ``test_path``
    is read into a separate frame. Malformed rows are skipped (with a
    warning) instead of aborting the read.

    Parameters
    ----------
    train_paths : sequence of str
        Paths of the training parts. Defaults to the two files the notebook
        has always used.
    test_path : str
        Path of the test file. Defaults to the notebook's public test file.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The concatenated training frame and the test frame.
    """
    # Imported locally so the function does not depend on which notebook
    # cells have already been executed.
    import inspect

    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in
    # favour of `on_bad_lines`. Pick whichever keyword the installed pandas
    # accepts so the notebook runs on both the pinned 0.23.4 and newer
    # releases. 'warn' mirrors the old error_bad_lines=False behaviour with
    # its default warn_bad_lines=True: skip the row and report it.
    if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
        bad_line_options = {'on_bad_lines': 'warn'}
    else:
        bad_line_options = {'error_bad_lines': False}

    def read_part(path):
        # One place for the read settings shared by all three input files.
        return pd.read_csv(path, compression='gzip', **bad_line_options)

    train_parts = [read_part(path) for path in train_paths]
    train = pd.concat(train_parts, axis=0).reset_index(drop=True)
    test = read_part(test_path)
    return train, test

# Load the training and test frames once; later cells add derived feature
# columns to both frames in place.
train, test = open_data()


In [3]:
# Raw input columns that are fed to the model unchanged.
BASE_FEATURES_SET = [
    'PT', 'P', 'NShared',
    'ncl[0]', 'ncl[1]', 'ncl[2]', 'ncl[3]',
    'MatchedHit_DX[1]', 'MatchedHit_DX[2]', 'MatchedHit_DX[3]',
    'Mextra_DX2[3]',
    'MatchedHit_Y[0]',
]

# Further raw coordinate columns; the MatchedHit_* / Lextra_* pairs are the
# inputs of the generated dist* features.
ADDITIONAL_FEATURES_SET = [
    'MatchedHit_X[0]', 'MatchedHit_X[1]', 'MatchedHit_X[2]', 'MatchedHit_X[3]',
    'MatchedHit_Y[1]', 'MatchedHit_Y[2]', 'MatchedHit_Y[3]',
    'Lextra_X[0]', 'Lextra_X[1]', 'Lextra_X[2]', 'Lextra_X[3]',
    'Lextra_Y[0]', 'Lextra_Y[1]', 'Lextra_Y[2]', 'Lextra_Y[3]',
]

# Names of the columns derived in the feature-engineering cells.
GEN_FEATURES_SET = [
    'distx0', 'distx1', 'distx2', 'distx3',
    'disty0', 'disty1', 'disty2', 'disty3',
    'Numcrossed',
]

# Columns the classifier is trained on.
FULL_FEATURES_SET = [*BASE_FEATURES_SET, *GEN_FEATURES_SET]
# Raw columns only (base + additional).
# NOTE(review): not referenced in the visible cells; presumably the set of
# columns to read at inference time -- confirm.
READ_FEATURES_SET = [*BASE_FEATURES_SET, *ADDITIONAL_FEATURES_SET]

In [4]:
# 'Numcrossed': half of the row-wise sum of the four MatchedHit_TYPE columns.
hit_type_columns = ['MatchedHit_TYPE[{}]'.format(idx) for idx in range(4)]
train['Numcrossed'] = train[hit_type_columns].sum(axis=1) / 2

# 'dist<x|y><idx>': squared difference between the MatchedHit_* and Lextra_*
# coordinate for each index 0..3. All X columns are created first, then all
# Y columns, so the column order matches GEN_FEATURES_SET.
for coord in ('X', 'Y'):
    for idx in range(4):
        matched = train['MatchedHit_{}[{}]'.format(coord, idx)]
        lextra = train['Lextra_{}[{}]'.format(coord, idx)]
        train['dist{}{}'.format(coord.lower(), idx)] = (matched - lextra) ** 2

In [13]:
# Same derived columns as for the training frame, computed on the test frame.
# 'Numcrossed': half of the row-wise sum of the four MatchedHit_TYPE columns.
hit_type_columns = ['MatchedHit_TYPE[{}]'.format(idx) for idx in range(4)]
test['Numcrossed'] = test[hit_type_columns].sum(axis=1) / 2

# 'dist<x|y><idx>': squared difference between the MatchedHit_* and Lextra_*
# coordinate for each index 0..3 (X columns first, then Y columns).
for coord in ('X', 'Y'):
    for idx in range(4):
        matched = test['MatchedHit_{}[{}]'.format(coord, idx)]
        lextra = test['Lextra_{}[{}]'.format(coord, idx)]
        test['dist{}{}'.format(coord.lower(), idx)] = (matched - lextra) ** 2

In [5]:
# Gradient-boosted tree classifier: 100 trees of depth <= 7, single-threaded.
model_track_2 = xgboost.XGBClassifier(n_estimators=100, max_depth=7, n_jobs=1)
# Train on the plain feature matrix with per-row sample weights. The result is
# assigned back (fit returns the estimator) so the cell produces no output.
model_track_2 = model_track_2.fit(
    train[FULL_FEATURES_SET].values,
    train['label'],
    sample_weight=train['weight'],
)


In [6]:
# Display the fitted classifier's repr, which lists the hyperparameters in effect.
model_track_2

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [None]:
# Persist the trained classifier to disk with pickle.
# NOTE(review): a pickled model is presumably only loadable with compatible
# library versions (the notebook records them in its version cell) -- confirm
# before loading it in another environment.
filename = 'model_Num.pickle'
# Use a context manager so the file handle is flushed and closed as soon as
# the dump finishes, rather than being left open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(model_track_2, model_file)