* [Preparation Functions](#Preparation-Functions)
* [Prepare data](#Prepare-data)
* [data Normalization](#data-Normalization)
* [Preprocessing](#Preprocessing)
* [RandomForest Test](#RandomForest-Test)
* [DNN model](#DNN-model)
* [Cross Validation](#Cross-Validation)

In [1]:
import pandas as pd
import numpy as np
import scipy as scp
from scipy import stats
import sklearn 
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler, Normalizer, RobustScaler
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers.noise import GaussianDropout
from keras.layers.normalization import BatchNormalization
from keras.callbacks import LearningRateScheduler
from keras.optimizers import SGD,RMSprop,Adam,Adadelta
from keras.utils import np_utils
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings(action='ignore')

Using TensorFlow backend.


## Preparation Functions

In [2]:
def label_encode(X):
    for c in X.columns:
        if X[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(X[c].values)) 
            X[c] = lbl.transform(list(X[c].values))
    return X

def impute(X):
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    cols = X.columns
    X_imp = pd.DataFrame(imp.fit_transform(X))
    X_imp.columns = cols
    return X_imp

def Normalize(X):
    cols = X.columns
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    X = pd.DataFrame(scaler.transform(X))
    X.columns = cols
    return X, scaler

## Prepare data

In [3]:
df_train = pd.read_csv('./data/Sber Bank/train.csv')
train_cols = df_train.drop(['id', 'price_doc'], axis=1).columns

df_test = pd.read_csv('./data/Sber Bank/test.csv')

df_macro = pd.read_csv('./data/Sber Bank/macro.csv')
macro_cols = df_macro.columns

df_train_all = pd.merge(df_train, df_macro, on='timestamp')
df_test_all = pd.merge(df_test, df_macro, on='timestamp')

# keep 'timestamp' column in separate and drop it from dataset
df_train_all.timestamp = pd.to_datetime(df_train.timestamp)
df_train_timestamp = df_train.timestamp
df_train_all = df_train_all.drop(['id', 'timestamp'], axis=1)

df_test_all.timestamp = pd.to_datetime(df_test.timestamp)
df_trest_timestamp = df_test.timestamp
df_test_all = df_test_all.drop(['id', 'timestamp'], axis=1)

# remove columns filled with NaN completely
#bad_cols_test = df_test_all.columns[df_test_all.isnull().sum()==len(df_test_all)]
#df_train_all = df_train_all.drop(bad_cols_test, axis=1)
#df_test_all = df_test_all.drop(bad_cols_test, axis=1)

In [4]:
df_train_all = label_encode(df_train_all)
df_train_all = impute(df_train_all)
#df_train_all, scaler = Normalize(df_train_all)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_train_all.drop(['price_doc'], axis=1), 
                                                    df_train_all['price_doc'], 
                                                    test_size=0.3, 
                                                    random_state=42)

print('Length of X_train data: ', X_train.shape) 
print('Length of y_train data: ', y_train.shape)
print('Length of X_test data:  ', X_test.shape)
print('Length of y_test data:  ', y_test.shape) 

Length of X_train data:  (21329, 388)
Length of y_train data:  (21329,)
Length of X_test data:   (9142, 388)
Length of y_test data:   (9142,)


## data Normalization

In [7]:
cols = X_train.columns
X_train, scaler = Normalize(X_train)

In [62]:
#train = X_train.join(y_train)
#test = X_test.join(y_test)
cols = test.columns
train, scaler = Normalize(train)
test = pd.DataFrame(scaler.transform(test))
test.columns = cols

y_train = train['price_doc']
X_train = train.drop('price_doc', axis=1)
y_test = test['price_doc']
X_test = test.drop('price_doc', axis=1)

print('Length of X_train data: ', X_train.shape) 
print('Length of y_train data: ', y_train.shape)
print('Length of X_test data:  ', X_test.shape)
print('Length of y_test data:  ', y_test.shape) 

Length of X_train data:  (21329, 388)
Length of y_train data:  (21329,)
Length of X_test data:   (9142, 388)
Length of y_test data:   (9142,)


## Functions definition

In [8]:
# Root Mean Squared Log error
def RMSLE(y, y_):
    return np.sqrt(np.mean(np.log((y+1)/(y_+1))**2))

def RMSE(y,y_):
    return np.sqrt(np.mean((y-y_)**2))

from sklearn.model_selection import KFold

def cv_score(estimator, X, y, cv=5):
    kf = KFold(cv)
    for train_index, test_index in kf.split(X):
        model = estimator.fit(X[train_index], y[train_index])
        #y_ = model
        
def model_eval(estimator, X_train, y_train, X_test, y_test):
    #train_cv_score = cross_val_score(xgr, X_train, y_train, cv=5)
    y_train_pred = cross_val_predict(estimator, X_train, y_train, cv=5)
    train_r2_score = r2_score(y_train, y_train_pred)
    train_mse = np.sqrt(np.mean((y_train-y_train_pred)**2))
    train_rmsle = RMSLE(y_train, y_train_pred)
    
    model = estimator.fit(X_train, y_train)
    y_test_pred = estimator.predict(X_test)
    test_r2_score = r2_score(y_test, y_test_pred)
    test_mse = np.sqrt(np.mean((y_test-y_test_pred)**2))
    test_rmsle = RMSLE(y_test, y_test_pred)

    #print 'train CV R2_Scores:     {}'.format(train_cv_score)
    print('train RMSLE:            {}', train_rmsle)
    print('train CV R2_Score:      {}', train_r2_score)
    print('train mse:              {}', train_mse)
    print('test rmsle:             {}', test_rmsle)
    print('test R2_Score:          {}', test_r2_score)
    print('test mse:               {}', test_mse)
    
    return y_train_pred, y_test_pred, model

## RandomForest Test

In [96]:
rf = RandomForestRegressor()
_ = model_eval(rf, X_train, y_train, X_test, y_test)

train RMSLE:            {} 0.480225508308
train CV R2_Score:      {} 0.631027301183
train mse:              {} 2940369.97915
test rmsle:             {} 0.477649639034
test R2_Score:          {} 0.663161074746
test mse:               {} 2690123.42147


## DNN model

In [119]:
y_train = np.log1p(y_train)

In [9]:
mean_ = np.mean(y_train)
std_ = y_train.std()
y_ = y_train.values-mean_
y_ = y_/std_
y_

array([-0.49664784,  0.13548171,  0.21327295, ..., -1.26925155,
       -0.11406502, -0.73214766])

In [34]:
y_train_norm = y_train_norm.reshape(len(y_))

In [35]:
1 - (np.sum((y_train_norm-y_)**2))/(np.sum(y_**2))

-0.36312768414760255

In [114]:
def step_decay(epoch):
    lr=0.1
    start=127
    step=40
    if epoch<start:
        return lr
    else:
        lr=lr/np.power(2.0,(1+(epoch-start)/step))
        return lr

def nn_model(X, y):
    
    #lrate = LearningRateScheduler(step_decay)
    
    
    model = Sequential()

    # layer 1
    #model_dnn.add(GaussianDropout(0.1))
    model.add(Dense(2048, input_dim=X.shape[1], kernel_initializer='random_uniform', bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    
    # layer 2
    model.add(Dropout(0.2))
    model.add(Dense(1024, input_dim=2048))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    # layer 3
    model.add(Dropout(0.2))
    model.add(Dense(512, input_dim=1024))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    # layer 4
    model.add(Dropout(0.3))
    model.add(Dense(1, input_dim=256))
    
    
    optimizer=SGD(lr=0, momentum=0.5,nesterov=True,clipnorm=100)
    model.compile(loss='mae', optimizer='adam')
    
    #model.fit(X, y, callbacks=[lrate], batch_size=128, epochs=64, verbose=1)
    
    return model


In [115]:
dnn = nn_model(X_train.values, y_)
lrate = LearningRateScheduler(step_decay)
dnn.fit(X_train.values, y_, callbacks=[lrate], batch_size=128, epochs=16, verbose=1)
y_train_norm = dnn.predict(X_train.values)
print('train R2', r2_score(y_, y_train_norm))
print('train rmse', RMSE(y_, y_train_norm))

#y_test_norm = dnn.predict(X_test.values)

#print ('test R2: ', r2_score(y_test, y_test_norm))

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
train R2 -0.375757765988
train rmse 1.51526226188


## Cross Validation

In [277]:
from sklearn.model_selection import KFold

def cv_score(X, y, cv=5):
    y_ = np.zeros([len(y),1])
    for train_index, test_index in kf.split(X):
        dnn = nn_model()
        dnn.fit(X, y, callbacks=[lrate], batch_size=128, epochs=64, verbose=1)
        y_[test_index] = dnn.predict(X[test_index])
    r2 = r2_score(y, y_)
    return r2

In [339]:
X = X_train_norm.values
Y = y_train.copy()
kf = KFold(5)
y_ = np.zeros([len(y),1])
for train_index, test_index in kf.split(X):
    x = X[train_index]
    y = Y[train_index]
    dnn = nn_model()
    dnn.fit(x, y, callbacks=[lrate], batch_size=128, epochs=32, verbose=1)
    y_[test_index] = dnn.predict(X[test_index])
    

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32

KeyboardInterrupt: 