In [7]:
import os
import glob
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from tqdm import tqdm_notebook as tqdm
from bayes_opt import BayesianOptimization
from skopt import BayesSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import xgboost as xgb
import utils
from constants import DATA_DIR
warnings.filterwarnings('ignore')

Lasso Regression

In [8]:
# Load abt.csv, separate the 'VALUE' target from the remaining columns, and
# create the train/test split plus the CV splitter used by the search below.
weight = 'linear'  # NOTE(review): not referenced by any visible cell - confirm it is still needed
df = pd.read_csv(os.path.join(DATA_DIR, 'ABT', 'abt.csv'))
X = df.loc[:, df.columns != 'VALUE']  # every column except the target
y = df['VALUE']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=None)
# Seed the repeated K-fold splitter as well: without a random_state the folds
# (and therefore the CV scores and the selected alpha) change on every run,
# even though the train/test split above is seeded.
cv = RepeatedKFold(n_splits=2, n_repeats=100, random_state=42)

In [64]:
# Tune the Lasso regularisation strength with a grid search, scored by
# negated RMSE over the `cv` splitter.
lasso_model = Lasso()
parameters = {'alpha': [1e-2, 1e-1, 1]}
lasso_result = GridSearchCV(
    lasso_model,
    parameters,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)
lasso_result.fit(X_train, y_train)
print(lasso_result.best_estimator_)

# Evaluate the refitted best estimator on the held-out test split.
best_model = lasso_result.best_estimator_
y_pred = best_model.predict(X_test)
# RMSE via an explicit square root: the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt(mse(...)) gives the same value on every version.
print(np.sqrt(mse(y_test, y_pred)))
print(mae(y_test, y_pred))

Lasso(alpha=0.01)
2.55924839614277
1.9464006009433386


Ridge Regression

In [78]:
# Reload the data and rebuild the split / CV splitter for the models below.
# NOTE(review): this cell reads 'data.csv' while the Lasso section reads
# 'abt.csv', and it overwrites df / X / y / the split / cv - confirm the
# different file is intentional, otherwise the model scores are not comparable.
df = pd.read_csv(os.path.join(DATA_DIR, 'ABT', 'data.csv'))
X = df.loc[:, df.columns != 'VALUE']  # every column except the target
y = df['VALUE']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=None)
# Seed the splitter so the CV folds (and the selected hyperparameters) are
# reproducible across re-runs, like the seeded train/test split above.
cv = RepeatedKFold(n_splits=2, n_repeats=1, random_state=42)

In [66]:
# Tune the Ridge regularisation strength with a grid search, scored by
# negated RMSE over the `cv` splitter.
model = Ridge()
parameters = {'alpha': [1, 10, 100]}
Ridge_reg = GridSearchCV(
    model,
    parameters,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)
Ridge_reg.fit(X_train, y_train)
print(Ridge_reg.best_estimator_)

# Evaluate the refitted best estimator on the held-out test split.
best_model = Ridge_reg.best_estimator_
y_pred = best_model.predict(X_test)
# RMSE via an explicit square root: the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt(mse(...)) gives the same value on every version.
print(np.sqrt(mse(y_test, y_pred)))
print(mae(y_test, y_pred))

Ridge(alpha=10)
2.5742715742592477
1.9642500748716225


XGBoost Regression

In [71]:
# Untuned XGBoost regressor: constructed with no arguments, so the library
# defaults apply. Fitted and evaluated in the following cell.
xg_reg = XGBRegressor()

In [75]:
# Fit the default XGBoost regressor and report test-set RMSE and MAE.
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)
# RMSE via an explicit square root: the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt(mse(...)) gives the same value on every version.
print(np.sqrt(mse(y_test, y_pred)))
print(mae(y_test, y_pred))

1.9398655941364524
0.7815201564553448


In [98]:
# Candidate values per XGBoost hyperparameter, consumed as the search space
# of the hyperparameter search in the next cell.
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    max_depth=[3, 4, 5],
)

In [102]:
# Bayesian hyperparameter search for XGBoost over the `params` space, scored
# by negated RMSE over the `cv` splitter.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: __init__() got an unexpected keyword argument 'iid'". That looks
# like an installed scikit-optimize release that still passes `iid` to
# scikit-learn's search base class, which no longer accepts it - presumably
# upgrading scikit-optimize resolves it; confirm in the environment.
xg_reg = XGBRegressor()
xg = BayesSearchCV(
    xg_reg,
    search_spaces=params,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    cv=cv,
    random_state=42,  # seed the optimizer so the sampled candidates are reproducible
)
xg.fit(X_train, y_train)

# Evaluate the refitted best estimator on the held-out test split.
best_model = xg.best_estimator_
print(best_model)
y_pred = best_model.predict(X_test)
# RMSE via an explicit square root: the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt(mse(...)) gives the same value on every version.
print(np.sqrt(mse(y_test, y_pred)))
print(mae(y_test, y_pred))

TypeError: __init__() got an unexpected keyword argument 'iid'

In [9]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [10]:
def baseline_model(input_dim=13, hidden_units=13):
    """Build and compile a single-hidden-layer regression network.

    Args:
        input_dim: Number of input features expected by the first layer.
            Defaults to 13, the value that used to be hard-coded; pass the
            feature count of the training data (e.g. ``X_train.shape[1]``)
            when it differs, otherwise fitting fails on a shape mismatch.
        hidden_units: Width of the hidden ReLU layer. Defaults to 13.

    Returns:
        A compiled ``Sequential`` model with one linear output unit, trained
        with mean-squared-error loss and the Adam optimizer.
    """
    model = Sequential()
    # Hidden layer; `input_dim` fixes the expected number of feature columns.
    model.add(Dense(hidden_units, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    # Single output unit with no activation, i.e. a plain regression output.
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [None]:
# Show the raw feature matrix that feeds the pipeline.
print(X_train.values)
# Standardise the features and fit the Keras regressor inside one pipeline,
# so the scaler is re-fitted on the training part of every CV fold.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
# Score explicitly with negated MSE instead of relying on the estimator's
# default `score`: the Keras wrapper returns the *negated* loss, so the value
# previously printed under the "MSE" label came out negative. `results` keeps
# the negated-MSE sign convention; only the reported mean is flipped.
results = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
print("Standardized: %.2f (%.2f) MSE" % (-results.mean(), results.std()))

[[23.         73.33333333 39.16666667 ...  0.          0.
   1.        ]
 [26.         76.          5.83333333 ...  0.          0.
   0.        ]
 [28.         71.66666667  3.         ...  0.          0.
   1.        ]
 ...
 [26.         49.16666667  3.66666667 ...  0.          0.
   1.        ]
 [25.         34.83333333 11.16666667 ...  0.          0.
   1.        ]
 [34.         56.16666667 13.66666667 ...  0.          0.
   0.        ]]
