## Importing necessary packages

In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
seed = 42  # shared random_state so stochastic estimators give repeatable results

In [7]:
# Load the train and test CSVs.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Label-encode every object-typed column. Each encoder is fitted on the
# concatenation of the train and test values so that a category present in
# only one of the two files still receives a code.
categorical_cols = [col for col in train.columns if train[col].dtype == 'object']
for col in categorical_cols:
    train_values = list(train[col].values)
    test_values = list(test[col].values)
    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)
    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

# Report the resulting frame dimensions.
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

Shape train: (4209, 378)
Shape test: (4209, 377)


In [27]:
from sklearn.decomposition import PCA, FastICA
n_comp = 100

# Both decompositions are fitted on every training column except the target.
train_features = train.drop(["y"], axis=1)

# PCA: fit on the training features, then project the test frame with the
# already-fitted components.
pca = PCA(n_components=n_comp, random_state=seed)
pca2_results_train = pca.fit_transform(train_features)
pca2_results_test = pca.transform(test)

# ICA: same fit-on-train / transform-test protocol.
ica = FastICA(n_components=n_comp, random_state=seed)
ica2_results_train = ica.fit_transform(train_features)
ica2_results_test = ica.transform(test)

# Give every component an explicit, unique label ('pca_1'..., 'ica_1'...).
# Without this, both frames would carry default labels 0..n-1 and the
# concatenated result would have duplicate column labels, making label-based
# selection such as train_reduced[0] ambiguous.
pca_cols = ['pca_' + str(i) for i in range(1, pca2_results_train.shape[1] + 1)]
ica_cols = ['ica_' + str(i) for i in range(1, ica2_results_train.shape[1] + 1)]

# Only the components are kept as features; the original columns are not
# carried over into the reduced frames.
train_reduced = pd.concat(
    [pd.DataFrame(pca2_results_train, columns=pca_cols),
     pd.DataFrame(ica2_results_train, columns=ica_cols)],
    axis=1)
test_reduced = pd.concat(
    [pd.DataFrame(pca2_results_test, columns=pca_cols),
     pd.DataFrame(ica2_results_test, columns=ica_cols)],
    axis=1)

In [28]:
# Feature matrix: the PCA/ICA component frame. Target: the 'y' column of train.
# (Alternative kept for reference, using the raw columns instead:
#  X, y = train.drop('y', axis=1).values, train.y.values)
X, y = train_reduced, train.y.values
# Function-call form of print: valid on Python 3 and prints the same tuple on
# Python 2, matching the print(...) style used by the other cells.
print(X.shape)

(4209, 200)


In [29]:
# Base estimator; its random_state comes from the notebook-level seed.
model = DecisionTreeRegressor(random_state=seed)

# Search space: 2 x 2 x 3 = 12 parameter combinations.
# NOTE(review): the output recorded under this cell reports 27 candidates and a
# 3 x 3 x 3 param_grid, so it appears to come from an earlier version of this
# grid -- re-run the cell to refresh it.
DTR_params = dict(
    max_depth=[4, 8],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated exhaustive search over the grid, using all cores.
reg = GridSearchCV(estimator=model, param_grid=DTR_params, cv=5, n_jobs=-1, verbose=1)
reg.fit(X, y)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:    6.4s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=42,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'min_samples_split': [2, 4, 8], 'max_depth': [2, 4, 8], 'min_samples_leaf': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [30]:
# Best mean cross-validated score, then the parameter combination that produced it.
for summary_item in (reg.best_score_, reg.best_params_):
    print(summary_item)

0.370429167663
{'min_samples_split': 2, 'max_depth': 4, 'min_samples_leaf': 4}


In [26]:
# One line per candidate: mean test score, its standard deviation across the
# folds, and the parameter combination that was evaluated.
tree_cv_results = reg.cv_results_
means, stds, params = (
    tree_cv_results[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for candidate in zip(means, stds, params):
    print("%f (%f) with: %r" % candidate)

0.261224 (0.016463) with: {'min_samples_split': 2, 'max_depth': 2, 'min_samples_leaf': 1}
0.261224 (0.016463) with: {'min_samples_split': 4, 'max_depth': 2, 'min_samples_leaf': 1}
0.261224 (0.016463) with: {'min_samples_split': 8, 'max_depth': 2, 'min_samples_leaf': 1}
0.260896 (0.014668) with: {'min_samples_split': 2, 'max_depth': 2, 'min_samples_leaf': 2}
0.260896 (0.014668) with: {'min_samples_split': 4, 'max_depth': 2, 'min_samples_leaf': 2}
0.260896 (0.014668) with: {'min_samples_split': 8, 'max_depth': 2, 'min_samples_leaf': 2}
0.261132 (0.015001) with: {'min_samples_split': 2, 'max_depth': 2, 'min_samples_leaf': 4}
0.261132 (0.015001) with: {'min_samples_split': 4, 'max_depth': 2, 'min_samples_leaf': 4}
0.261132 (0.015001) with: {'min_samples_split': 8, 'max_depth': 2, 'min_samples_leaf': 4}
0.270803 (0.141162) with: {'min_samples_split': 2, 'max_depth': 4, 'min_samples_leaf': 1}
0.272072 (0.142596) with: {'min_samples_split': 4, 'max_depth': 4, 'min_samples_leaf': 1}
0.272072 (

In [37]:
# Predict on the test features and assemble the submission frame.
# NOTE(review): `xgb_model` and `x_test` are not defined in any cell visible
# here -- confirm they are created earlier, otherwise this cell fails on a
# fresh kernel.
y_pred = xgb_model.predict(x_test)
submission_ids = test['ID'].astype(np.int32)
output = pd.DataFrame({'id': submission_ids, 'y': y_pred})
# Writing the file is currently disabled:
#output.to_csv('xgb_6.csv', index=False)

## Trying base data with lasso

In [34]:
from sklearn.linear_model import Lasso
# random_state is fixed because the grid below includes selection='random',
# which picks coordinates at random; without a seed those candidates (and the
# reported best score) would differ from run to run.
lasso_reg = Lasso(max_iter=6000, random_state=seed)

# Search space: 3 * 3 * 2 * 2 * 2 * 3 * 2 = 432 candidates.
# NOTE(review): `normalize` was removed from Lasso in newer scikit-learn
# releases -- confirm the installed version before re-running this cell.
lasso_params = {
    'max_iter': [5000, 6000, 7000],
    'alpha': [1.55, 1.57, 1.6],
    'fit_intercept': [True,False],
    'normalize': [True, False],
    'precompute': [True, False],
    'tol': [0.004, 0.0045, 0.005],
    'selection': ['random', 'cyclic']
}

# 5-fold cross-validated exhaustive search over the grid, using all cores.
lasso_reg_cv = GridSearchCV(lasso_reg, lasso_params, cv = 5, verbose=1, n_jobs = -1)
lasso_reg_cv.fit(X, y)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 414 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 914 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 1614 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:  1.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=6000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'normalize': [True, False], 'selection': ['random', 'cyclic'], 'fit_intercept': [True, False], 'max_iter': [5000, 6000, 7000], 'precompute': [True, False], 'tol': [0.004, 0.0045, 0.005], 'alpha': [1.55, 1.57, 1.6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [35]:
# Headline result: best mean cross-validated score and its parameters.
for summary_item in (lasso_reg_cv.best_score_, lasso_reg_cv.best_params_):
    print(summary_item)

# Full table: mean test score, standard deviation across folds, and the
# parameter combination, one line per candidate.
lasso_cv_results = lasso_reg_cv.cv_results_
means, stds, params = (
    lasso_cv_results[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for candidate in zip(means, stds, params):
    print("%f (%f) with: %r" % candidate)

0.433292773306
{'normalize': False, 'selection': 'random', 'fit_intercept': True, 'max_iter': 6000, 'precompute': False, 'tol': 0.005, 'alpha': 1.55}
-0.008255 (0.011708) with: {'normalize': True, 'selection': 'random', 'fit_intercept': True, 'max_iter': 5000, 'precompute': True, 'tol': 0.004, 'alpha': 1.55}
-0.008255 (0.011708) with: {'normalize': True, 'selection': 'random', 'fit_intercept': True, 'max_iter': 5000, 'precompute': True, 'tol': 0.0045, 'alpha': 1.55}
-0.008255 (0.011708) with: {'normalize': True, 'selection': 'random', 'fit_intercept': True, 'max_iter': 5000, 'precompute': True, 'tol': 0.005, 'alpha': 1.55}
-0.008255 (0.011708) with: {'normalize': True, 'selection': 'cyclic', 'fit_intercept': True, 'max_iter': 5000, 'precompute': True, 'tol': 0.004, 'alpha': 1.55}
-0.008255 (0.011708) with: {'normalize': True, 'selection': 'cyclic', 'fit_intercept': True, 'max_iter': 5000, 'precompute': True, 'tol': 0.0045, 'alpha': 1.55}
-0.008255 (0.011708) with: {'normalize': True, '