In [285]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import FastICA, KernelPCA, PCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest, \
f_regression, mutual_info_regression, VarianceThreshold, RFE
from sklearn.ensemble import RandomForestRegressor


# Univariate scoring functions (usable as ``score_func`` for SelectKBest),
# keyed by a short display name.
SELECTORS = {'f_reg': f_regression, 'mut_info': mutual_info_regression}


# Decomposition (feature-extraction) classes keyed by a short display name.
EXTRACTORS = {'fast_ica': FastICA, 'kernel_pca': KernelPCA, 'pca': PCA,
              'trunk_svd': TruncatedSVD}


# Estimator class instantiated by build_model().  This name used to be
# assigned three times in a row (RandomForestRegressor, then
# linear_model.Ridge, then linear_model.LinearRegression); only the last
# assignment ever took effect, so the two dead ones are dropped.  Swap the
# class here to evaluate a different model.
MODEL = linear_model.LinearRegression


In [286]:
# Load the tab-separated training and held-out feature tables
# (paths are relative to the notebook's working directory).
train = pd.read_csv('features.txt', sep='\t')
test = pd.read_csv('featuresTest.txt', sep='\t')

In [287]:
# Convert the loaded tables to plain NumPy arrays so columns can be sliced
# by position.  DataFrame.as_matrix() was deprecated in pandas 0.23 and
# removed in 1.0; np.asarray produces the same array on every pandas
# version and, unlike a DataFrame method call, is a no-op when this cell is
# re-run on values that have already been converted.
train = np.asarray(train)
test = np.asarray(test)

In [288]:
def get_X_y(raw_data):
    """Split a raw 2-D data matrix into a feature matrix and a target vector.

    Column 1 is taken as the target; columns 0-3 are discarded and every
    remaining column (index 4 onward) is kept as a feature.

    Args:
        raw_data: 2-D NumPy array laid out as described above.

    Returns:
        Tuple ``(X, y)`` of newly allocated ``float32`` arrays: the feature
        matrix and the target vector.
    """
    target = np.array(raw_data[:, 1], dtype=np.float32)
    # Slicing from column 4 keeps exactly what deleting columns 0..3 leaves.
    features = np.array(raw_data[:, 4:], dtype=np.float32)
    return features, target

# Derive the feature matrix and target vector for each split.
X_train, y_train = get_X_y(raw_data=train)
X_test, y_test = get_X_y(raw_data=test)

In [289]:
def build_model():
    """Return a new, unfitted instance of the configured MODEL class.

    MODEL is called with no arguments, so the estimator's own defaults apply.
    """
    estimator = MODEL()
    return estimator
    
    
def print_rmse(selector_name, X, y, X_test, y_test):
    """Fit a fresh model and print its root-mean-squared error on test data.

    Args:
        selector_name: Label inserted into the printed report line.
        X: Training feature matrix.
        y: Training targets.
        X_test: Feature matrix the fitted model predicts on.
        y_test: Targets the predictions are scored against.

    Returns:
        None; the result is only printed.
    """
    fitted = build_model()
    fitted.fit(X, y)
    mse = mean_squared_error(y_test, fitted.predict(X_test))
    report = 'feature selection {}, rmse={:.6f}'
    print(report.format(selector_name, np.sqrt(mse)))

def test_feature_selection():
    """Compare test RMSE with and without feature selection.

    A VarianceThreshold is fitted on the training features and applied to
    both splits.  SelectKBest (f_regression, k=10) and RFE (eliminating down
    to a single feature) are then each fitted on the filtered training data
    and scored through print_rmse.  Finally a baseline model is fitted on
    the raw, unfiltered features.

    Reads the module-level X_train, y_train, X_test and y_test and prints
    one line per configuration; nothing is returned.
    """
    variance_filter = VarianceThreshold(threshold=(0.99 * (1 - 0.99)))
    variance_filter.fit(X_train)
    filtered_train = variance_filter.transform(X_train)
    filtered_test = variance_filter.transform(X_test)

    # An ordered list (not a dict) so the report always prints kBest first.
    candidates = [
        ('kBest', SelectKBest(score_func=f_regression, k=10)),
        ('rfe', RFE(estimator=build_model(), n_features_to_select=1, step=1)),
    ]
    for label, picker in candidates:
        picker.fit(filtered_train, y_train)
        print_rmse(label,
                   picker.transform(filtered_train), y_train,
                   picker.transform(filtered_test), y_test)

    # Baseline: the same model on the raw features, no filtering or selection.
    baseline = build_model()
    baseline.fit(X_train, y_train)
    baseline_mse = mean_squared_error(y_test, baseline.predict(X_test))
    print('w/o feature selection, rmse={:.6f}'.format(np.sqrt(baseline_mse)))

In [290]:
# Run the feature-selection comparison; prints one RMSE line per configuration.
test_feature_selection()

feature selection kBest, rmse=0.077807
feature selection rfe, rmse=0.079861
w/o feature selection, rmse=0.076850


In [291]:
def test_feature_extraction(decomp_name):
    """Reduce the features to 10 components and print the test RMSE.

    The chosen decomposition is fitted on the module-level X_train, both
    splits are transformed with it, and a LinearRegression trained on the
    reduced training data is scored on the reduced test data.

    Args:
        decomp_name: Key into EXTRACTORS selecting the decomposition class.

    Raises:
        KeyError: If ``decomp_name`` is not a key of EXTRACTORS.
    """
    # The registry is named EXTRACTORS.  The previous lookup went through
    # DECOMS, which no visible cell defines, so it raised NameError on a
    # freshly restarted kernel.
    decomp = EXTRACTORS[decomp_name](n_components=10)
    decomp.fit(X_train)
    X_train_decomp = decomp.transform(X_train)
    X_test_decomp = decomp.transform(X_test)

    clf = linear_model.LinearRegression()
    clf.fit(X_train_decomp, y_train)
    y_pred = clf.predict(X_test_decomp)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print('decom_name: {}, rmse={:.6f}'.format(decomp_name, rmse))

In [292]:
# Baseline: plain linear regression on the untransformed features.
clf = linear_model.LinearRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('decom_name: {}, rmse={:.6f}'.format('None', rmse))

# Evaluate every registered decomposition.  The registry is EXTRACTORS; the
# loop previously iterated DECOMS, which no visible cell defines, so it
# raised NameError on a freshly restarted kernel.
for name in EXTRACTORS:
    test_feature_extraction(name)

decom_name: None, rmse=0.076850
decom_name: kernel_pca, rmse=0.077700
decom_name: pca, rmse=0.077700
decom_name: trunk_svd, rmse=0.077710
decom_name: fast_ica, rmse=0.077700
