In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import scipy.io as sio
import os
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.linear_model import Ridge, BayesianRidge, LinearRegression
from sklearn import cross_validation
from sklearn.metrics import mean_squared_error
# linear_model.Ridge()

In [10]:
clf_names = ["Ridge",
             "BayesianRidge",
             "LinearRegression"
            ]
classifiers = [Ridge(),
               BayesianRidge(),
               LinearRegression()
              ]

In [11]:
def print_top_features(clf, top_k=10, feature_names = []):
    """Prints features with the highest coefficient values, per class"""
#     feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        print clf.coef_
        top_features_idx = np.argsort(clf.coef_[i])[-top_k:]
        if feature_names == []:
            print("{}:{}".format(class_label, top_features_idx))
        else:
            print("%s: %s" % (class_label,
                  " ".join(feature_names[j] for j in top_features_idx)))

In [12]:
def plot_important_features(features, clf, feature_name_list=[], num_selected_features=50):
    importances = clf.feature_importances_
    X = np.array(features)
    std = np.std([tree.feature_importances_ for tree in clf.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(0,num_selected_features):
        if feature_name_list == []:
            print("%d, (%f)" % (f + 1, importances[indices[f]]))
        else:
            print("%d. feature %s (%f)" % (f + 1, feature_name_list[indices[f]], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(num_selected_features), importances[indices[:num_selected_features]],
           color="r", yerr=std[indices[:num_selected_features]], align="center")
    plt.xticks(range(num_selected_features), indices[:num_selected_features])
    plt.xlim([-1, num_selected_features])
    plt.show()

In [18]:
def run_experiments(X, Y, feature_name_list=[], top_k=50):
    # iterate over classifiers
    '''
    print('Accuracy of different classifier, without Normalization')
    for name, clf in zip(clf_names, classifiers):
        scores = cross_val_score(clf, X, Y, cv=5)
        print("{}, {}/{}".format(name, np.mean(scores), np.std(scores)))
    
    adaboost_clf = AdaBoostClassifier()
    adaboost_clf.fit(X, Y)
    plot_important_features(X, adaboost_clf, feature_name_list, 50)
    '''
    
    print('Accuracy of different classifier, with Normalization')
    normalization_methods_name = ['StandardScaler', 'Normalizer']#, 'RobustScaler'
    normalization_methods = [StandardScaler(), Normalizer()]#, RobustScaler(),
    # iterate over classifiers
    for name, clf in zip(clf_names, classifiers):
        print('\n')
        for norm_name, norm in zip(normalization_methods_name, normalization_methods):
            loo = cross_validation.KFold(5)
            scores = cross_validation.cross_val_score(make_pipeline(norm, clf), X, Y, scoring='neg_mean_squared_error', cv=loo)
            clf.fit(X,Y)
            print('{},{},mean_squared_error:{}'.format(name, norm_name,mean_squared_error(Y, clf.predict(X))))
#             print("{},{}, {}/{}".format(name, norm_name, np.mean(scores), np.std(scores)))
#             print clf.coef_
#     plot_important_features(X_scaled, clf, feature_name_list, top_k)
    

# prepare data into libsvm format

In [19]:
import numpy as np
import pandas as pd

data_file_list = ['network_language', 'network_perIQ', 'network_spatial', 
                  'network_verIQ','volume_language','volume_perIQ',
                  'volume_spatial', 'volume_verIQ']
data_path = "../data/featsele_IQ_Cog/"
for data_file in data_file_list:
    print('###############################################\n')
    print('Data:{}\n'.format(data_file))
    # comma delimited is the default
    df = pd.read_csv(os.path.join(data_path, data_file + '.csv'), header = 0)

    # put the original column names in a python list
    original_headers = list(df.columns.values)
    feature_name_list = original_headers[1:]
    # remove the non-numeric columns
    df = df._get_numeric_data()

    # put the numeric column names in a python list
    numeric_headers = list(df.columns.values)

    # create a numpy array with the numeric values for input into scikit-learn
    numpy_array = np.array(df.as_matrix())
    Y = numpy_array[:,0]
    X = numpy_array[:,1:]
    run_experiments(X, Y, feature_name_list)

###############################################

Data:network_language

Accuracy of different classifier, with Normalization


Ridge,StandardScaler,mean_squared_error:43.3061959802
Ridge,Normalizer,mean_squared_error:43.3061959802


BayesianRidge,StandardScaler,mean_squared_error:43.5859108922
BayesianRidge,Normalizer,mean_squared_error:43.5859108922


LinearRegression,StandardScaler,mean_squared_error:39.0053979308
LinearRegression,Normalizer,mean_squared_error:39.0053979308
###############################################

Data:network_perIQ

Accuracy of different classifier, with Normalization


Ridge,StandardScaler,mean_squared_error:77.2349732587
Ridge,Normalizer,mean_squared_error:77.2349732587


BayesianRidge,StandardScaler,mean_squared_error:77.4652875978
BayesianRidge,Normalizer,mean_squared_error:77.4652875978


LinearRegression,StandardScaler,mean_squared_error:66.0398924589
LinearRegression,Normalizer,mean_squared_error:66.0398924589
#########################################