In [1]:
import pandas as pd

# The file is read with header=None, so the column labels are supplied
# here explicitly, in file order.
adult = pd.read_csv(
    'adult.data',
    header=None,
    names=[
        'Age',
        'Work-Class',
        'fnlwgt',
        'Education',
        "Education-Num",
        "Marital-Status",
        "Occupation",
        'Relationship',
        "Race",
        'Sex',
        "Capital-gain",
        "Capital-loss",
        "Hours-per-week",
        'Native-Country',
        'Earnings-Raw',
    ],
)
# Remove rows in which every field is missing; rows with at least one
# value present are kept.
adult.dropna(how='all', inplace=True)
# Display the resulting column index as the cell output.
adult.columns

Index(['Age', 'Work-Class', 'fnlwgt', 'Education', 'Education-Num',
       'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex',
       'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native-Country',
       'Earnings-Raw'],
      dtype='object')

In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) of the
# Hours-per-week column.
adult['Hours-per-week'].describe()

count    32561.000000
mean        40.437456
std         12.347429
min          1.000000
25%         40.000000
50%         40.000000
75%         45.000000
max         99.000000
Name: Hours-per-week, dtype: float64

In [3]:
# Median of the Education-Num column.
adult['Education-Num'].median()

10.0

In [4]:
# Distinct values of the Work-Class column. As the output shows, every value
# keeps a leading space from the raw file, so string comparisons against
# this data must include that space. ' ?' presumably marks unknown
# entries -- confirm against the dataset description.
adult['Work-Class'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [5]:
# Boolean feature: True for rows whose Hours-per-week is strictly above 40.
adult['LongHours'] = adult['Hours-per-week'].gt(40)

In [6]:
# Feature matrix: the five columns listed below, as a plain NumPy array.
X = adult[[
    'Age',
    'Education-Num',
    'Capital-gain',
    'Capital-loss',
    'Hours-per-week',
]].values
# Boolean target: True where Earnings-Raw equals ' >50K'. The leading space
# is part of the value being compared.
y = adult['Earnings-Raw'].eq(' >50K').values

In [7]:
from sklearn.feature_selection import SelectKBest, chi2
# Univariate feature selector: scores each feature with the chi-squared
# statistic and keeps the k=3 highest-scoring ones.
transformer = SelectKBest(score_func=chi2, k=3)

In [8]:
# Fit the chi2 selector on (X, y) and keep only the selected columns.
Xt_chi2 = transformer.fit_transform(X, y)
# One score per input feature (five values for the five columns of X).
print(transformer.scores_)

[8.60061182e+03 2.40142178e+03 8.21924671e+07 1.37214589e+06
 6.47640900e+03]


In [11]:
from scipy.stats import pearsonr
import numpy as np

def multivariate_pearsonr(X, y):
    """Score every column of X by its absolute Pearson correlation with y.

    The return value has the (scores, pvalues) shape expected from a
    score function passed to SelectKBest.

    Parameters
    ----------
    X : 2-D array, indexed as X[:, column]
        One feature per column.
    y : 1-D array
        Target values, one per row of X.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The absolute correlation coefficient and the p-value reported by
        scipy.stats.pearsonr for each column, in column order.
    """
    # Run the test once per column and keep the (coefficient, p-value) pairs.
    per_column = [pearsonr(X[:, idx], y) for idx in range(X.shape[1])]
    # The absolute value is taken so the sign of the correlation does not
    # affect the score.
    abs_coefficients = np.array([abs(coef) for coef, _ in per_column])
    p_values = np.array([p for _, p in per_column])
    return (abs_coefficients, p_values)

In [12]:
# Same top-3 selection, but scored with the absolute Pearson correlation.
# Note: this rebinds `transformer`, replacing the chi2 selector created in
# an earlier cell; re-running the chi2 fit cell after this one would use
# the Pearson-based selector instead.
transformer = SelectKBest(score_func=multivariate_pearsonr, k=3)
Xt_pearson = transformer.fit_transform(X, y)
# One score per input feature, in the column order of X.
print(transformer.scores_)

[0.2340371  0.33515395 0.22332882 0.15052631 0.22968907]


In [13]:
from sklearn.tree import DecisionTreeClassifier
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `cross_val_score` now lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score

# Fixed random_state so the tree is built the same way on every run.
clf = DecisionTreeClassifier(random_state=14)
# cv=3 is passed explicitly: it was the implicit default of the old
# cross_validation module, whereas model_selection defaults to 5 folds from
# scikit-learn 0.22 onwards. Pinning it keeps both evaluations on the same
# protocol regardless of the installed version.
score_chi2 = cross_val_score(clf, Xt_chi2, y, scoring='accuracy', cv=3)
score_pearson = cross_val_score(clf, Xt_pearson, y, scoring='accuracy', cv=3)

In [14]:
# Report the mean cross-validated accuracy obtained with each feature
# subset, to three decimal places.
print("Chi2 performs: %.3f" % score_chi2.mean())
print("Pearson performs: %.3f" % score_pearson.mean())

Chi2 performs: 0.829
Pearson performs: 0.771
