### Load Data

In [3]:
from data_cleaning import get_test_data, get_training_data, target_by_household
from column_categories import building_info

# Column names that the later cells refer to.
household_id = 'idhogar'
target_column = 'Target'

# Load both splits; the loaders print their progress and the frame shapes.
train = get_training_data()
test = get_test_data()

# Household key followed by every building-related column.
building_columns = [household_id, *building_info]
building_rows = train[building_columns]

# presumably a per-household target keyed by `idhogar` — verify against data_cleaning
target_household_map = target_by_household(train)

# Collapse to one row per household: a flag becomes 1 when any row of the
# household has a truthy value in that column, otherwise 0. The target is
# then attached by joining on the household index.
building_df = (
    building_rows
    .groupby(household_id)
    .any()
    .astype(int)
    .join(target_household_map)
)

  return f(*args, **kwds)


Loading data from data/train.csv...
(9557, 142)

Checking for inconsistent targets...
(85,)
(9557, 142)
Cleaning inconsistent targets...
Checking inconsistent targets are gone...
(0,)
(9557, 142)

Loading data from data/test.csv...
(23856, 141)



### Build Pipeline

In [4]:
# Separate the household-level frame into labels and the remaining feature columns.
y = building_df[target_column]
X = building_df.drop(columns=[target_column])

In [None]:
from __future__ import print_function, division

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2

pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', LinearSVC())
])

N_FEATURES_OPTIONS = [10, 18, 26]
C_OPTIONS = [1, 10, 100]
param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
]
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

grid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid)
grid.fit(X, y)

In [None]:
# GridSearchCV enumerates the dicts of `param_grid` one after another, so the
# flat `mean_test_score` vector is the full PCA/NMF sub-grid followed by the
# SelectKBest sub-grid. It is therefore NOT a regular (C, reducer, n_features)
# cube, and a plain reshape would mix scores of different reducers. Group the
# scores by the parameters each candidate was actually fitted with instead.
scores_by_cell = {}
reducer_order = []
for params, score in zip(grid.cv_results_['params'],
                         grid.cv_results_['mean_test_score']):
    reducer_name = type(params['reduce_dim']).__name__
    if reducer_name not in reducer_order:
        reducer_order.append(reducer_name)
    # PCA/NMF call their size parameter `n_components`, SelectKBest calls it `k`.
    n_features = params.get('reduce_dim__n_components',
                            params.get('reduce_dim__k'))
    scores_by_cell.setdefault((reducer_name, n_features), []).append(score)

# Rows follow the order in which the reducers first appear in the grid, which
# is the order `reducer_labels` is written in; the two must have equal length.
assert len(reducer_order) == len(reducer_labels)

# select score for best C -> array of shape (n_reducers, n_feature_options)
mean_scores = np.array([
    [np.max(scores_by_cell[(reducer_name, n_features)])
     for n_features in N_FEATURES_OPTIONS]
    for reducer_name in reducer_order
])
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
               (len(reducer_labels) + 1) + .5)

Convert the scores to a long-format DataFrame (one row per technique and number of features) for plotting.

In [None]:
# pandas is not imported by any other cell shown in this notebook; import it
# here so the cell also runs on a fresh kernel.
import pandas as pd

# One record per (technique, number of features) pair, in long format so that
# seaborn can group the bars by technique.
scores = [
    (n_features, score, label)
    for label, reducer_scores in zip(reducer_labels, mean_scores)
    for n_features, score in zip(N_FEATURES_OPTIONS, reducer_scores)
]
data = pd.DataFrame.from_records(
    scores, columns=['Number of features', 'Accuracy', 'Technique'])
data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Grouped bar chart: one group per feature count, one bar per technique.
lm = sns.catplot(
    data=data,
    kind='bar',
    x='Number of features',
    y='Accuracy',
    hue='Technique',
)

# Zoom the y-axis onto the observed accuracies, padded by 10% of their spread
# on each side, so that small differences between the bars stay visible.
lowest_accuracy = data['Accuracy'].min()
highest_accuracy = data['Accuracy'].max()
val_range = highest_accuracy - lowest_accuracy
y_min = lowest_accuracy - (val_range * 0.1)
y_max = highest_accuracy + (val_range * 0.1)

# catplot returns a grid of axes; this plot has a single facet at [0, 0].
axes = lm.axes
axes[0, 0].set_ylim(y_min, y_max)