In [33]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from data_cleaning import get_training_data, get_test_data, clean_non_numeric_features, target_by_household, target_table_breakdown

household_id = 'idhogar'
target_column = 'Target'

def get_household_level_data():
    """Build one training row per household.

    Loads the individual-level training data, runs it through
    ``clean_non_numeric_features``, and collapses it to household level.

    Returns:
        DataFrame indexed by ``household_id``. Every feature column is 1 if
        any member of the household has a truthy (non-zero) value for it and
        0 otherwise, joined with the result of ``target_by_household``.
    """
    individuals = clean_non_numeric_features(get_training_data())
    # presumably one target per household, indexed by household id -- confirm
    # against data_cleaning.target_by_household
    household_targets = target_by_household(individuals)
    # The target is dropped before aggregating so it is not flattened to 0/1
    # by any(); it is re-attached from household_targets below.
    household_flags = (
        individuals
        .drop(columns=target_column)
        .groupby(household_id)
        .any()
        .astype(int)
    )
    return household_flags.join(household_targets)
    
def get_balanced_data(df, n, random_state=1):
    """Shuffle ``df`` and keep at most ``n`` rows for each target value.

    Args:
        df: DataFrame containing the ``target_column`` column.
        n: Maximum number of rows to keep per distinct target value.
        random_state: Seed for the shuffle, so the selection is repeatable.

    Returns:
        The retained rows, in shuffled order, with their original index labels.
    """
    # Shuffle first so head(n) picks a random subset of each class rather
    # than the first n rows in file order.
    shuffled = df.sample(frac=1, random_state=random_state)
    per_target = shuffled.groupby(target_column)
    return per_target.head(n)

def convert_to_binary_targets(df, true_target):
    """Return a copy of ``df`` whose target column is recoded to 0/1.

    Args:
        df: DataFrame containing the ``target_column`` column.
        true_target: Target value that should map to 1; every other value
            maps to 0.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    binary_df = df.copy()
    is_true_target = binary_df[target_column] == true_target
    binary_df[target_column] = is_true_target.astype(int)
    return binary_df

def feature_selector(selector, data, target):
    """Fit ``selector`` and return the names of the columns it keeps.

    Args:
        selector: Object exposing ``fit(data, target)`` and
            ``get_support(indices=True)`` (e.g. scikit-learn's SelectKBest).
        data: DataFrame of candidate feature columns.
        target: Labels aligned with the rows of ``data``.

    Returns:
        List of column names from ``data`` that the fitted selector retained.
        The target column name is not included.
    """
    selector.fit(data, target)
    # get_support(indices=True) yields positional indices of the kept
    # columns; map them back to names so callers can subset other frames.
    kept_positions = selector.get_support(indices=True)
    return data.columns[kept_positions].tolist()

def train(clf, train_data, target_value, k=15, random_state=10):
    """Fit ``clf`` to separate ``target_value`` from all lower target values.

    Rows whose target is greater than ``target_value`` are discarded, the
    remaining targets are recoded to 1 (== ``target_value``) / 0 (lower), the
    ``k`` best features by chi-squared score are selected, and the classifier
    is fitted on those features only.

    Args:
        clf: Unfitted estimator exposing ``fit(X, y)``.
        train_data: DataFrame of features plus the ``target_column`` column.
        target_value: Target value treated as the positive class.
        k: Number of features to keep (default 15, the original hard-coded
            value). Clamped to the number of available feature columns.
        random_state: Seed passed to ``get_balanced_data`` (default 10, the
            original hard-coded value).

    Returns:
        Tuple ``(clf, k_features)``: the fitted estimator and the list of
        selected feature names (without the target column).
    """
    is_n = train_data.loc[train_data[target_column] <= target_value]
    is_n = convert_to_binary_targets(is_n, target_value)

    candidate_features = is_n.drop(target_column, axis=1)
    # Never ask for more features than exist; some scikit-learn versions
    # reject k > n_features outright.
    sel = SelectKBest(chi2, k=min(k, candidate_features.shape[1]))
    k_features = feature_selector(sel, candidate_features, is_n[target_column])
    print(k_features)
    is_n = is_n[k_features + [target_column]]

    # NOTE(review): if 'total' holds the per-class row counts, taking max()
    # means no class is ever truncated by get_balanced_data, so this step
    # only shuffles. min() would be needed for real downsampling -- confirm
    # against data_cleaning.target_table_breakdown before changing.
    sample_max = target_table_breakdown(is_n)['total'].max()
    is_n = get_balanced_data(is_n, sample_max, random_state=random_state)
    clf.fit(is_n.drop(target_column, axis=1), is_n[target_column])
    return clf, k_features

def test(clf, k_features, valid_data, target_value):
    """Print a classification report for ``clf`` on held-out data.

    The validation frame is prepared the same way as in ``train``: rows with
    a target above ``target_value`` are dropped, targets are recoded to 0/1,
    and the rows are passed through ``get_balanced_data``.

    Args:
        clf: Fitted estimator exposing ``predict(X)``.
        k_features: Feature names returned by ``train``.
        valid_data: DataFrame of features plus the ``target_column`` column.
        target_value: Target value treated as the positive class.

    Returns:
        None. The report is printed.
    """
    # Imported locally because the notebook's import cell does not bring
    # classification_report into scope.
    from sklearn.metrics import classification_report

    is_n_valid = valid_data.loc[valid_data[target_column] <= target_value]
    is_n_valid = convert_to_binary_targets(is_n_valid, target_value)
    sample_max = target_table_breakdown(is_n_valid)['total'].max()
    is_n_valid = get_balanced_data(is_n_valid, sample_max, random_state=10)

    # train() returns k_features without the target column, so dropping the
    # target from is_n_valid[k_features] raised KeyError. Filter by name
    # instead, which also tolerates a list that does include the target.
    feature_columns = [name for name in k_features if name != target_column]
    preds = clf.predict(is_n_valid[feature_columns])
    print(classification_report(is_n_valid[target_column], preds))

In [25]:
# Build the household-level training frame once; the training cells below
# each pass a copy of it to train().
data = get_household_level_data()

Loading data from data/train.csv...
(9557, 142)

Checking for inconsistent targets...
(85,)
(9557, 142)
Cleaning inconsistent targets...
Checking inconsistent targets are gone...
(0,)
(9557, 142)



In [34]:
from sklearn.svm import SVC  
# Linear-kernel SVC for target_value=4. train() prints the selected feature
# names and returns them together with the fitted classifier.
clf_4 = SVC(kernel='linear')
clf_4, k_features_4 = train(clf_4, data.copy(), 4)

['v18q', 'v18q1', 'rez_esc', 'pisocemento', 'cielorazo', 'epared1', 'epared3', 'etecho1', 'etecho3', 'eviv1', 'eviv3', 'instlevel1', 'instlevel2', 'instlevel8', 'computer']


In [35]:
# Linear-kernel SVC for target_value=3; train() discards rows whose target is
# above 3 before fitting. Relies on SVC having been imported in an earlier cell.
clf_3 = SVC(kernel='linear')
clf_3, k_features_3 = train(clf_3, data.copy(), 3)

['paredblolad', 'paredzinc', 'pisomoscer', 'pisocemento', 'cielorazo', 'epared1', 'epared3', 'etecho1', 'eviv1', 'eviv3', 'estadocivil3', 'estadocivil5', 'edjefa', 'instlevel6', 'instlevel8']


In [36]:
# Linear-kernel SVC for target_value=2; train() discards rows whose target is
# above 2 before fitting. Relies on SVC having been imported in an earlier cell.
clf_2 = SVC(kernel='linear')
clf_2, k_features_2 = train(clf_2, data.copy(), 2)

['hacdor', 'r4m1', 'r4t1', 'techocane', 'etecho1', 'etecho3', 'estadocivil1', 'estadocivil2', 'estadocivil5', 'estadocivil6', 'parentesco2', 'parentesco4', 'hogar_mayor', 'edjefe', 'SQBedjefe']


In [43]:
def load_and_clean_test_data():
    """Load the test set and collapse it to one row per household.

    Returns:
        DataFrame indexed by ``household_id`` in which every column is 1 if
        any member of the household has a truthy (non-zero) value for it and
        0 otherwise.
    """
    individuals = clean_non_numeric_features(get_test_data())
    # Same any()-based aggregation as get_household_level_data(), so the test
    # features are encoded the way the training features were.
    # NOTE(review): any() also flattens non-binary columns (e.g. rooms, age)
    # to 0/1 -- confirm this is intended.
    households = individuals.groupby(household_id).any()
    return households.astype(int)

# NOTE: this rebinds the name `test`, which until now referred to the test()
# evaluation function defined in the first cell; after this cell runs, that
# function can no longer be called by name.
test = load_and_clean_test_data()
test
# preds = clf_4.predict(test[k_features_4])

Loading data from data/test.csv...
(23856, 141)



Unnamed: 0_level_0,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,...,age,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq
idhogar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000a08204,1,0,1,0,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
000bce7c4,0,0,1,0,1,1,0,0,0,1,...,1,1,1,1,0,0,1,0,1,1
001845fb0,1,0,1,0,1,1,0,0,0,1,...,1,1,1,1,1,1,1,1,1,1
003514e22,0,0,1,0,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
003b51a87,0,0,1,0,1,1,0,0,1,1,...,1,1,1,1,0,1,1,1,1,1
003bf2fa9,0,0,1,0,1,1,0,0,0,0,...,1,1,1,1,0,0,1,1,1,1
004417a6c,0,0,1,0,1,1,0,0,0,0,...,1,1,1,1,0,0,1,0,1,1
0044a5929,0,0,1,0,1,1,0,0,0,0,...,1,1,1,1,0,0,1,1,1,1
00498cd58,0,0,1,0,1,1,0,0,0,1,...,1,1,1,1,1,0,1,1,1,1
004ea2211,0,0,1,0,1,1,0,0,0,1,...,1,1,1,1,1,0,1,0,1,1


In [None]:
# After the groupby in load_and_clean_test_data() the household id is the
# frame's index, not a column, so test[household_id] would raise KeyError.
test.index