### Load data and clean targets

In [1]:
household_id = 'idhogar'
head_of_household = 'parentesco1'
person_id = 'Id'
target_column = 'Target'

In [2]:
%load_ext autoreload
from data_cleaning import get_training_data, get_test_data, target_by_household

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
%autoreload 2
train = get_training_data()
test = get_test_data()

Loading data from data/train.csv...
(9557, 142)

Checking for inconsistent targets...
(85,)
(9557, 142)
Cleaning inconsistent targets...
Checking inconsistent targets are gone...
(0,)
(9557, 142)

Loading data from data/test.csv...
(23856, 141)



### Get training and validation data

In [32]:
from column_categories import building_info
building_columns = [household_id]
building_columns.extend(building_info)

In [33]:
building_df = train[building_columns]

In [34]:
from data_cleaning import get_column_dtypes
get_column_dtypes(building_df)

{'int64': Index(['paredblolad', 'paredzocalo', 'paredpreb', 'pareddes', 'paredmad',
        'paredzinc', 'paredfibras', 'paredother', 'pisomoscer', 'pisocemento',
        'pisoother', 'pisonatur', 'pisonotiene', 'pisomadera', 'techozinc',
        'techoentrepiso', 'techocane', 'techootro', 'cielorazo',
        'abastaguadentro', 'abastaguafuera', 'abastaguano', 'public', 'planpri',
        'noelec', 'coopele', 'sanitario1', 'sanitario2', 'sanitario3',
        'sanitario5', 'sanitario6', 'energcocinar1', 'energcocinar2',
        'energcocinar3', 'energcocinar4', 'elimbasu1', 'elimbasu2', 'elimbasu3',
        'elimbasu4', 'elimbasu5', 'elimbasu6', 'epared1', 'epared2', 'epared3',
        'etecho1', 'etecho2', 'etecho3', 'eviv1', 'eviv2', 'eviv3'],
       dtype='object'), 'object': Index(['idhogar'], dtype='object')}

Our target value is a measure per household so we should group households first

In [35]:
target_household_map = target_by_household(train)

In [36]:
building_df = building_df.groupby(household_id).any().astype(int)
building_df = building_df.join(target_household_map)

In [37]:
from data_cleaning import target_table_breakdown
target_table_breakdown(building_df)

Unnamed: 0,total,proportion,target description
4,1955,0.654284,non vulnerable households
2,442,0.147925,moderate poverty
3,369,0.123494,vulnerable households
1,222,0.074297,extreme poverty


In [91]:
from sklearn import model_selection
X = building_df.drop(target_column, axis=1)
y = building_df[target_column]
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(X, y)

### Linear SVC without SMOTE

In [99]:
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1.0)
scores = cross_val_score(clf, X, y, cv=5)
scores

array([0.66110184, 0.65609349, 0.65829146, 0.65661642, 0.66610738])

There's a big imbalance in classes, before trying oversampling methods attempt manually setting some class weights

### Oversample for class imbalances

#### SMOTE

In [39]:
from imblearn.over_sampling import SMOTE

In [100]:
print('Before OverSampling, the shape of train_X: {}'.format(X_train.shape))
print('Before OverSampling, the shape of train_y: {} \n'.format(y_train.shape))

print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '2': {}".format(sum(y_train==2)))
print("Before OverSampling, counts of label '3': {}".format(sum(y_train==3)))
print("Before OverSampling, counts of label '4': {} \n".format(sum(y_train==4)))

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '2': {}".format(sum(y_train_res==2)))
print("After OverSampling, counts of label '3': {}".format(sum(y_train_res==3)))
print("After OverSampling, counts of label '4': {}".format(sum(y_train_res==4)))

Before OverSampling, the shape of train_X: (2241, 50)
Before OverSampling, the shape of train_y: (2241,) 

Before OverSampling, counts of label '1': 163
Before OverSampling, counts of label '2': 346
Before OverSampling, counts of label '3': 274
Before OverSampling, counts of label '4': 1458 

After OverSampling, the shape of train_X: (5832, 50)
After OverSampling, the shape of train_y: (5832,) 

After OverSampling, counts of label '1': 1458
After OverSampling, counts of label '2': 1458
After OverSampling, counts of label '3': 1458
After OverSampling, counts of label '4': 1458


### Try KNN

Not sure how good an idea this is as SMOTE generates extra training data using KNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier

In [20]:
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=20, p=2,
           weights='uniform')

In [21]:
knn.score(X_valid, y_valid)

0.6492637215528781

### Try Linear SVM

In [22]:
from sklearn.svm import SVC  
svc = SVC(kernel='linear')  
svc.fit(X_train, y_train)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
svc.score(X_valid, y_valid)

0.6532797858099063

### Compare SVN with different kernels

In [101]:
from sklearn import svm

C = 1.0  # SVM regularization parameter
models = (svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C),
          svm.SVC(kernel='rbf', gamma=0.7, C=C),
          svm.SVC(kernel='poly', degree=3, C=C))
models = (clf.fit(X_train, y_train) for clf in models)

# title for the plots
titles = ('SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC with RBF kernel',
          'SVC with polynomial (degree 3) kernel')

for clf, title in zip(models, titles):
    print(title)
    print(clf.score(X_valid, y_valid))

SVC with linear kernel
0.6693440428380187
LinearSVC (linear kernel)
0.6653279785809906
SVC with RBF kernel
0.6492637215528781
SVC with polynomial (degree 3) kernel
0.6653279785809906


### Try Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 10, random_state = 42)
rf.fit(X_train, y_train);

In [None]:
rf.score(X_valid, y_valid)