### Load Training Data

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [None]:
from kernel_submission import *
import scipy

def load_train_data(filepath='data/train.csv'):
    """Load the training CSV and collapse it to one row per household.

    Steps, in order: read with ``get_training_data``, clean with
    ``clean_data``, keep ``hh_columns`` plus 'idhogar' and 'Target', compute
    the per-household target with ``target_by_household``, reduce every
    remaining column to its per-household mode, join the target back on,
    then apply ``compress_column_data`` and ``add_custom_features``.

    Args:
        filepath: path handed to ``get_training_data``.

    Returns:
        The resulting DataFrame, with 'v2a1' cast to float.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` has been loaded.
    import numpy as np
    from scipy import stats

    def _household_mode(values):
        # Depending on the SciPy version ``stats.mode(...)[0]`` is either a
        # length-1 array or a scalar; always hand a scalar back to ``agg``.
        return np.ravel(stats.mode(values)[0])[0]

    train_df = get_training_data(filepath)
    train_df = clean_data(train_df)
    train_df = train_df.reset_index()[hh_columns+['idhogar', 'Target']]
    # Computed before the target column is dropped for the aggregation.
    target_household_map = target_by_household(train_df)
    train_df = train_df.drop(target_column, axis=1).groupby(household_id).agg(_household_mode)
    train_df = train_df.join(target_household_map)
    train_df = compress_column_data(train_df)
    train_df = add_custom_features(train_df)
    train_df['v2a1'] = train_df['v2a1'].astype(float)
    return train_df

In [None]:
def get_valid_train_split(df, valid_class_size):
    """Split ``df`` into a validation sample and a disjoint training sample.

    Args:
        df: frame accepted by ``get_balanced_data``; ``household_id`` must be
            available as a column after ``reset_index()``.
        valid_class_size: forwarded to ``get_balanced_data`` for the
            validation sample (presumably rows per class - confirm against
            that helper).

    Returns:
        ``(validation, training)``. The training frame is built from the rows
        of ``df`` whose household id is not in the validation sample, then
        re-sampled with ``get_balanced_data`` using the smallest 'total'
        reported by ``target_table_breakdown``.
    """
    valid = get_balanced_data(df, valid_class_size)

    # Exclude held-out households by id.  Comparing whole rows
    # (concat + drop_duplicates(keep=False)) would silently keep a held-out
    # row in the training pool if any of its values differed in
    # representation between the two frames.
    all_rows = df.reset_index()
    held_out_ids = valid.reset_index()[household_id]
    remaining = all_rows[~all_rows[household_id].isin(held_out_ids)].set_index(household_id)

    smallest_total = target_table_breakdown(remaining)['total'].min()
    train = get_balanced_data(remaining, smallest_total)
    return valid, train

In [None]:
# Build the training frame (default path 'data/train.csv') that every section below starts from.
train_df = load_train_data()

### Train Extreme Poverty Classifier

In [None]:
# Binary-target view of train_df for class 1 (helper defined outside this notebook;
# presumably 1 = "is class 1", 0 = everything else - confirm).
is_extreme = convert_to_binary_targets(train_df, 1)

In [None]:
# No manual hold-out here: the cross validator does its own splitting of the sample.
data = get_balanced_data(is_extreme)

# Macro-averaged F1 wrapped as a scorer object for cross validation.
scorer = make_scorer(f1_score, greater_is_better=True, average='macro')

# 'Target' supplies the labels (as uint8); 'Id' and 'Target' are excluded from the features.
train_labels = data['Target'].astype(np.uint8).values

# Median imputation followed by min-max scaling, fitted on (and applied to) the features.
pipeline = Pipeline(steps=[
    ('imputer', Imputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])
train_set = pipeline.fit_transform(data.drop(columns=['Id', 'Target']))


In [None]:
model = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs = -1)
# 10 fold cross validation.
# Imputation and scaling are bundled with the forest so they are re-fitted on
# the training folds of every split; scoring features that were already
# transformed on the full sample would leak the held-out folds' medians and
# min/max into training.
cv_estimator = Pipeline([('imputer', Imputer(strategy = 'median')),
                         ('scaler', MinMaxScaler()),
                         ('forest', model)])
cv_features = data.drop(columns = ['Id', 'Target'])
cv_score = cross_val_score(cv_estimator, cv_features, train_labels, cv = 10, scoring = scorer)

print(f'10 Fold Cross Validation F1 Score = {round(cv_score.mean(), 4)} with std = {round(cv_score.std(), 4)}')

In [None]:
# Validation / training split for the class-1 problem (50 is passed as valid_class_size),
# followed by the class counts of the validation part.
v_extreme, t_extreme = get_valid_train_split(is_extreme, 50)
v_extreme['Target'].value_counts()

In [None]:
# Use our train-valid split to check classification report
from sklearn.metrics import classification_report

# Labels come from 'Target' (as uint8); 'Id' and 'Target' are not features.
train_labels = t_extreme['Target'].astype(np.uint8).values
pipeline = Pipeline(steps=[
    ('imputer', Imputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# The pipeline is fitted on the training split only; the validation split is
# merely transformed with the fitted pipeline.
train_set = pipeline.fit_transform(t_extreme.drop(columns=['Id', 'Target']))
test_set = pipeline.transform(v_extreme.drop(columns=['Id', 'Target']))

clf = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs=-1)
clf.fit(train_set, train_labels)

# Classification report on the validation rows.
preds = clf.predict(test_set)
print(classification_report(v_extreme['Target'], preds))

### Train Most Wealthy Classifier

In [None]:
# Binary-target view of train_df for class 4 (presumably 1 = "is class 4" - confirm against the helper).
is_wealthy = convert_to_binary_targets(train_df, 4)
# Feed all data into cross validator - it will split out valid data itself
data = get_balanced_data(is_wealthy)
# Class counts of the sample returned by get_balanced_data.
data['Target'].value_counts()

In [None]:
# Macro-averaged F1 wrapped as a scorer object for cross validation.
scorer = make_scorer(f1_score, greater_is_better=True, average='macro')

# 'Target' supplies the labels (as uint8); 'Id' and 'Target' are excluded from the features.
train_labels = data['Target'].astype(np.uint8).values

# Median imputation followed by min-max scaling, fitted on (and applied to) the features.
pipeline = Pipeline(steps=[
    ('imputer', Imputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])
train_set = pipeline.fit_transform(data.drop(columns=['Id', 'Target']))

In [None]:
model = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs = -1)
# 10 fold cross validation.
# Imputation and scaling are bundled with the forest so they are re-fitted on
# the training folds of every split; scoring features that were already
# transformed on the full sample would leak the held-out folds' medians and
# min/max into training.
cv_estimator = Pipeline([('imputer', Imputer(strategy = 'median')),
                         ('scaler', MinMaxScaler()),
                         ('forest', model)])
cv_features = data.drop(columns = ['Id', 'Target'])
cv_score = cross_val_score(cv_estimator, cv_features, train_labels, cv = 10, scoring = scorer)

print(f'10 Fold Cross Validation F1 Score = {round(cv_score.mean(), 4)} with std = {round(cv_score.std(), 4)}')

In [None]:
# Validation / training split for the class-4 (is_wealthy) problem.
# NOTE(review): the *_extreme names are reused here even though the data is the
# class-4 view; the following cell reads the split under these names.
v_extreme, t_extreme = get_valid_train_split(is_wealthy, 50)
v_extreme['Target'].value_counts()

In [None]:
# Use our train-valid split to check classification report
from sklearn.metrics import classification_report

# Labels come from 'Target' (as uint8); 'Id' and 'Target' are not features.
train_labels = t_extreme['Target'].astype(np.uint8).values
pipeline = Pipeline(steps=[
    ('imputer', Imputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# The pipeline is fitted on the training split only; the validation split is
# merely transformed with the fitted pipeline.
train_set = pipeline.fit_transform(t_extreme.drop(columns=['Id', 'Target']))
test_set = pipeline.transform(v_extreme.drop(columns=['Id', 'Target']))

clf = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs=-1)
clf.fit(train_set, train_labels)

# Classification report on the validation rows.
preds = clf.predict(test_set)
print(classification_report(v_extreme['Target'], preds))

### Target 3 Classifier

In [None]:
# Binary-target view of train_df for class 3 (presumably 1 = "is class 3" - confirm against the helper).
is_3 = convert_to_binary_targets(train_df, 3)
# Feed all data into cross validator - it will split out valid data itself
data = get_balanced_data(is_3)
# Class counts of the sample returned by get_balanced_data.
data['Target'].value_counts()

In [None]:
# Custom scorer for cross validation
scorer = make_scorer(f1_score, greater_is_better=True, average = 'macro')
# Labels for training
train_labels = data['Target'].astype(np.uint8).values
# Extract the training data.  It is left untransformed on purpose: the
# preprocessing is fitted inside every cross-validation fold below.
train_set = data.drop(columns = ['Id', 'Target'])

model = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs = -1)
# Imputer + scaler + forest as one estimator, so the medians and min/max are
# learned from the training folds only (fitting them on the full sample first
# would leak held-out statistics into training).
pipeline = Pipeline([('imputer', Imputer(strategy = 'median')),
                     ('scaler', MinMaxScaler()),
                     ('forest', model)])
# 10 fold cross validation
cv_score = cross_val_score(pipeline, train_set, train_labels, cv = 10, scoring = scorer)

print(f'10 Fold Cross Validation F1 Score = {round(cv_score.mean(), 4)} with std = {round(cv_score.std(), 4)}')

In [None]:
# Validation / training split for the class-3 problem (50 is passed as valid_class_size).
v, t = get_valid_train_split(is_3, 50)

# Labels come from 'Target' (as uint8); 'Id' and 'Target' are not features.
train_labels = t['Target'].astype(np.uint8).values
pipeline = Pipeline(steps=[
    ('imputer', Imputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# The pipeline is fitted on the training split only; the validation split is
# merely transformed with the fitted pipeline.
train_set = pipeline.fit_transform(t.drop(columns=['Id', 'Target']))
test_set = pipeline.transform(v.drop(columns=['Id', 'Target']))

clf = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs=-1)
clf.fit(train_set, train_labels)

# Classification report on the validation rows.
preds = clf.predict(test_set)
print(classification_report(v['Target'], preds))

### Train Target 2 Classifier

In [None]:
# Binary-target view of train_df for class 2 (presumably 1 = "is class 2" - confirm against the helper).
is_2 = convert_to_binary_targets(train_df, 2)
# Feed all data into cross validator - it will split out valid data itself
data = get_balanced_data(is_2)
# Class counts of the sample returned by get_balanced_data.
data['Target'].value_counts()

In [None]:
# Custom scorer for cross validation
scorer = make_scorer(f1_score, greater_is_better=True, average = 'macro')
# Labels for training
train_labels = data['Target'].astype(np.uint8).values
# Extract the training data.  It is left untransformed on purpose: the
# preprocessing is fitted inside every cross-validation fold below.
train_set = data.drop(columns = ['Id', 'Target'])

model = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs = -1)
# Imputer + scaler + forest as one estimator, so the medians and min/max are
# learned from the training folds only (fitting them on the full sample first
# would leak held-out statistics into training).
pipeline = Pipeline([('imputer', Imputer(strategy = 'median')),
                     ('scaler', MinMaxScaler()),
                     ('forest', model)])
# 10 fold cross validation
cv_score = cross_val_score(pipeline, train_set, train_labels, cv = 10, scoring = scorer)

print(f'10 Fold Cross Validation F1 Score = {round(cv_score.mean(), 4)} with std = {round(cv_score.std(), 4)}')

In [None]:
# Validation / training split for the class-2 problem (50 is passed as valid_class_size).
v, t = get_valid_train_split(is_2, 50)

# Labels come from 'Target' (as uint8); 'Id' and 'Target' are not features.
train_labels = t['Target'].astype(np.uint8).values
pipeline = Pipeline(steps=[
    ('imputer', Imputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# The pipeline is fitted on the training split only; the validation split is
# merely transformed with the fitted pipeline.
train_set = pipeline.fit_transform(t.drop(columns=['Id', 'Target']))
test_set = pipeline.transform(v.drop(columns=['Id', 'Target']))

clf = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs=-1)
clf.fit(train_set, train_labels)

# Classification report on the validation rows.
preds = clf.predict(test_set)
print(classification_report(v['Target'], preds))

### All In One Classifier

In [None]:
# Split the original (non-binarised) train_df, so 'Target' keeps all of its classes.
v, t = get_valid_train_split(train_df, 50)

In [None]:
# Labels come from 'Target' (as uint8); 'Id' and 'Target' are not features.
train_labels = t['Target'].astype(np.uint8).values
pipeline = Pipeline(steps=[
    ('imputer', Imputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# The pipeline is fitted on the training split only; the validation split is
# merely transformed with the fitted pipeline.
train_set = pipeline.fit_transform(t.drop(columns=['Id', 'Target']))
test_set = pipeline.transform(v.drop(columns=['Id', 'Target']))

clf = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs=-1)
clf.fit(train_set, train_labels)

# Classification report on the validation rows.
preds = clf.predict(test_set)
print(classification_report(v['Target'], preds))

### Make 4 Classifiers

In [None]:
v, t = get_valid_train_split(train_df, 50)
# Get all from train_df except v data, split into binary for each class, get balanced data.
# The balanced `t` returned above is replaced by every non-validation row.
# Held-out households are excluded by id: comparing whole rows
# (concat + drop_duplicates(keep=False)) would silently keep a validation row
# in `t` if any of its values differed in representation between the frames.
all_rows = train_df.reset_index()
t = all_rows[~all_rows[household_id].isin(v.reset_index()[household_id])].set_index(household_id)
t['Target'].value_counts()

In [None]:
# One training frame per target class (1 through 4), each built from the
# non-validation rows in `t`: binarise for that class, then balance.
is_1, is_2, is_3, is_4 = [
    get_balanced_data(convert_to_binary_targets(t, target_class))
    for target_class in (1, 2, 3, 4)
]

In [None]:
# Spot-check the class counts of one of the frames built above.
is_4['Target'].value_counts()

In [None]:
def train_clf(df):
    """Fit the preprocessing pipeline and a random forest on ``df``.

    ``df`` must have 'Id' and 'Target' columns.  'Target' (cast to uint8)
    supplies the labels; every column other than 'Id' and 'Target' is used
    as a feature.

    Returns:
        ``(pipeline, clf)``: the fitted imputer + scaler Pipeline and the
        fitted RandomForestClassifier.
    """
    labels = df['Target'].astype(np.uint8).values
    features = df.drop(columns=['Id', 'Target'])

    preprocess = Pipeline(steps=[
        ('imputer', Imputer(strategy='median')),
        ('scaler', MinMaxScaler()),
    ])
    prepared = preprocess.fit_transform(features)

    forest = RandomForestClassifier(n_estimators=100, random_state=10, n_jobs=-1)
    forest.fit(prepared, labels)
    return preprocess, forest

In [None]:
def test_clf(pipeline, clf, test_data):
    """Return ``clf.predict_proba`` for ``test_data``.

    'Id' and 'Target' are dropped from ``test_data`` and the remaining
    columns are passed through ``pipeline.transform`` (no re-fitting) before
    being scored.
    """
    features = test_data.drop(columns=['Id', 'Target'])
    return clf.predict_proba(pipeline.transform(features))

In [None]:
# For every class: train on its frame, score the validation rows, and store
# the probabilities in a frame indexed by v['Id'].  Column 0 is renamed to
# '0' and column 1 to the class label.
p_1, c_1 = train_clf(is_1)
proba_1 = test_clf(p_1, c_1, convert_to_binary_targets(v, 1))
pred_1 = pd.DataFrame(proba_1).set_index(v['Id']).rename(columns={0: '0', 1: '1'})

p_2, c_2 = train_clf(is_2)
proba_2 = test_clf(p_2, c_2, convert_to_binary_targets(v, 2))
pred_2 = pd.DataFrame(proba_2).set_index(v['Id']).rename(columns={0: '0', 1: '2'})

p_3, c_3 = train_clf(is_3)
proba_3 = test_clf(p_3, c_3, convert_to_binary_targets(v, 3))
pred_3 = pd.DataFrame(proba_3).set_index(v['Id']).rename(columns={0: '0', 1: '3'})

p_4, c_4 = train_clf(is_4)
proba_4 = test_clf(p_4, c_4, convert_to_binary_targets(v, 4))
pred_4 = pd.DataFrame(proba_4).set_index(v['Id']).rename(columns={0: '0', 1: '4'})

In [None]:
# One column per class ('1'..'4'), taken from the class-labelled column of each prediction frame.
results = pd.concat([pred_1['1'], pred_2['2'], pred_3['3'], pred_4['4']], axis=1)

In [None]:
def boost_results(df, weights=None):
    """Scale per-class score columns by fixed multipliers, in place.

    Args:
        df: DataFrame holding one column per key of ``weights``.
        weights: optional mapping of column name to multiplier.  Defaults to
            ``{'1': 1.25, '2': 1.15, '3': 1.05, '4': 0.95}``.

    Returns:
        The same ``df`` object; its columns are overwritten, so calling this
        twice on one frame applies the multipliers twice.

    Raises:
        KeyError: if a column named in ``weights`` is missing from ``df``.
    """
    if weights is None:
        weights = {'1': 1.25, '2': 1.15, '3': 1.05, '4': 0.95}
    for column, factor in weights.items():
        df[column] = df[column] * factor
    return df

In [None]:
# Apply the per-class multipliers.  boost_results overwrites the columns of the
# frame it is given, so re-running this cell compounds the boost.
results = boost_results(results)

In [None]:
# Per row, the label of the column ('1'..'4') holding the largest boosted score.
preds = results.idxmax(axis=1)

In [None]:
# Re-key the validation rows by 'Id' (dropping 'idhogar'), matching the 'Id' index used for the prediction frames.
v2 = v.reset_index().drop(columns=['idhogar']).set_index('Id')

In [None]:
# Predicted label next to the true 'Target'; `preds` is an unnamed Series, so it becomes column 0.
j = pd.concat([preds, v2['Target']], axis=1)

In [None]:
# The predicted labels are column-name strings ('1'..'4'); cast everything to int so both columns compare directly.
j = j.astype(int)

In [None]:
# Classification report for the combined per-class predictions (column 0) against the true targets.
print(classification_report(j['Target'], j[0]))

In [None]:
# NOTE(review): same statement as the previous cell - prints the identical report again.
print(classification_report(j['Target'], j[0]))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted labels, with the classes fixed to the order 1, 2, 3, 4.
cnf_matrix = confusion_matrix(j['Target'], j[0], labels=[1,2,3,4])

In [None]:
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set a few plotting defaults
# (`%matplotlib inline` is an IPython magic, so this cell only runs inside IPython/Jupyter.)
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 12
# 'k' is matplotlib's shorthand for black.
plt.rcParams['patch.edgecolor'] = 'k'

In [None]:
import itertools 
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Args:
        cm: 2-D NumPy array of counts, indexed as ``cm[true, predicted]``.
        classes: tick labels, one per row/column of ``cm``.
        normalize: when True, every row is divided by its row total before
            printing and plotting.  Rows whose total is zero are left as
            zeros rather than turning into NaN.
        title: figure title.
        cmap: colormap handed to ``plt.imshow``.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Skip the division for empty rows: 0/0 would put NaN in the matrix
        # and make the text-colour threshold below NaN as well.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cell annotations: white text on cells above half the maximum value, black otherwise.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

In [None]:
class_names = [str(label) for label in (1, 2, 3, 4)]

# Draw the matrix twice, each on its own figure: raw counts first, then row-normalized.
for normalized, heading in ((False, 'Confusion matrix, without normalization'),
                            (True, 'Normalized confusion matrix')):
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          normalize=normalized, title=heading)

plt.show()