### Predict probabilities on all test images using the saved classifier 

In [1]:
import pickle
from os.path import join
import numpy as np
import pandas as pd

**1.** Load the trained classifier

In [2]:
# Directory holding the pickled classifier and the pickled feature tables.
pickle_path = join('..', '..', 'pickles')

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this on pickle files you trust.
classifier_file = join(pickle_path, 'classifier_br_bs_m3.pkl')
with open(classifier_file, 'rb') as clf_handle:
    clf = pickle.load(clf_handle)

In [3]:
# Regions whose pickled test features are loaded further down.
regions = [
    'borde_rural',
    'borde_soacha',
    'mixco_1_and_ebenezer',
    'mixco_3',
    'dennery',
]
# Alternative single-region selection: regions = ['mixco_3']

**2.** Collect the test feature sets of all regions and concatenate them into a single DataFrame

In [4]:
# Build one table of test features by loading the pickled feature table of
# every region and concatenating them.
columns = ['id', 'features']

# Collect the per-region frames first and concatenate once at the end:
# calling pd.concat inside the loop re-copies all previous rows each time.
region_frames = []

for region in regions:
    features_file = join(pickle_path, 'resnet50_features_' + region + '_test.pkl')
    try:
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # feature files produced by this project.
        with open(features_file, 'rb') as f:
            region_frames.append(pickle.load(f))
    except Exception as err:
        # Best effort: a region that cannot be read is reported and skipped.
        # `Exception` (instead of a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate, and the cause is printed rather than hidden.
        print("Error reading test data for region ", region, ":", repr(err))

if region_frames:
    df_test_global = pd.concat(region_frames)
else:
    # No region could be loaded: keep an empty table with the expected columns.
    df_test_global = pd.DataFrame(columns=columns)

In [5]:
# Preview the first rows of the combined test-feature table.
df_test_global.head()

Unnamed: 0,id,features
0,7a44da50,"[3.915693, 3.9185672, 3.345753, 0.123248816, 0..."
1,7a44dcf8,"[4.8018703, 2.8915873, 2.8877943, 2.8772202, 0..."
2,7a44dd66,"[1.3122209, 6.605673, 2.665344, 0.68282366, 0...."
3,7a44df46,"[0.0, 2.925912, 0.2874285, 0.0, 2.95557, 0.000..."
4,7a44dfb4,"[2.6583984, 2.3240876, 1.397907, 0.42449367, 0..."


**3.** Predict the probabilities for every test example and put them all in a new dataframe

In [6]:
#import math

#def sigmoid(x):
#    return 1 / (1 + math.exp(-x))

In [7]:
# Predict the class probabilities of every test example and store them next
# to the example id: one row per example, with `pred_prob` holding the whole
# probability vector of that example.
columns = ['id', 'pred_prob']

if len(df_test_global) > 0:
    # Flatten every feature array to a single row (same reshape as a per-row
    # call would use) and stack them so the classifier is called only once.
    # NOTE(review): assumes clf.predict_proba accepts a 2-D
    # (n_samples, n_features) array and scores rows independently, as
    # scikit-learn estimators do -- confirm for this classifier.
    feature_matrix = np.vstack(
        [features.reshape(1, -1) for features in df_test_global['features']]
    )
    prob_matrix = clf.predict_proba(feature_matrix)
else:
    # Nothing to predict; np.vstack would raise on an empty list.
    prob_matrix = []

data = [
    {'id': sample_id, 'pred_prob': probs}
    for sample_id, probs in zip(df_test_global['id'], prob_matrix)
]

# Passing `columns` keeps the expected schema even when `data` is empty.
df_results = pd.DataFrame(data, columns=columns)

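Annotate the MultiPolygons by hand: the test ids listed below are given a uniform probability of 0.2 for each of the five classes instead of a classifier prediction.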

In [8]:
# Test ids that receive a hand-made prediction.
multipolygon_ids = (
    '7a4cb770',
    '7a4715fe',
    '7a4b8850',
    '7a451c90',
    '7a4efc74',
    '7a4d32b8',
    '7a4ec4ac',
    '7a46856c',
    '7a46f6dc',
    '7a4ae9f4',
)

# The same uniform five-class probability vector is used for every id above.
pred_prob = [0.2] * 5

# One record per id, in the same layout as the classifier results
# ('id' plus a 'pred_prob' vector).
data = [{'id': sample_id, 'pred_prob': pred_prob} for sample_id in multipolygon_ids]

df_multipolygon = pd.DataFrame(data)

In [9]:
# Append the hand-made MultiPolygon rows to the classifier predictions.
# - ignore_index=True gives every row a unique label instead of repeating
#   0..9 for the appended rows.
# - drop_duplicates on 'id' makes the cell safe to re-run (the hand-made rows
#   are not appended twice) and keeps the first row for any repeated id,
#   i.e. a classifier prediction wins over the hand-made one. Duplicate ids
#   would otherwise make the later reindex by id fail.
df_results = (
    pd.concat([df_results, df_multipolygon], ignore_index=True)
    .drop_duplicates(subset='id', keep='first')
    .reset_index(drop=True)
)

In [10]:
# Expand the per-row probability vectors into one column per class.
# NOTE(review): the label order is hard-coded and presumably matches the
# column order returned by clf.predict_proba (clf.classes_) -- verify against
# the trained classifier.
class_labels = ['concrete_cement', 'healthy_metal', 'incomplete', 'irregular_metal', 'other']

# Build the whole matrix in one step instead of creating a pd.Series per row.
prob_matrix = np.asarray(df_results['pred_prob'].tolist(), dtype=float)
if prob_matrix.ndim != 2 or prob_matrix.shape[1] != len(class_labels):
    raise ValueError(
        "Expected %d probabilities per row, got an array of shape %s"
        % (len(class_labels), prob_matrix.shape)
    )

# Reuse the index of df_results so both frames stay aligned row by row.
df_pred_prob = pd.DataFrame(prob_matrix, index=df_results.index, columns=class_labels)

In [11]:
# Preview the first rows of the per-class probability table.
df_pred_prob.head()

Unnamed: 0,concrete_cement,healthy_metal,incomplete,irregular_metal,other
0,7.5e-05,0.987796,0.000399,0.011648,8.3e-05
1,0.000733,0.47709,0.089818,0.431071,0.001289
2,0.007897,0.02154,0.401618,0.568818,0.000127
3,0.266794,0.001391,0.71677,0.014617,0.000429
4,0.000339,0.964012,0.001767,0.033751,0.00013


In [12]:
# Put the per-class probability columns next to the ids; the packed
# 'pred_prob' vector column is left out of the final table.
ids_only = df_results.drop(columns=['pred_prob'])
final = pd.concat([ids_only, df_pred_prob], axis=1)
final.head()

Unnamed: 0,id,concrete_cement,healthy_metal,incomplete,irregular_metal,other
0,7a44da50,7.5e-05,0.987796,0.000399,0.011648,8.3e-05
1,7a44dcf8,0.000733,0.47709,0.089818,0.431071,0.001289
2,7a44dd66,0.007897,0.02154,0.401618,0.568818,0.000127
3,7a44df46,0.266794,0.001391,0.71677,0.014617,0.000429
4,7a44dfb4,0.000339,0.964012,0.001767,0.033751,0.00013


The results have to list the ids in the same order as the example `submission_format.csv`, so we reorder them to match here:

In [13]:
# Reorder the predictions so that the ids appear in exactly the order of the
# example submission file.
submission_format = pd.read_csv(join('..', '..', 'data', 'submission_format.csv'))
expected_ids = submission_format['id']

# reindex() silently inserts an all-NaN row for every requested id that has no
# prediction; report those ids instead of letting them go unnoticed.
missing_ids = expected_ids[~expected_ids.isin(final['id'])]
if len(missing_ids) > 0:
    print("Warning:", len(missing_ids),
          "ids from submission_format have no prediction and will be NaN rows, e.g.",
          missing_ids.head().tolist())

final_sorted = (
    final
    .set_index('id')
    .reindex(index=expected_ids)
    .reset_index()
)

In [14]:
# Write the ordered predictions to the results folder, without the row index.
results_file = join('..', '..', 'results', 'results_br_bs_m3.csv')
final_sorted.to_csv(path_or_buf=results_file, index=False)

In [None]:
# Show the (rows, columns) shape of the table that was written out.
final_sorted.shape