# Calculating features

In [1]:
from keras.preprocessing import image
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input

import numpy as np
import pandas as pd
from os.path import join
import os

Using TensorFlow backend.


In [2]:
# Names of the region folders available under the data directory
regions = [
    'borde_rural',
    'borde_soacha',
    'mixco_1_and_ebenezer',
    'mixco_3',
]

In [3]:
# Region whose images are processed in this run
# NOTE(review): 'dennery' is not part of the `regions` list defined earlier - confirm this is intended
region = 'dennery'

# Input folders: <repo>/data/<region>/roofs_train and <repo>/data/<region>/roofs_test
train_dir = join('..', '..', 'data', region, 'roofs_train')
test_dir = join('..', '..', 'data', region, 'roofs_test')

# Roof material name (also the training subfolder name) -> integer class label
materials = {
    'concrete_cement': 0,
    'healthy_metal': 1,
    'incomplete': 2,
    'irregular_metal': 3,
    'other': 4,
}

Use a pretrained network to compute features from the images, which are classified later. Import the model from the Keras library and remove the top (classification) layer by setting `include_top=False`. The output of the last layer is pooled so that we get feature vectors of size 1x2048 instead of 7x7x2048. The pooling method `'max'` was chosen empirically.

In [4]:
# ImageNet-pretrained ResNet50 without its classification head;
# global max pooling collapses the last feature map into one vector per image
model = ResNet50(
    include_top=False,
    weights='imagenet',
    pooling='max',
)
model.summary()

Model: "resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, None, None, 3 0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, None, None, 6 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, None, None, 6 256         conv1[0][0]                      
___________________________________________________________________________________________

Total params: 23,587,712
Trainable params: 23,534,592
Non-trainable params: 53,120
__________________________________________________________________________________________________


## 1. Calculate features on the training data

The preprocessed data is stored such that every region has its own folder, which contains the subfolder defined as `train_dir` above. This folder is in turn subdivided by material. Images carry their `id` as filename.

Traverse all images in their corresponding folders which indicate the material and use `model.predict()` to get the features.

In [5]:
def calculate_features_train(train_dir):
    """Compute ResNet50 features for all labelled training images.

    `train_dir` is expected to contain one subfolder per key of the
    module-level `materials` dict. Every file found below such a subfolder
    is loaded as an image, passed through the module-level `model`, and
    labelled with the integer that `materials` assigns to the subfolder.

    Parameters
    ----------
    train_dir : str
        Path of the folder holding the per-material image subfolders.

    Returns
    -------
    pandas.DataFrame
        One row per image with the columns `id` (filename up to the first
        dot), `features` (flattened 1-D numpy array returned by the model)
        and `label` (integer class). The columns are present even when no
        image was found.
    """
    columns = ['id', 'features', 'label']

    # Collect plain records and build the dataframe once at the end;
    # growing a dataframe row by row is quadratic in the number of images
    records = []

    # Walk through all images
    for material, label in materials.items():
        material_fp = join(train_dir, material)
        for root, dirs, files in os.walk(material_fp):
            for file in files:
                # os.walk also descends into nested folders, so the file
                # lives in `root`, which is not necessarily `material_fp`
                img_fp = join(root, file)
                image_id = file.split('.')[0]
                print(image_id, "labeled as", material, ":", label)

                # Resize to the 224x224 network input size and preprocess
                img = image.load_img(img_fp, target_size=(224, 224))
                img_data = image.img_to_array(img)
                # The model expects a batch dimension in front
                img_data = np.expand_dims(img_data, axis=0)
                img_data = preprocess_input(img_data)

                # Compute features
                resnet50_feature = model.predict(img_data)
                resnet50_feature_np = np.array(resnet50_feature).flatten()

                records.append({'id': image_id,
                                'features': resnet50_feature_np,
                                'label': label})

    return pd.DataFrame(records, columns=columns)

The results are stored in a dataframe as shown below and saved to disk as a pickle file.

In [6]:
# Runs the network once per training image and prints one line per image
features_train = calculate_features_train(train_dir)

7a1e2ea0 labeled as concrete_cement : 0
7a2257a0 labeled as concrete_cement : 0
7a22d0d6 labeled as concrete_cement : 0
7a24e786 labeled as concrete_cement : 0
7a2556e4 labeled as concrete_cement : 0
7a26c5b0 labeled as concrete_cement : 0
7a29f866 labeled as concrete_cement : 0
7a2a8c22 labeled as concrete_cement : 0
7a2c07aa labeled as concrete_cement : 0
7a2ef2bc labeled as concrete_cement : 0
7a320254 labeled as concrete_cement : 0
7a331540 labeled as concrete_cement : 0
7a338f3e labeled as concrete_cement : 0
7a3542ca labeled as concrete_cement : 0
7a35781c labeled as concrete_cement : 0
7a357b6e labeled as concrete_cement : 0
7a3b3cfc labeled as concrete_cement : 0
7a3b84e6 labeled as concrete_cement : 0
7a3b85b8 labeled as concrete_cement : 0
7a3bc3ac labeled as concrete_cement : 0
7a3c0d08 labeled as concrete_cement : 0
7a3d295e labeled as concrete_cement : 0
7a3e6ad0 labeled as concrete_cement : 0
7a3eac84 labeled as concrete_cement : 0
7a3fa896 labeled as concrete_cement : 0


7a25e104 labeled as healthy_metal : 1
7a25fe5a labeled as healthy_metal : 1
7a260d82 labeled as healthy_metal : 1
7a261bc4 labeled as healthy_metal : 1
7a2622ae labeled as healthy_metal : 1
7a2655e4 labeled as healthy_metal : 1
7a265652 labeled as healthy_metal : 1
7a2659ae labeled as healthy_metal : 1
7a2687c6 labeled as healthy_metal : 1
7a268b90 labeled as healthy_metal : 1
7a268fb4 labeled as healthy_metal : 1
7a26922a labeled as healthy_metal : 1
7a269b76 labeled as healthy_metal : 1
7a269d1a labeled as healthy_metal : 1
7a26acd8 labeled as healthy_metal : 1
7a26bf52 labeled as healthy_metal : 1
7a26c09c labeled as healthy_metal : 1
7a26c2b8 labeled as healthy_metal : 1
7a26cac4 labeled as healthy_metal : 1
7a26d1d6 labeled as healthy_metal : 1
7a26d820 labeled as healthy_metal : 1
7a26e1bc labeled as healthy_metal : 1
7a26edba labeled as healthy_metal : 1
7a26fa30 labeled as healthy_metal : 1
7a2712a4 labeled as healthy_metal : 1
7a271524 labeled as healthy_metal : 1
7a271f10 lab

7a2f6cd8 labeled as healthy_metal : 1
7a2f76ce labeled as healthy_metal : 1
7a2f7f7a labeled as healthy_metal : 1
7a2f825e labeled as healthy_metal : 1
7a2f8a9c labeled as healthy_metal : 1
7a2f942e labeled as healthy_metal : 1
7a2f992e labeled as healthy_metal : 1
7a2f9fc8 labeled as healthy_metal : 1
7a2fa798 labeled as healthy_metal : 1
7a2fad06 labeled as healthy_metal : 1
7a2fc0c0 labeled as healthy_metal : 1
7a2fd06a labeled as healthy_metal : 1
7a2fd1aa labeled as healthy_metal : 1
7a2ff5f4 labeled as healthy_metal : 1
7a2ff874 labeled as healthy_metal : 1
7a2fffe0 labeled as healthy_metal : 1
7a3000bc labeled as healthy_metal : 1
7a300198 labeled as healthy_metal : 1
7a300918 labeled as healthy_metal : 1
7a300b48 labeled as healthy_metal : 1
7a300fda labeled as healthy_metal : 1
7a301048 labeled as healthy_metal : 1
7a30281c labeled as healthy_metal : 1
7a303ad2 labeled as healthy_metal : 1
7a304b44 labeled as healthy_metal : 1
7a304fcc labeled as healthy_metal : 1
7a305698 lab

7a3789f4 labeled as healthy_metal : 1
7a378fb2 labeled as healthy_metal : 1
7a37985e labeled as healthy_metal : 1
7a3798c2 labeled as healthy_metal : 1
7a379e26 labeled as healthy_metal : 1
7a37a038 labeled as healthy_metal : 1
7a37a362 labeled as healthy_metal : 1
7a37a650 labeled as healthy_metal : 1
7a37ab50 labeled as healthy_metal : 1
7a37ac22 labeled as healthy_metal : 1
7a37b582 labeled as healthy_metal : 1
7a37d8aa labeled as healthy_metal : 1
7a37db3e labeled as healthy_metal : 1
7a37e7c8 labeled as healthy_metal : 1
7a37f24a labeled as healthy_metal : 1
7a382080 labeled as healthy_metal : 1
7a3823a0 labeled as healthy_metal : 1
7a382706 labeled as healthy_metal : 1
7a38463c labeled as healthy_metal : 1
7a3846aa labeled as healthy_metal : 1
7a386482 labeled as healthy_metal : 1
7a3874c2 labeled as healthy_metal : 1
7a387b84 labeled as healthy_metal : 1
7a387f4e labeled as healthy_metal : 1
7a389380 labeled as healthy_metal : 1
7a38a906 labeled as healthy_metal : 1
7a38bc98 lab

7a402a0a labeled as healthy_metal : 1
7a403856 labeled as healthy_metal : 1
7a404b20 labeled as healthy_metal : 1
7a4055d4 labeled as healthy_metal : 1
7a406862 labeled as healthy_metal : 1
7a406dda labeled as healthy_metal : 1
7a408176 labeled as healthy_metal : 1
7a408fa4 labeled as healthy_metal : 1
7a409300 labeled as healthy_metal : 1
7a40b272 labeled as healthy_metal : 1
7a40b696 labeled as healthy_metal : 1
7a40cf64 labeled as healthy_metal : 1
7a40d036 labeled as healthy_metal : 1
7a40e364 labeled as healthy_metal : 1
7a40fae8 labeled as healthy_metal : 1
7a410038 labeled as healthy_metal : 1
7a41038a labeled as healthy_metal : 1
7a411898 labeled as healthy_metal : 1
7a4121bc labeled as healthy_metal : 1
7a412c8e labeled as healthy_metal : 1
7a414a66 labeled as healthy_metal : 1
7a4151dc labeled as healthy_metal : 1
7a41638e labeled as healthy_metal : 1
7a416532 labeled as healthy_metal : 1
7a41894a labeled as healthy_metal : 1
7a419aa2 labeled as healthy_metal : 1
7a41b82a lab

In [7]:
# Preview the first rows: image id, feature vector and integer label
features_train.head()

Unnamed: 0,id,features,label
0,7a1e2ea0,"[4.3347397, 2.9863954, 0.46881634, 1.6075785, ...",0
1,7a2257a0,"[8.629524, 4.150274, 0.0, 0.102268636, 0.44596...",0
2,7a22d0d6,"[6.7039595, 5.165891, 0.5288703, 1.8951656, 5....",0
3,7a24e786,"[2.4254084, 4.603034, 0.0, 0.4341668, 5.365196...",0
4,7a2556e4,"[7.577344, 4.1386714, 0.69517213, 0.38018358, ...",0


In [8]:
import pickle
from os import makedirs
from os.path import exists

# Folder that receives the pickled feature dataframes
pickle_path = join('..', '..', 'pickles')
# exist_ok=True creates the folder only when it is missing, without the
# separate exists() check that could race with another process
makedirs(pickle_path, exist_ok=True)

# File name pattern: resnet50_features_<region>_train.pkl
with open(join(pickle_path, 'resnet50_features_' + region + '_train.pkl'), 'wb') as f:
    pickle.dump(features_train, f)

### Visualize the features

Reduce the dimensionality of the features to 2D using [t-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) so that they can be shown in a scatter plot.

In [30]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib
from sklearn.decomposition import PCA
%matplotlib inline

In [11]:
def plot_tSNE(features, labels=None, number_of_materials=5, random_state=None):
    """Embed `features` in 2D with t-SNE and show them as a scatter plot.

    Parameters
    ----------
    features : numpy.ndarray
        Feature matrix with one row per sample.
    labels : array-like, optional
        One value per sample, used to colour the points. When omitted,
        all points get the label 0.
    number_of_materials : int, optional
        Number of discrete colour bins; the colour scale spans
        0..number_of_materials.
    random_state : int or None, optional
        Seed forwarded to `TSNE`. Pass an integer to make the embedding
        reproducible between runs; the default `None` leaves it unseeded.

    Returns
    -------
    None
        The figure is drawn as a side effect and the shape of the
        embedding is printed.
    """
    if labels is None:
        labels = np.zeros((features.shape[0]))

    tsne_features = TSNE(n_components=2, random_state=random_state).fit_transform(features)

    # Rebuild the jet colormap from its individual colours
    base_cmap = plt.cm.jet
    cmaplist = [base_cmap(i) for i in range(base_cmap.N)]
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list('Custom cmap', cmaplist, base_cmap.N)

    # One colour bin per material: boundaries at 0, 1, ..., number_of_materials
    bounds = np.linspace(0, number_of_materials, number_of_materials + 1)
    norm = matplotlib.colors.BoundaryNorm(bounds, cmap.N)

    print(tsne_features.shape)

    plt.figure()
    scat = plt.scatter(tsne_features[:, 0], tsne_features[:, 1], c=labels, cmap=cmap, norm=norm)
    plt.colorbar(scat, spacing='proportional', ticks=bounds)
    plt.title('t-SNE embedding of the features')
    plt.xlabel('t-SNE dimension 1')
    plt.ylabel('t-SNE dimension 2')

Convert features within the dataframe to a single matrix

In [12]:
# Stack the per-image feature vectors as rows: one row per image
feature_rows = features_train['features'].tolist()
feat_matrix = np.stack(feature_rows, axis=0)

# Integer class label of every row, in the same order
labels = features_train['label'].to_numpy()

In [13]:
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so no
# plot was produced in the saved run - re-run to completion before sharing
plot_tSNE(feat_matrix, labels)

KeyboardInterrupt: 

## 2. Calculate features on the test data


This procedure is similar to the one for the training data, except that we do not know the labels and the folder with test images is not subdivided by material.

In [21]:
def calculate_features_test(test_dir):
    """Compute ResNet50 features for all unlabelled test images.

    Every file found below `test_dir` (including nested folders) is loaded
    as an image and passed through the module-level `model`.

    Parameters
    ----------
    test_dir : str
        Path of the folder holding the test images.

    Returns
    -------
    pandas.DataFrame
        One row per image with the columns `id` (filename up to the first
        dot) and `features` (flattened 1-D numpy array returned by the
        model). The columns are present even when no image was found.
    """
    columns = ['id', 'features']

    # Collect plain records and build the dataframe once at the end:
    # DataFrame.append copied the whole frame on every call and is no
    # longer available in pandas 2.0 and later
    records = []

    # Walk through all images
    for root, dirs, files in os.walk(test_dir):
        for file in files:
            img_fp = join(root, file)
            image_id = file.split('.')[0]
            print("Calculate features for", image_id)

            # Resize to the 224x224 network input size and preprocess
            img = image.load_img(img_fp, target_size=(224, 224))
            img_data = image.img_to_array(img)
            # The model expects a batch dimension in front
            img_data = np.expand_dims(img_data, axis=0)
            img_data = preprocess_input(img_data)

            # Compute features
            resnet50_feature = model.predict(img_data)
            resnet50_feature_np = np.array(resnet50_feature).flatten()

            records.append({'id': image_id,
                            'features': resnet50_feature_np})

    return pd.DataFrame(records, columns=columns)

In [22]:
# Runs the network once per test image and prints one line per image
features_test = calculate_features_test(test_dir)

Calculate features for 7a44e1d0
Calculate features for 7a44e4c8
Calculate features for 7a44e7b6
Calculate features for 7a44e96e
Calculate features for 7a44e9dc
Calculate features for 7a44ecde
Calculate features for 7a44f4c2
Calculate features for 7a44f5f8
Calculate features for 7a44f968
Calculate features for 7a44fbd4
Calculate features for 7a44fe4a
Calculate features for 7a4506b0
Calculate features for 7a450f8e
Calculate features for 7a4516aa
Calculate features for 7a4522ee
Calculate features for 7a452708
Calculate features for 7a45311c
Calculate features for 7a4545f8
Calculate features for 7a454de6
Calculate features for 7a455be2
Calculate features for 7a455f3e
Calculate features for 7a455fac
Calculate features for 7a456452
Calculate features for 7a4572a8
Calculate features for 7a45813a
Calculate features for 7a458c98
Calculate features for 7a4591ca
Calculate features for 7a459ce2
Calculate features for 7a459fee
Calculate features for 7a45a78c
Calculate features for 7a45a7fa
Calculat



Calculate features for 7a4af85e
Calculate features for 7a4afa70
Calculate features for 7a4afe80
Calculate features for 7a4b063c
Calculate features for 7a4b0f38
Calculate features for 7a4b1226
Calculate features for 7a4b149c
Calculate features for 7a4b15d2
Calculate features for 7a4b2630
Calculate features for 7a4b2702
Calculate features for 7a4b2ab8
Calculate features for 7a4b3152
Calculate features for 7a4b35bc
Calculate features for 7a4b3e86
Calculate features for 7a4b4174
Calculate features for 7a4b46c4
Calculate features for 7a4b493a
Calculate features for 7a4b5448
Calculate features for 7a4b55ec
Calculate features for 7a4b56be
Calculate features for 7a4b61ea
Calculate features for 7a4b6bb8
Calculate features for 7a4b7bc6
Calculate features for 7a4b7cfc
Calculate features for 7a4b8396
Calculate features for 7a4b83fa
Calculate features for 7a4b89f4
Calculate features for 7a4b8c60
Calculate features for 7a4b9142
Calculate features for 7a4b976e
Calculate features for 7a4b9a48
Calculat

Calculate features for 7a503e4a
Calculate features for 7a504534
Calculate features for 7a504a20
Calculate features for 7a504ffc
Calculate features for 7a5055c4
Calculate features for 7a5056fa
Calculate features for 7a5058a8
Calculate features for 7a505f88
Calculate features for 7a5062c6
Calculate features for 7a50646a
Calculate features for 7a507e1e
Calculate features for 7a507e8c
Calculate features for 7a50815c
Calculate features for 7a509264
Calculate features for 7a50b47e
Calculate features for 7a50b6f4
Calculate features for 7a50c540
Calculate features for 7a50c748
Calculate features for 7a50d24c
Calculate features for 7a50e156
Calculate features for 7a50e980
Calculate features for 7a50f222
Calculate features for 7a50f510
Calculate features for 7a50f984
Calculate features for 7a50fa56
Calculate features for 7a50ff9c
Calculate features for 7a5112f2
Calculate features for 7a5115d6
Calculate features for 7a51224c
Calculate features for 7a513070
Calculate features for 7a5139c6
Calculat

In [23]:
# Preview the first rows: image id and feature vector (no label for test data)
features_test.head()

Unnamed: 0,id,features
0,7a44e1d0,"[6.2197113, 2.110661, 1.13708, 0.0, 0.0, 0.504..."
1,7a44e4c8,"[1.2535996, 0.0, 0.0, 0.15350294, 0.49344027, ..."
2,7a44e7b6,"[3.1035824, 3.8894126, 2.2028537, 0.0, 0.0, 2...."
3,7a44e96e,"[3.6370928, 4.50314, 2.9926593, 1.0457133, 1.0..."
4,7a44e9dc,"[0.5092753, 4.6801777, 1.3032107, 0.0, 0.0, 0...."


In [24]:
# Same naming scheme as the training features: resnet50_features_<region>_test.pkl
test_pickle_fp = join(pickle_path, 'resnet50_features_' + region + '_test.pkl')
with open(test_pickle_fp, 'wb') as out_file:
    pickle.dump(features_test, out_file)