# Google Landmark Classification

### Downloading the dataset
The dataset was downloaded using the download script available on Kaggle.

#### Import the required libraries

In [10]:
import pandas as pd
import cv2 as cv
import os
import random
import numpy as np

In [12]:
from keras.preprocessing.image import img_to_array
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Dropout, Flatten, ZeroPadding2D, BatchNormalization, Activation
from keras.initializers import glorot_uniform
from keras.utils import to_categorical

Using TensorFlow backend.


ImportError: No module named 'tensorflow'

#### Function to load the data 

The function *load_data* reads the list of image names from the *train.csv* file, shuffles them, and looks each one up in the *train* directory to check that it has been downloaded correctly. The names of the images that are found are saved to the list *img_list*, which is then split into 'train' and 'dev' sets.

In [13]:
def load_data(train_folder, train_file, train_percent):
    """Build shuffled train/dev lists of the images that were actually downloaded.

    Args:
        train_folder: directory expected to contain one '<image name>.jpg' per image.
        train_file: path to the CSV file; its first column holds the image names.
        train_percent: fraction (0..1) of the downloaded images put in the train split.

    Returns:
        A tuple (train_data, img_list_train, img_list_test): the CSV rows as an
        array (row order untouched), followed by two disjoint lists of image
        names that together contain every image found in train_folder.
    """
    train_data = pd.read_csv(train_file).values

    # Image name is the first column in the csv file; keep only the names whose
    # file is present on disk.
    img_list = [
        img_name
        for img_name in train_data[:, 0]
        if os.path.exists(os.path.join(train_folder, img_name + '.jpg'))
    ]

    # Shuffle the filtered list itself. Shuffling a slice of train_data instead
    # would reorder the name column in place and detach names from their labels.
    random.seed(42)
    random.shuffle(img_list)

    # Split on the number of images actually found (not the number of CSV rows),
    # and make the two slices adjacent so no image is dropped.
    split_at = int(train_percent * len(img_list))
    img_list_train = img_list[:split_at]
    img_list_test = img_list[split_at:]

    return train_data, img_list_train, img_list_test

    # print(img_list[0:10])

    # for img_name in img_list:

    #     img = cv.imread(os.path.join(train_folder, img_name + '.jpg'))

    #     if img is not None:
    #         img = cv.resize(img, (100, 100))
    #         img = img_to_array(img)
    #         X.append(img)

    #         for i in range(train_data.shape[0]):
    #             if img_name == train_data[i, 0]:
    #                 label = train_data[i, 2]
    #                 Y.append(label)    

    # X = np.array(X, dtype='float32') / 255.0
    # Y = np.array(Y)

    # print(X.shape, Y.shape)        

In [14]:
# NOTE(review): this import sits apart from the other imports near the top of
# the notebook, and tf.Session / tf.ConfigProto look like TensorFlow 1.x-style
# APIs — confirm the installed TensorFlow version still provides them.
import tensorflow as tf
# Open a session with log_device_placement=True (presumably so TensorFlow logs
# which device each op is placed on — confirm against the TF documentation).
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

ImportError: No module named 'tensorflow'

In [None]:
# CSV index of the dataset and the directory holding the downloaded images.
train_file = 'train.csv'
train_folder = '/mnt/disks/dataset/train'  # NOTE(review): absolute, machine-specific path

# Split the downloaded images, asking for 98% of them in the train list.
train_data, img_list_train, img_list_test = load_data(train_folder, train_file, train_percent = 0.98)

In [None]:
# Sanity check: number of image names in the train list and in the test (dev) list.
print(len(img_list_train), len(img_list_test))

In [None]:
def load_batch(batch_no, train_data, img_list_train, folder, batch_size, img_size=(500, 500)):
    """Load one batch of images and their labels.

    Args:
        batch_no: zero-based index of the batch inside img_list_train.
        train_data: 2-D array of CSV rows; column 0 is the image name and
            column 2 is the label.
        img_list_train: list of image names to draw the batch from.
        folder: directory containing the '<image name>.jpg' files.
        batch_size: number of image names per batch.
        img_size: size passed to cv.resize for every image. Defaults to
            (500, 500); pass the spatial size the model was built for
            (ConvModel in this notebook defaults to 300x300 inputs).

    Returns:
        (X_batch, Y_batch): a float32 array of images scaled by 1/255 and the
        array of matching labels. Images that cannot be read, or whose name is
        not present in train_data, are skipped, so a batch may hold fewer than
        batch_size entries (or be empty when batch_no is past the end).
    """
    # Name -> label lookup over *all* rows of train_data, built once per call
    # instead of scanning the rows again for every image. If a name occurs in
    # several rows, the last row wins.
    label_by_name = dict(zip(train_data[:, 0], train_data[:, 2]))

    img_list_batch = img_list_train[batch_no * batch_size:(batch_no + 1) * batch_size]

    X_batch = []
    Y_batch = []
    for img_name in img_list_batch:
        if img_name not in label_by_name:
            continue                      # no label available for this image

        # Read from the folder given by the caller, not from a notebook global.
        img = cv.imread(os.path.join(folder, img_name + '.jpg'))
        if img is None:
            continue                      # missing or unreadable file

        img = cv.resize(img, img_size)
        X_batch.append(img_to_array(img))
        Y_batch.append(label_by_name[img_name])

    X_batch = np.array(X_batch, dtype = 'float32') / 255.0
    Y_batch = np.array(Y_batch)

    return X_batch, Y_batch

In [None]:
# Smoke test: load batch index 1239 (500 image names per batch) from the train list.
# NOTE(review): 1239 is hard-coded; if img_list_train holds fewer than
# 1239 * 500 names the slice inside load_batch is empty and so is the result.
first_X, first_Y = load_batch(1239, train_data, img_list_train, train_folder, batch_size = 500)

In [None]:
def ConvModel(input_shape = (300, 300, 3), classes = 15000):
    """Build a small single-convolution image classifier.

    Args:
        input_shape: shape of one input image, given to keras Input.
        classes: number of output classes; sets the width of the final
            softmax layer, so it must match the width of the one-hot labels.

    Returns:
        An uncompiled keras Model named 'SmallConv' that maps an image to a
        vector of `classes` softmax outputs.
    """
    X_input = Input(input_shape)

    X = ZeroPadding2D((3, 3))(X_input)

    # Single convolution stage: conv -> batch norm -> relu -> max pool.
    X = Conv2D(64, (11, 11), strides = (2, 2), name = 'conv1', kernel_initializer=glorot_uniform(seed = 0))(X)
    X = BatchNormalization(axis = 3, name = 'bn_conv1')(X)
    X = Activation('relu')(X)
    X = MaxPooling2D((3, 3), strides = (2, 2))(X)

    # Classification head: one output per class with softmax, which is what a
    # categorical cross-entropy loss over one-hot labels expects. A single
    # sigmoid unit cannot represent a multi-class prediction.
    X = Flatten()(X)
    X = Dense(classes, activation = 'softmax', name = 'fc')(X)

    model = Model(inputs = X_input, outputs = X, name = 'SmallConv')

    return model

In [None]:
# Build the network for 300x300 3-channel inputs, passing 15000 as the class count.
model = ConvModel(input_shape = (300, 300, 3), classes = 15000)

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
batch_size = 500
num_classes = 15000   # width of the one-hot labels; must equal the `classes` value given to ConvModel
test_folder = 'test'  # folder name for the separate test download; not used for the dev split below
iteration_no = 1 #len(img_list_train) // batch_size

for iteration in range(iteration_no):
    X_train, Y_train_orig = load_batch(iteration, train_data, img_list_train, train_folder, batch_size)
    # img_list_test is the dev split carved out of the training download by
    # load_data, so its image files live in train_folder as well.
    X_test, Y_test_orig = load_batch(iteration, train_data, img_list_test, train_folder, batch_size)
    
    # Convert training and test labels to one hot matrices
    Y_train = to_categorical(Y_train_orig, num_classes)
    Y_test = to_categorical(Y_test_orig, num_classes)
    
    print ("number of training examples = " + str(X_train.shape[0]))
    print ("number of test examples = " + str(X_test.shape[0]))
    print ("X_train shape: " + str(X_train.shape))
    print ("Y_train shape: " + str(Y_train.shape))
    print ("X_test shape: " + str(X_test.shape))
    print ("Y_test shape: " + str(Y_test.shape))