In [1]:
import numpy as np
import pandas as pd

import csv

# Loading Files

In [2]:
# Read the training and test tables from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv")
)
# Additional input, currently disabled:
# rec_sol_df = pd.read_csv('data/recognition_solution_v2.1.csv')

In [3]:
# Treat the literal string 'None' as a missing value, then discard every row
# that contains at least one missing field.
train_df = (
    train_df
    .replace('None', np.nan)
    .dropna(axis=0, how='any')
)

In [4]:
# Preview the first five rows of the cleaned training table.
train_df.head(n=5)

Unnamed: 0,id,url,landmark_id
0,97c0a12e07ae8dd5,http://lh4.ggpht.com/-f8xYA5l4apw/RSziSQVaABI/...,6347
1,650c989dd3493748,https://lh5.googleusercontent.com/-PUnMrX7oOyA...,12519
2,05e63ca9b2cde1f4,http://mw2.google.com/mw-panoramio/photos/medi...,264
3,08672eddcb2b7c93,http://lh3.ggpht.com/-9fgSxDYwhHA/SMvGEoltKTI/...,13287
4,fc49cb32ef7f1e89,http://lh6.ggpht.com/-UGAXxvPbr98/S-jGZbyMIPI/...,4018


# Preprocessing

In [37]:
# --- Sampling configuration ---
NUM_IMG = 10000        # a landmark is kept only if it has more images than this
sampling_rate = 0.01   # share of each kept landmark's images to draw
random_state = 17      # fixed seed so the draw is repeatable

# Number of image ids per landmark_id, most frequent landmark first.
landmarks = (
    train_df
    .groupby('landmark_id')['id']
    .count()
    .sort_values(ascending=False)
)
# How many landmarks have an image count above the NUM_IMG threshold.
num_classes = int((landmarks > NUM_IMG).sum())
landmarks

landmark_id
9633     49531
6051     49319
6599     22817
9779     18099
2061     13023
         ...  
5043         1
11333        1
8309         1
5745         1
10556        1
Name: id, Length: 14947, dtype: int64

## Sampling

In [38]:
# `landmarks` is ordered by descending image count, so its first
# `num_classes` labels are the landmarks above the NUM_IMG threshold.
top_rank = landmarks.head(num_classes).index
len(top_rank)

6

In [39]:
# Boolean mask: True for rows whose landmark is one of the selected classes.
topRank_filter = train_df.landmark_id.isin(top_rank)
# Training rows restricted to the selected classes.
topRank_train = train_df.loc[topRank_filter]

topRank_train.shape

(163618, 3)

In [40]:
# Draw a random `sampling_rate` fraction of the rows of every selected
# landmark (the classes in `top_rank`, i.e. those with more than NUM_IMG
# images -- not a fixed "top 100"). The same seed is passed to each group's
# draw so the result is repeatable.
sample_gby = topRank_train.groupby(by='landmark_id', as_index=False).apply(
    lambda x: x.sample(frac=sampling_rate, random_state=random_state)
)
img_id_list = sample_gby['id'].tolist()

# Re-select the sampled rows from the filtered frame by image id; a boolean
# mask keeps the rows in the same order as `topRank_train`.
train_sample = topRank_train.loc[topRank_train['id'].isin(img_id_list)]

# Selecting by id only reproduces the draw when ids are not duplicated;
# otherwise extra rows would be pulled in silently.
assert len(train_sample) == len(img_id_list), (
    "id-based selection returned a different number of rows than were "
    "sampled; check for duplicate image ids"
)

train_sample.shape

(1635, 3)

In [41]:
# Persist the sample to disk (non-numeric fields quoted, index not written),
# then load the saved file back into a separate frame.
train_sample.to_csv(
    'data/train_sample_temp.csv',
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)
train_sample_df = pd.read_csv('data/train_sample_temp.csv')

In [42]:
# (rows, columns) of the sample as re-read from the CSV file.
train_sample_df.shape

(1635, 3)

In [43]:
# Look up one specific image id in the reloaded sample.
train_sample_df.loc[train_sample_df['id'] == '0b7ca98b7ff8c0cd']

Unnamed: 0,id,url,landmark_id


In [44]:
# Sizes of the full data set.
print("Total training data size:", len(train_df))
print("Total test data size:", len(test_df))
print("Total number of unique landmark_id's:", len(landmarks))

# Sizes of the sampled subset.
print("Sampled training data size:", len(train_sample))
print("Sampled unique landmark_id's:", train_sample["landmark_id"].unique().size)

Total training data size: 1193541
Total test data size: 117703
Total number of unique landmark_id's: 14947
Sampled training data size: 1635
Sampled unique landmark_id's: 6


In [45]:
# Preview the first five rows of the sampled training subset.
train_sample.head(n=5)

Unnamed: 0,id,url,landmark_id
252,378e26d47fc10897,https://lh3.googleusercontent.com/-L3kzpgPaAeM...,6599
341,186f304309ea56b7,https://lh5.googleusercontent.com/-1PmqYO7TznA...,9779
1038,4570c7584fdc2135,https://lh4.googleusercontent.com/-Pgy374sfPFU...,6051
2717,4a5c56cc427238e8,https://lh6.googleusercontent.com/-QC_STlmhe_4...,6599
3098,aa7cee74dae9aa69,https://lh5.googleusercontent.com/-FbWVGTGL_zQ...,9633
