# Análisis Exploratorio

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import os

En esta competencia las imágenes se encuentran distribuidas de la siguiente manera: 

- <b>Train.csv:</b> Indica la URL de la imagen, el id de la imagen y la clase (landmark_id). Son las imágenes a usar para el entrenamiento. 
- <b>Test.csv:</b> Indica la URL de las imágenes de consulta (query). 
- <b>Index.csv:</b> Indica el banco de imágenes desde donde se realizará la recuperación. 

#### Files 

In [2]:
# Locations of the three competition CSVs, relative to this notebook.
train_file_path = '../google-landmarks-dataset/train.csv'
index_file_path = '../google-landmarks-dataset/index.csv'
test_file_path = '../google-landmarks-dataset/test.csv'

# Load each CSV into its own DataFrame; the unpacking order matches the
# order of the path tuple (train, index, test).
df_train, df_index, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (train_file_path, index_file_path, test_file_path)
)

In [21]:
# Report the (rows, columns) shape and the column labels of the training table.
train_shape, train_columns = df_train.shape, df_train.columns
print("Training data size:", train_shape)
print("Training data columns:", train_columns)

Training data size: (1048575, 3)
Training data columns: Index(['id', 'url', 'landmark_id'], dtype='object')


In [22]:
# Report the (rows, columns) shape and the column labels of the test table.
test_shape, test_columns = df_test.shape, df_test.columns
print("Testing data size:", test_shape)
print("Testing data columns:", test_columns)

Testing data size: (117703, 2)
Testing data columns: Index(['id', 'url'], dtype='object')


In [23]:
# Report the (rows, columns) shape and the column labels of the index table.
index_shape, index_columns = df_index.shape, df_index.columns
print("Index data size:", index_shape)
print("Index data columns:", index_columns)

Index data size: (1048575, 2)
Index data columns: Index(['id', 'url'], dtype='object')


El dataset de entrenamiento (train) es el único que tiene el landmark_id (clase).

¿Cuántas imágenes se descargaron finalmente?
Muchas de las imágenes no se encontraban disponibles en el momento en que se intentaron descargar vía URL. 

#### Missing data

In [3]:
# Per-column count of null (NaN/None) entries in the training table.
missing = pd.isnull(df_train).sum()

In [4]:
# Display the per-column null counts stored in `missing`.
missing

id             0
url            0
landmark_id    0
dtype: int64

In [3]:
# Preview only the first rows: the training table has ~1M rows, so echoing the
# whole frame adds nothing over a short sample.
df_train.head()

NameError: name 'df_train' is not defined

In [5]:
# Summarise null (NaN) entries per column of the training table.
print('data is None:')
missing = df_train.isnull().sum()
# Divide by the total number of rows. DataFrame.count() skips nulls, so using
# it as the denominator would yield missing/non-missing, not the missing share.
percent = missing / len(df_train)
# NOTE: the 'Percent' column holds a fraction in [0, 1], not a value scaled to 100.
missing_train_data = pd.concat([missing, percent], axis=1, keys=['Missing', 'Percent'])
missing_train_data.head()


data is None:


Unnamed: 0,Missing,Percent
id,0,0.0
url,0,0.0
landmark_id,0,0.0


In [6]:
# Summarise null (NaN) entries per column of the index table.
print('data is None:')
missing = df_index.isnull().sum()
# Divide by the total number of rows. DataFrame.count() skips nulls, so using
# it as the denominator would yield missing/non-missing, not the missing share.
percent = missing / len(df_index)
# NOTE: the 'Percent' column holds a fraction in [0, 1], not a value scaled to 100.
missing_index_data = pd.concat([missing, percent], axis=1, keys=['Missing', 'Percent'])
missing_index_data.head()


data is None:


Unnamed: 0,Missing,Percent
id,0,0.0
url,0,0.0


In [7]:
print('data is None:')
missing = df_test.isnull().sum()
percent = missing/df_test.count()
missing_test_data = pd.concat([missing, percent], axis=1, keys=['Missing', 'Percent'])
missing_test_data.head()


data is None:


Unnamed: 0,Missing,Percent
id,0,0.0
url,0,0.0


In [8]:
# Summarise cells of the training table that hold the literal string 'None'
# (a placeholder that isnull() does not detect).
print('data is \'None\':')
missing = (df_train == 'None').sum()
# Share of all rows; len() is used rather than count() so that any real NaN
# values cannot shrink the denominator.
percent = missing / len(df_train)
# NOTE: the 'Percent' column holds a fraction in [0, 1], not a value scaled to 100.
missing_train_data = pd.concat([missing, percent], axis=1, keys=['Missing', 'Percent'])
missing_train_data.head()



data is 'None':


Unnamed: 0,Missing,Percent
id,0,0.0
url,26983,0.025733
landmark_id,26983,0.025733


In [9]:
# Summarise cells of the index table that hold the literal string 'None'
# (a placeholder that isnull() does not detect).
print('data is \'None\':')
missing = (df_index == 'None').sum()
# Share of all rows; len() is used rather than count() so that any real NaN
# values cannot shrink the denominator.
percent = missing / len(df_index)
# NOTE: the 'Percent' column holds a fraction in [0, 1], not a value scaled to 100.
missing_index_data = pd.concat([missing, percent], axis=1, keys=['Missing', 'Percent'])
missing_index_data.head()


data is 'None':


Unnamed: 0,Missing,Percent
id,0,0.0
url,26889,0.025643


In [10]:
print('data is \'None\':')
missing = (df_test == 'None').sum()
percent = missing/df_test.count()
missing_test_data = pd.concat([missing, percent], axis=1, keys=['Missing', 'Percent'])
missing_test_data.head()


data is 'None':


Unnamed: 0,Missing,Percent
id,0,0.0
url,5156,0.043805


In [11]:
# Summary statistics (count / unique / top / freq) of the class label column.
df_train.landmark_id.describe()


count     1048575
unique      14912
top          9633
freq        42336
Name: landmark_id, dtype: object

In [None]:
# Distribution of landmark ids across the training rows.
sns.set()
plt.title('Landmark_id Distribuition')
# landmark_id is an object-dtype column that also contains the placeholder
# string 'None', which distplot cannot bin. Coerce the ids to numbers and drop
# the rows that do not parse before plotting.
landmark_ids_numeric = pd.to_numeric(df_train['landmark_id'], errors='coerce').dropna()
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot replace it); kept here to match the seaborn API this
# notebook already uses.
sns.distplot(landmark_ids_numeric)


In [None]:
# Training set: number of images per class (line plot, log-scaled y axis)
sns.set()
plt.title('Training set: number of images per class(line plot)')
sns.set_color_codes("pastel")
# value_counts() orders classes from most to least frequent, so the row
# position used as the x axis is the frequency rank of each landmark.
# NOTE(review): rows whose landmark_id is the literal string 'None' are counted
# here as one more class.
landmarks_fold = (
    df_train['landmark_id']
    .value_counts()
    .rename_axis('landmark_id')
    .reset_index(name='count')
)
ax = landmarks_fold['count'].plot(logy=True, grid=True)
# Tilt the x tick labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=30)
ax.set(xlabel="Landmarks", ylabel="Number of images")


In [None]:
# Training set: number of images per class (scatter plot)
sns.set()
# landmark_id is an object-dtype column that also contains the placeholder
# string 'None'. Convert the ids to integers (dropping rows that do not parse)
# so the x axis is numeric and sort_values orders ids numerically rather than
# lexicographically.
landmark_ids_numeric = (
    pd.to_numeric(df_train['landmark_id'], errors='coerce')
    .dropna()
    .astype('int64')
)
landmarks_fold_sorted = (
    landmark_ids_numeric
    .value_counts()
    .rename_axis('landmark_id')
    .reset_index(name='count')
    .sort_values('landmark_id')
)
# DataFrame.plot.scatter draws on its own axes, so the title is passed here
# instead of through plt.title.
ax = landmarks_fold_sorted.plot.scatter(
    x='landmark_id', y='count',
    title='Training set: number of images per class(statter plot)')
# Tilt the x tick labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=30)
ax.set(xlabel="Landmarks", ylabel="Number of images")



In [None]:
# Most frequent landmark_ids
# Rows whose landmark_id is the literal placeholder string 'None' carry no
# class label; leave them out so the placeholder is not ranked as a landmark.
labelled_ids = df_train.loc[df_train['landmark_id'] != 'None', 'landmark_id']
# Top 10 classes as a two-column frame: landmark_id, count.
temp = (
    labelled_ids
    .value_counts()
    .head(10)
    .rename_axis('landmark_id')
    .reset_index(name='count')
)
temp

Se podrían utilizar estos landmarks para crear, tal vez, un subconjunto (subset) de entrenamiento. 

In [None]:
# Bar chart of the landmark counts currently held in `temp`.
plt.title('Most frequent landmarks')
sns.set_color_codes("pastel")
sns.barplot(data=temp, x="landmark_id", y="count", label="Count")
# Rotate the x tick labels of the current axes so the ids stay readable.
tick_labels = plt.gca().get_xticklabels()
plt.setp(tick_labels, rotation=45)
plt.show()


In [None]:
# Least frequent landmark_ids: the last 10 entries of the descending
# value_counts() ranking, as a two-column frame (landmark_id, count).
temp = (
    df_train['landmark_id']
    .value_counts()
    .tail(10)
    .rename_axis('landmark_id')
    .reset_index(name='count')
)
temp


In [None]:
# Extract site_names for train data
# The site name is the text between the first '//' of the URL and the next '/'.
# A vectorised regex extract is used instead of a Python loop with split():
# URLs without '//' (such as the 'None' placeholder rows) become NaN instead of
# raising IndexError.
df_train['site_name'] = df_train['url'].str.extract(r'//([^/]*)', expand=False)
# Number of training images per hosting site, most common first
# (value_counts() ignores the NaN rows).
data_sources = (
    df_train['site_name']
    .value_counts()
    .rename_axis('site_name')
    .reset_index(name='count')
)
data_sources.head()
