In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#          print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from collections import Counter
from IPython.display import display
import operator
import random
import matplotlib.pyplot as plt

import cv2

In [3]:
# Load the training metadata and the sample submission from the competition
# input directory, then show the sample submission as the cell output.
DATA_ROOT = "../input/happy-whale-and-dolphin"
train, submission = (
    pd.read_csv(f"{DATA_ROOT}/{csv_name}.csv")
    for csv_name in ("train", "sample_submission")
)
submission

## Data Cleansing
- species TYPO correction, integrate some similar species
    - https://www.kaggle.com/c/happy-whale-and-dolphin/discussion/305468
    - pilot_whale and globis are both short_finned_pilot_whale and thus the three can be merged.
    - there are two misspellings:
        - bottlenose_dolphin and bottlenose_dolpin
        - killer_whale and kiler_whale

In [4]:
# Alphabetically ordered list of the distinct raw species labels, shown so the
# misspelled / duplicated names can be spotted by eye.
df_species = train['species'].unique().tolist()
df_species.sort()
df_species

In [5]:
# Map misspelled or duplicated species labels onto one canonical spelling.
# 'beluga' becomes 'beluga_whale' so that every label ends in '_whale' or
# '_dolphin', which the 'class' column below relies on.
SPECIES_FIXES = {
    'beluga': 'beluga_whale',
    'bottlenose_dolpin': 'bottlenose_dolphin',
    'pilot_whale': 'short_finned_pilot_whale',
    'globis': 'short_finned_pilot_whale',
    'kiler_whale': 'killer_whale',
}

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form emits a FutureWarning on pandas 2.x
# and leaves `train` unchanged when Copy-on-Write is enabled.
train['species'] = train['species'].replace(SPECIES_FIXES)

# Coarse class label = last underscore-separated token of the species name.
train['class'] = train['species'].str.split('_').str[-1]

# Image directories and the full path of every training image.
TRAIN_PATH = "../input/happy-whale-and-dolphin/train_images/"
TEST_PATH = "../input/happy-whale-and-dolphin/test_images/"
train['img_path'] = TRAIN_PATH + train['image']

In [6]:

# Sanity checks: compare the number of distinct values in the metadata tables
# with the number of files actually present in the image folders.
n_train_images = train['image'].nunique()
print(f"train dataframe image 컬럼 개수: {n_train_images}\n")

n_train_files = len(os.listdir(TRAIN_PATH))
print(f"train_images 폴더 내 이미지 개수: {n_train_files}\n")

n_test_files = len(os.listdir(TEST_PATH))
print(f"test_images 폴더 내 이미지 개수: {n_test_files}\n")

n_individuals = train['individual_id'].nunique()
print(f"train dataframe individual_id 컬럼 개수: {n_individuals}\n")

n_submission_images = submission['image'].nunique()
print(f"submission dataframe image 컬럼 개수: {n_submission_images}\n")

In [7]:
# (species, image count) pairs ordered from the rarest to the most common species.
cnts = sorted(Counter(train['species']).items(), key=lambda pair: pair[1])

species_names = [name for name, _ in cnts]
species_totals = [total for _, total in cnts]

# Horizontal bars keep the long species names readable.
plt.figure(figsize=(10, 7))
plt.barh(species_names, species_totals)
plt.show()

### Distribution of Whales and Dolphins

In [8]:
# Number of images per coarse class and in total.
is_dolphin = train['class'] == 'dolphin'
is_whale = train['class'] == 'whale'
dolphin_cnt = int(is_dolphin.sum())
whale_cnt = int(is_whale.sum())
total_cnt = len(train)

print(dolphin_cnt, whale_cnt, total_cnt)

# Share of each class as a percentage of all images, rounded to two decimals.
dolphin_pct = round(dolphin_cnt / total_cnt * 100, 2)
whale_pct = round(whale_cnt / total_cnt * 100, 2)
print(f"돌고래:고래 = {dolphin_pct}:{whale_pct}")

In [9]:
# Donut chart of how the images are split between the coarse classes.
# Count once so that labels and sizes are guaranteed to come from the same
# ordering.
class_counts = train['class'].value_counts()
labels = list(class_counts.index)
sizes = class_counts.values

# One offset per wedge: ax.pie raises if `explode` and the data differ in
# length, so derive it from the data instead of hard-coding two entries.
explode = (0.05,) * len(sizes)

fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Whales and Dolphins', size=20, font='Serif')

ax.pie(sizes, explode=explode, startangle=60, labels=labels, autopct="%1.0f%%",
       pctdistance=0.7, colors=["#0077b6","#90e0ef"])
# A white disc over the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))
plt.show()

It seems the dataset is also imbalanced across the two classes. This is worth keeping in mind when we split the dataset for model training (for example, by using a stratified split).

In [10]:
# Images per species, computed separately for whales and for dolphins.
whale_species_cnt = train.loc[train['class'] == 'whale', 'species'].value_counts()
dolphin_species_cnt = train.loc[train['class'] == 'dolphin', 'species'].value_counts()

# Build two-column tables ('species', 'counts'), sorted by ascending count.
# Naming the index and the value column explicitly avoids depending on the
# labels that value_counts()/reset_index() generate, which changed in pandas 2.0
# (the old {'index': ..., 'species': ...} rename produces the wrong columns there).
df_whale_species = (
    whale_species_cnt
    .rename_axis('species')
    .reset_index(name='counts')
    .sort_values(by=['counts'])
)
df_dolphin_species = (
    dolphin_species_cnt
    .rename_axis('species')
    .reset_index(name='counts')
    .sort_values(by=['counts'])
)

display(df_whale_species)
display(df_dolphin_species)

In [11]:
# Whale species on the top panel, dolphin species on the bottom one. Both panels
# use the same x-range so that bar lengths can be compared between them.
fig, axes = plt.subplots(2,1, figsize=(15, 8))
for panel, species_table in zip(axes, (df_whale_species, df_dolphin_species)):
    panel.barh(species_table['species'], species_table['counts'])
    panel.set_xlim(0, 12000)
fig.tight_layout()

In [12]:
# Show three randomly chosen training images, each with its metadata row(s).
for i in range(3):
    temp_path = np.random.choice(train['img_path'])
    im = plt.imread(temp_path)
    plt.figure(figsize=(7, 7))
    plt.imshow(im)
    # Title the figure with the image file name. Splitting the path on '_' cut
    # inside the 'train_images' directory name and produced 'images/<file>'.
    plt.title(os.path.basename(temp_path))
    # Hide the pixel-coordinate ticks.
    plt.xticks([])
    plt.yticks([])
    display(train[train['img_path'] == temp_path])
    plt.show()

In [13]:
# One row per individual_id holding the number of images recorded for it;
# the cell output lists the individuals with the most images first.
df = (
    train
    .groupby('individual_id')['image']
    .count()
    .to_frame(name='image_cnts')
)
df.sort_values(by='image_cnts', ascending=False)

individual_id
- individual_id 하나당 이미지가 1장뿐인 경우만 있는 것이 아니라, 400장 가까이 있는 경우도 있음.
- 특정 개체에 대해서는 이미 많은 데이터가 수집되어 있고, 사진이 한 장밖에 없는 개체도 존재하는 것 같음

In [14]:
# Read the first training image exactly as stored (alpha channel kept if present).
first_image_path = train['img_path'].iloc[0]
img = cv2.imread(first_image_path, cv2.IMREAD_UNCHANGED)

# cv2.imread signals failure by returning None rather than raising.
if img is None:
    raise FileNotFoundError(f"cv2.imread could not read {first_image_path}")

# OpenCV stores colour channels as BGR(A) while matplotlib expects RGB(A), so
# convert a copy for display; `img` itself is left untouched for later cells.
if img.ndim == 3 and img.shape[2] == 3:
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
elif img.ndim == 3 and img.shape[2] == 4:
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGRA2RGBA)
else:
    # Single-channel image: there is no channel order to fix.
    img_rgb = img

plt.imshow(img_rgb)

img = cv2.imread(file_path, flag)
- flag 1 = cv2.IMREAD_COLOR : 이미지 파일을 Color로 읽어들입니다. 투명한 부분은 무시되며, Default값입니다.
- flag 0 = cv2.IMREAD_GRAYSCALE : 이미지를 Grayscale로 읽어 들입니다. 실제 이미지 처리시 중간단계로 많이 사용합니다.
- flag -1 = cv2.IMREAD_UNCHANGED : 이미지파일을 alpha channel까지 포함하여 읽어 들입니다.

The `shape` attribute of an image read by `cv2.imread` is a 3-element tuple (for a colour image):
- (Y, X, C)
    - Y: 행
    - X: 열
    - C: 색을 표현하는 BGR

In [15]:
# Dimensions of the array returned by cv2.imread above; for a colour image
# this is (rows, columns, channels).
img.shape