# Building Invasive Species Detector

Because scientists cannot sample a large number of areas, machine learning algorithms are used to predict the presence or absence of invasive species in areas that have not been sampled. The accuracy of this approach is far from optimal, but it still contributes to solving ecological problems.
This notebook develops a model that can identify whether or not invasive hydrangea is present in a photo of a forest. [Hydrangea](https://en.wikipedia.org/wiki/Hydrangea) is a kind of flower native to southern and eastern Asia (China, Japan, Taiwan, Korea, the Himalayas, and Indonesia) and the Americas. 
![img](https://upload.wikimedia.org/wikipedia/commons/7/7a/Hydrangeas_corner.JPG)

### **Reference**
* [Kernel by Bukan](https://www.kaggle.com/ambarish/invasive-species-monitoring-analysis)
* [Kernel by Luis Bronchal](https://www.kaggle.com/lbronchal/keras-pre-trained-vgg16-kaggle-runnable-version)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
from tqdm import tqdm
from skimage.transform import resize

from keras.models import Model
from keras.layers import Flatten, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator

from keras.applications.resnet50 import ResNet50

## 1. Upload data

In [None]:
# Load the training label table and preview its first rows.
labels_csv = '../input/train_labels.csv'
df = pd.read_csv(labels_csv)
df.head()

In [None]:
# Number of rows in the label table.
df.shape[0]

Let's take a glimpse at the list of file names in the train folder.

In [None]:
# Peek at the first ten entries of the training image directory.
path = '../input/train/'
train_entries = os.listdir(path)
train_entries[:10]

Now why don't we see what kind of pictures we have in the folder?

In [None]:
# Show the first ten training images (1.jpg .. 10.jpg) on a 2 x 5 grid.
fig, axs = plt.subplots(nrows = 2, ncols = 5, figsize = (20, 20))

# axs.flat walks the grid row by row, i.e. the same order as axs[i//5, i%5].
for i, ax in enumerate(axs.flat):
    filename = path + str(i + 1) + '.jpg'
    img = plt.imread(filename)          # the last image stays in `img` for the cells below
    ax.imshow(img)
    ax.axis('off')

# Adjust the layout once, after every subplot has been drawn; calling it
# inside the loop only repeated the same work ten times.
plt.tight_layout()

Hello, Hydrangea. Let's see the shape of images first

In [None]:
# Dimensions of the last image read above.
np.shape(img)

We need to resize this image to a proper size before feeding it to the model. How can we resize it? We can simply use the skimage library.

In [None]:
# Resize the sample image to a square im_size x im_size array with 3 channels.
im_size = 256
target_shape = (im_size, im_size, 3)

img_2 = resize(img, target_shape)
img_2.shape

In [None]:
# Original image (left) next to its resized copy (right).
fig, panels = plt.subplots(1, 2)
ax1, ax2 = panels
ax1.imshow(img)
ax2.imshow(img_2)

## 2. Data preprocessing

Let's apply all of these steps to the entire dataset!

In [None]:
# Load every training image, resize it and collect the results in X.
# Files are read as <path>1.jpg .. <path>N.jpg, where N is the number of rows in df.
X = []

for i in range(len(df)):
    filename = path + str(i + 1) + '.jpg'
    img = plt.imread(filename)
    # With its default preserve_range=False, skimage's resize converts the
    # image to floats following img_as_float, so integer pixel data comes back
    # scaled to [0, 1]. Dividing by 255 again would squash every pixel into
    # [0, 1/255], so no further scaling is applied here.
    # NOTE(review): assumes plt.imread yields uint8 data for these JPEGs - confirm.
    img = resize(img, (im_size, im_size, 3))         # resize the image file
    X.append(img)                                    # resize already returns a numpy array

In [None]:
# Draw one random ordering of the sample indices and apply it to both the
# images and the labels so that they stay aligned.
random_num = np.random.permutation(len(df))
y = df.invasive

X_shuffle = [X[position] for position in random_num]
y_shuffle = [y[position] for position in random_num]

In [None]:
# Shape of one shuffled sample.
np.shape(X_shuffle[2])

In [None]:
# Report the per-sample shape and the total number of samples.
sample_shape = X_shuffle[1].shape
n_samples = len(X_shuffle)
print("The size of each sample data is {}".format(sample_shape))
print("And the number of samples are {}".format(n_samples))

In [None]:
# Turn the shuffled lists into numpy arrays.
X_shuffle = np.array(X_shuffle)
y_shuffle = np.array(y_shuffle)

# Hold out the first 20 % of the shuffled samples for validation and keep
# the remainder for training.
split = int(len(df)*.2)

X_val, X_train = X_shuffle[:split], X_shuffle[split:]
y_val, y_train = y_shuffle[:split], y_shuffle[split:]

In [None]:
# Print the shape of each split to confirm the train / validation sizes.
for message, array in (
    ("The size of X_train shape is {}", X_train),
    ("The size of X_val shape is {}", X_val),
    ("The size of y_train shape is {}", y_train),
    ("The size of y_val shape is {}", y_val),
):
    print(message.format(array.shape))

## 3. Building - baseline: ResNet50

In [None]:
# Pre-trained model application
# Load the ImageNet weights: every layer of this backbone is frozen further
# down, so with weights = None the classification head would be trained on
# top of random convolutional features that are never updated.
# NOTE(review): 'imagenet' downloads the weights on first use; in an offline
# environment pass the path to a local weights file instead.
base_model = ResNet50(weights = 'imagenet', include_top = False, input_shape=(im_size, im_size, 3))

In [None]:
# Output shape of the convolutional base (built with include_top = False);
# this is what gets flattened by the classification head below.
base_model.output_shape

In [None]:
# Classification head on top of the ResNet50 feature maps:
# flatten -> dropout -> one sigmoid unit (binary output).
x = base_model.output

x = Flatten()(x)
x = Dropout(.5)(x)
# `output_dim` is the Keras 1 name of this argument (renamed to `units` in
# Keras 2); passing the layer size positionally works with either version.
pred = Dense(1, activation = 'sigmoid')(x)

model = Model(base_model.input, pred)

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.summary()

In [None]:
# Freeze the ResNet50 backbone so that only the new head is trained.
for layer in base_model.layers:
    layer.trainable = False

# Changes to `trainable` are only picked up when the model is compiled, and
# the model was compiled before this cell, so compile it again with the same
# settings to make the freeze effective.
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
# Early-stopping callback with a patience of 2 epochs; 'val_loss' is the
# quantity EarlyStopping monitors by default, spelled out here for clarity.
stopper = EarlyStopping(monitor = 'val_loss', patience = 2)

## 4. Image Augmentation

In [None]:
# Training hyper-parameters: samples per batch and number of epochs.
batch_size, epochs = 50, 10

In [None]:
# Display the training image directory path defined earlier.
path

In [None]:
# Training generator: ResNet50 preprocessing plus light augmentation
# (shear, zoom, horizontal flip).
train_gen = ImageDataGenerator(preprocessing_function = preprocess_input,
                               shear_range = 0.2,
                               zoom_range = 0.2,
                               horizontal_flip = True)
# Use the shared `batch_size` instead of a hard-coded 32: steps_per_epoch in
# the fitting cell is computed from `batch_size`, so the two must agree.
# NOTE(review): flow_from_directory infers the classes from sub-directories,
# but the images are read elsewhere in this notebook as '../input/train/<n>.jpg'
# (a flat folder) - confirm the directory layout or switch to
# train_gen.flow(X_train, y_train, ...).
train_generator = train_gen.flow_from_directory('../input/train',
                                                target_size = (im_size, im_size),
                                                batch_size = batch_size,
                                                class_mode ='binary')

In [None]:
# Validation generator: ResNet50 preprocessing only, no augmentation.
val_gen = ImageDataGenerator(preprocessing_function = preprocess_input)
# Pass the shared `batch_size` explicitly: validation_steps in the fitting
# cell is computed from `batch_size`, while the generator default is 32.
# NOTE(review): this reads the same directory as the training generator, so
# the validation metrics are computed on training images - confirm whether a
# separate hold-out (e.g. X_val / y_val) should be used instead.
val_generator = val_gen.flow_from_directory('../input/train',
                                            target_size = (im_size, im_size),
                                            batch_size = batch_size,
                                            class_mode = 'binary')

In [None]:
# Fitting
# Train from the generators; the number of steps per epoch is derived from
# the train / validation split sizes and the batch size.
# The early-stopping callback created above was never handed to training;
# pass it in so training actually stops when validation loss stalls.
history = model.fit_generator(train_generator,
                              steps_per_epoch= X_train.shape[0] // batch_size,
                              epochs = epochs,
                              validation_data = val_generator,
                              validation_steps = X_val.shape[0] // batch_size,
                              callbacks = [stopper])

## 5. Evaluation

In [None]:
# Learning curves for the ResNet50 model: loss on the top panel, accuracy on
# the bottom panel, training in blue and validation in magenta.
curves = history.history
fig, panels = plt.subplots(nrows = 2, ncols = 1)
ax1, ax2 = panels

ax1.plot(curves['loss'], color = 'b', label = 'Train Loss')
ax1.plot(curves['val_loss'], color = 'm', label = 'Valid Loss')
ax1.legend(loc = 'best')

ax2.plot(curves['acc'], color = 'b', label = 'Train Accuracy')
ax2.plot(curves['val_acc'], color = 'm', label = 'Valid Accuracy')
ax2.legend(loc = 'best')

## 6. Submission

In [None]:
# Load the sample submission file and preview its first rows.
submission_template = "../input/sample_submission.csv"
sample = pd.read_csv(submission_template)
sample.head()

In [None]:
# Build the list of test image names and their file paths.
img_path = "../input/test/"

# The sample submission was loaded into `sample` in the previous cell;
# `sample_submission` is not defined until the prediction cell, so referring
# to it here raised a NameError.
# The first column holds the image names. Reading the column directly
# replaces the row-by-row `iloc[i][0]` lookup.
test_names = np.array(sample.iloc[:, 0])

# One '<img_path><name>.jpg' path per test image, in submission order.
file_paths = [img_path + str(int(name)) + '.jpg' for name in test_names]

In [None]:
# Load the test images listed in `file_paths` into one float32 batch.
# `test_images` was never created before being converted, so this cell raised
# a NameError and `file_paths` was never used.
# preserve_range = True keeps the original pixel range when resizing.
test_images = np.array(
    [resize(plt.imread(file_path), (im_size, im_size, 3), preserve_range = True)
     for file_path in file_paths],
    dtype = 'float32',
)

# Apply the same preprocessing function the training / validation generators
# use (preprocess_input) instead of dividing by 255, so the model sees test
# data prepared the same way as the data it was trained on.
# NOTE(review): assumes plt.imread yields 0-255 RGB data for these JPEGs - confirm.
test_images = preprocess_input(test_images)

In [None]:
# Score the test batch, then copy each prediction into the row of the
# submission table whose 'name' matches the corresponding test image.
predictions = model.predict(test_images)

sample_submission = pd.read_csv("../input/sample_submission.csv")

for idx in range(len(test_names)):
    row_mask = sample_submission['name'] == test_names[idx]
    sample_submission.loc[row_mask, 'invasive'] = predictions[idx]

# Write the filled-in table without the pandas index column.
sample_submission.to_csv("submit.csv", index=False)