# Scraping

## Functions

In [None]:
from bs4 import BeautifulSoup
import requests
from PIL import Image
import os
import uuid

PATH_TO_SAVE = "data/raw/Rembrandt"


In [None]:
def check_dir(dir: str) -> None:
    """Ensure that the directory ``dir`` exists, creating it if necessary.

    Missing parent directories are created as well.  ``exist_ok=True`` makes
    the call a no-op for an existing directory and removes the window between
    an ``isdir`` check and the ``makedirs`` call in which another process
    could create the directory and make ``makedirs`` fail.

    Args:
        dir: Path of the directory to create.  The name shadows the builtin
            ``dir`` but is kept so that keyword callers keep working.

    Raises:
        FileExistsError: If ``dir`` exists but is not a directory.
    """
    os.makedirs(dir, exist_ok=True)

def download_image(pic_url: str, saveName: str, timeout: float = 30) -> None:
    """Download ``pic_url`` and store the response body at ``saveName``.

    The request is sent and its status checked *before* the output file is
    opened, so a failed request no longer leaves behind an empty file or an
    HTML error page stored under an image name.

    Args:
        pic_url: URL of the image to fetch.
        saveName: Path of the file to write; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If ``saveName`` cannot be opened for writing.
    """
    # The context manager releases the streamed connection even on errors.
    with requests.get(pic_url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(saveName, 'wb') as handle:
            for block in response.iter_content(1024):
                handle.write(block)

def download_image_from_site_default(uri: str):
    """Download the picture shown in the ``workimage`` element of page ``uri``.

    The ``src`` of the first ``<img>`` inside the element with id
    ``workimage`` is appended to the directory part of ``uri`` to form the
    image URL.  The file is stored in ``PATH_TO_SAVE`` under the last path
    segment of that URL; ``download_image`` opens the target with ``'wb'``,
    so a file of the same name is overwritten.

    A page without that element or without an image inside it makes the
    lookup below raise (``AttributeError`` / ``TypeError``).
    """
    page = BeautifulSoup(requests.get(uri).text, "html.parser")
    relative_src = page.find(id="workimage").find("img")['src']

    page_dir = uri[:uri.rfind("/")]
    pic_url = "/".join((page_dir, relative_src))

    # The slice keeps the leading "/" so it can be glued onto the folder.
    file_part = pic_url[pic_url.rfind("/"):]
    download_image(pic_url, PATH_TO_SAVE + file_part)

def download_image_from_site_try_2(uri: str):
    """Fallback downloader: fetch the first ``images/...`` picture on ``uri``.

    Looks at the ``<img>`` tags of the page in document order and takes the
    first one whose ``src`` starts with ``images/``.  The image URL is that
    ``src`` appended to the directory part of ``uri``; the file is stored in
    ``PATH_TO_SAVE`` under the last path segment of the URL.

    The tags are inspected through the parser instead of searching the
    stringified tag list: with the string search a page without a matching
    image yielded index -1 and a garbage URL was requested.

    Raises:
        ValueError: If no ``<img>`` with a ``src`` under ``images/`` exists.
    """
    r = requests.get(uri)
    s = BeautifulSoup(r.text, "html.parser")

    src = next(
        (
            img["src"]
            for img in s.find_all("img", src=True)
            if img["src"].startswith("images/")
        ),
        None,
    )
    if src is None:
        raise ValueError(f"no <img> with a src under 'images/' found on {uri}")

    pic_url = uri[:uri.rfind("/")] + "/" + src
    saveName = PATH_TO_SAVE + pic_url[pic_url.rfind("/"):]
    download_image(pic_url, saveName)

In [None]:
def get_menu_items(uri: str, column: int = 0):
    """Collect the link targets found in one column of the link table.

    Fetches ``uri``, locates the first ``<table class="tablelinks">`` and,
    for every row, takes the ``href`` of the first linked element inside the
    cell at index ``column``.

    Rows that lack that cell, and cells without a non-empty ``href``, are
    skipped.  (The earlier string slicing compared the link with the integer
    ``0``, which is always unequal to a string, so such cells contributed
    garbage entries.)

    Args:
        uri: Address of the page to scan.
        column: Index of the ``<td>`` to read in each row.

    Returns:
        A list of ``href`` values in row order, or ``None`` when the page
        contains no table with class ``tablelinks``.
    """
    r = requests.get(uri)
    s = BeautifulSoup(r.text, "html.parser")
    # Looking for the table with the class 'tablelinks'
    table = s.find('table', class_='tablelinks')
    if table is None:
        return None

    links = []
    for row in table.find_all('tr'):
        columns = row.find_all('td')
        try:
            cell = columns[column]
        except IndexError:
            # Row without the requested cell (e.g. a header row).
            continue
        linked = cell.find(href=True)
        if linked is not None and linked['href']:
            links.append(linked['href'])
    return links

def get_table(uri: str, column: int = 1):
    """Collect the link targets found in one column of the link table.

    Fetches ``uri``, locates the first ``<table class="tablelinks">`` and,
    for every row, takes the ``href`` of the first linked element inside the
    cell at index ``column`` (the second cell by default).

    Rows that lack that cell, and cells without a non-empty ``href``, are
    skipped.  (The earlier string slicing compared the link with the integer
    ``0``, which is always unequal to a string, so such cells contributed
    garbage entries, and a row with fewer cells raised ``IndexError``.)

    Args:
        uri: Address of the page to scan.
        column: Index of the ``<td>`` to read in each row.

    Returns:
        A list of ``href`` values in row order, or ``None`` when the page
        contains no table with class ``tablelinks``.
    """
    r = requests.get(uri)
    s = BeautifulSoup(r.text, "html.parser")

    # Looking for the table with the class 'tablelinks'
    table = s.find('table', class_='tablelinks')
    if table is None:
        return None

    links = []
    for row in table.find_all('tr'):
        columns = row.find_all('td')
        try:
            cell = columns[column]
        except IndexError:
            # Row without the requested cell (e.g. a header row).
            continue
        linked = cell.find(href=True)
        if linked is not None and linked['href']:
            links.append(linked['href'])

    return links


## Get images

In [None]:
# Site root, and the catalogue index page below it where the scrape starts.
base_url = "http://www.rembrandtpainting.net/"
start_uri = base_url + "complete_catalogue/complete_catalogue.htm"

In [None]:
# Collect the first-column links of the index page's 'tablelinks' table; each
# one is later appended to the catalogue base URL and scanned for paintings.
menu_items = get_menu_items(start_uri)
menu_items  # bare expression: shown as the notebook cell's output

In [None]:
# Walk every catalogue page and download the picture of each linked detail page.
new_base = "http://www.rembrandtpainting.net/complete_catalogue/"
check_dir(PATH_TO_SAVE)  # the target folder only needs to be created once

# get_menu_items / get_table return None when a page has no link table;
# treat that as "nothing to do" instead of failing on the iteration.
for page in menu_items or []:
    uri = new_base + page

    table = get_table(uri) or []
    done = set()  # detail pages already handled for this catalogue page
    for i in table:
        detail_uri = new_base + i
        if detail_uri in done:
            continue
        done.add(detail_uri)
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel
        # interrupts working during the long-running scrape.
        try:
            download_image_from_site_default(detail_uri)
        except Exception:
            # Different page layout: fall back to the "images/" lookup.
            try:
                download_image_from_site_try_2(detail_uri)
            except Exception:
                print(f"error at: {detail_uri}")


In [None]:
# Delete every downloaded file that Pillow cannot verify as an image.
for filename in os.listdir(PATH_TO_SAVE):
    file_path = os.path.join(PATH_TO_SAVE, filename)
    try:
        # The context manager closes the file even when verify() raises, so
        # the handle is released before the file is removed below.
        with Image.open(file_path) as img:  # open the image file
            img.verify()  # verify that it is, in fact an image
    except (IOError, SyntaxError):
        print('Bad file:', filename)  # print out the names of corrupt files
        os.remove(file_path)

In [None]:
# Report how many entries are left in the download folder
# ("aantal" is Dutch for "number of").
print(f"aantal images: {len(os.listdir(PATH_TO_SAVE))}")

# Model testing

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Activation
from tensorflow.python.keras.utils import np_utils
from tensorflow.python.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers

In [None]:
# Evaluation settings: batch size, target image size and data root folder.
batch_size = 64
image_shape = (180, 180)
PREPROCESSING_FOLDER = "data/preprocessed"

# Test split, read from the "test" sub-folder; label_mode='categorical'
# pairs with the categorical_crossentropy loss the model is compiled with.
test_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    f"{PREPROCESSING_FOLDER}/test",
    image_size=image_shape,
    batch_size=batch_size,
    label_mode='categorical',
)

In [None]:
# Rebuild the network architecture so that saved weights can be loaded into
# it. The layer structure is left untouched on purpose.
# NOTE(review): it presumably has to mirror the training notebook — confirm.

# 180x180 inputs with 3 channels (presumably RGB) and 3 softmax outputs
# (presumably one per painter class — confirm against the dataset folders).
input_shape = (180, 180, 3)
output_shape = 3

# Random horizontal flip, rotation (factor 0.1) and zoom (factor 0.2).
data_augmentation = keras.Sequential([
            layers.RandomFlip("horizontal"),
            layers.RandomRotation(0.1),
            layers.RandomZoom(0.2),
            ])
        
# VGG19 with ImageNet weights and without its classification top.
conv_base = keras.applications.vgg19.VGG19(
            weights="imagenet",
            include_top=False
            )

# Freeze everything except the last two layers of the VGG19 base.
conv_base.trainable = True
for layer in conv_base.layers[:-2]:
    layer.trainable = False


# Pipeline: augmentation -> VGG19 preprocessing -> VGG19 base -> dense head.
inputs = keras.Input(shape=input_shape)
x = inputs
x = data_augmentation(x) 

x = keras.applications.vgg19.preprocess_input(x)
x = conv_base(x)

# Classification head: flatten, Dense(256, relu), 50% dropout, softmax output.
x = layers.Flatten()(x)
x = layers.Dense(256, activation="relu")(x)
x = layers.Dropout(0.5)(x)
# The output layer applies softmax itself, so the model returns probabilities.
outputs = layers.Dense(output_shape, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss="categorical_crossentropy",optimizer="rmsprop",metrics=["accuracy"])

In [None]:
# Load the saved weights from "painter_baseline.keras" into the compiled model.
model.load_weights("painter_baseline.keras")

In [None]:
# Evaluate on the test dataset; the model was compiled with the
# categorical_crossentropy loss and the "accuracy" metric.
model.evaluate(test_dataset)

In [None]:
from PIL import Image
import numpy as np

# Path of the picture inside static/preprocessed, with its extension dropped.
filename = "zeeuws_meisje_piet_mondriaan.jpg"
filename = os.path.splitext(f'static/preprocessed/{filename}')[0]

# Re-create the test dataset (same arguments as for the evaluation).
test_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    f"{PREPROCESSING_FOLDER}/test",
    image_size=image_shape,
    batch_size=batch_size,
    label_mode='categorical',
)

# Pixels of the ".png" file as stored (no resizing), with a leading batch axis.
img = Image.open(filename + ".png")
numpydata = np.expand_dims(np.asarray(img), axis=0)

In [None]:
# Load the ".png" file resized to 180x180 and add a leading batch axis.
image_path = filename + ".png"
img = tf.keras.utils.load_img(image_path, target_size=(180, 180))
img_array = tf.expand_dims(tf.keras.utils.img_to_array(img), 0)

In [None]:
# Predict on `img_array`, which was loaded with target_size=(180, 180) to fit
# the model input; `numpydata` was built without any resizing.
predictions = model.predict(img_array)

In [None]:
# Show the class names exposed by the test dataset (notebook cell output).
test_dataset.class_names

In [None]:
# The model's output layer already applies softmax, so predictions[0] holds
# the class probabilities. Passing them through tf.nn.softmax a second time
# flattened the distribution and under-reported the confidence.
score = predictions[0]

class_names = test_dataset.class_names

print(
    "This image most likely belongs to {} with a {:.2f} percent confidence."
    .format(class_names[np.argmax(score)], 100 * np.max(score))
)

