In [2]:
import os
import socket
from collections import Counter
from urllib.parse import urlsplit
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
from PIL import Image
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm

In [3]:
# Location of the tab-separated dataset and the number of rows to draw from it.
DATASET = 'https://raw.githubusercontent.com/rubenros1795/iconicity/master/data/dataset/WarRoom-sift-corrected.tsv'
SAMPLE_SIZE = 5000

# Global socket timeout (seconds) so a single unresponsive host cannot stall the downloads.
socket.setdefaulttimeout(10)

df = pd.read_csv(DATASET, sep='\t')
# Fixed random_state keeps the sample identical between runs.
sample = df.sample(n=SAMPLE_SIZE, random_state=42)

In [124]:
# Download every sampled image to images/<row index><extension>.
os.makedirs('images', exist_ok=True)

# Extension used when the URL path carries none, so that every saved file is
# still named <index>.<ext>.
DEFAULT_EXTENSION = '.jpg'

failed_downloads = []  # (row index, url, error) for every image that could not be fetched
for row in tqdm(sample.itertuples(), total=len(sample)):
    # Prefer the full-resolution URL; 'na' marks a missing value in this dataset.
    url = row.image_url_full if row.image_url_full != 'na' else row.image_url_partial
    try:
        # Take the extension from the URL *path* only, so query strings and
        # dots in the host name cannot end up in the file name.
        file_extension = os.path.splitext(urlsplit(url).path)[1] or DEFAULT_EXTENSION
        urlretrieve(url, "images/{}{}".format(row.Index, file_extension))
    except Exception as error:
        # Best effort: dead links and malformed URLs are expected, so record
        # them and move on. Unlike a bare `except:`, this does not swallow
        # KeyboardInterrupt, so the loop can still be stopped by hand.
        failed_downloads.append((row.Index, url, error))

print('{} of {} downloads failed'.format(len(failed_downloads), len(sample)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=600.0), HTML(value='')))




In [55]:
import tensorflow as tf
from tensorflow.keras.preprocessing import image_dataset_from_directory

In [125]:
IMAGE_SIZE = (224, 224)

# Arguments shared by the three splits.
common_args = dict(image_size=IMAGE_SIZE, label_mode='categorical')

# NOTE(review): the directories below are absolute, machine-specific paths.

# Training split: shuffled, batches of 64.
train_ds = image_dataset_from_directory(
    '/Users/jaspervogelzang/Documents/ADS Master/Data Mining/Lab Sessions/images/train',
    shuffle=True,
    batch_size=64,
    **common_args,
)

# Test split: fixed order, batches of 16.
test_ds = image_dataset_from_directory(
    '/Users/jaspervogelzang/Documents/ADS Master/Data Mining/Lab Sessions/images/test',
    shuffle=False,
    batch_size=16,
    **common_args,
)

# Validation split: fixed order, batches of 16.
val_ds = image_dataset_from_directory(
    '/Users/jaspervogelzang/Documents/ADS Master/Data Mining/Lab Sessions/images/val',
    shuffle=False,
    batch_size=16,
    **common_args,
)

Found 303 files belonging to 3 classes.
Found 45 files belonging to 3 classes.
Found 42 files belonging to 3 classes.


In [1]:
# Class names reported by the training dataset, and how many there are.
labels = train_ds.class_names
n_classes = len(labels)

print(labels)

NameError: name 'train_ds' is not defined

Because image_dataset_from_directory returns a generator where the images are pre-processed when we train the model, you can improve the speed of your model by already preprocessing the next batch while training the current batch. You can do this using tf.data.experimental.AUTOTUNE:

In [127]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Enable prefetching on every split, letting tf.data choose the buffer size.
train_ds, val_ds, test_ds = [
    split.prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
]

In [128]:
from tensorflow.keras.applications.resnet import preprocess_input

# Input shape of the network: image height and width plus three colour channels.
IMG_SHAPE = (*IMAGE_SIZE, 3)

# ResNet50 with ImageNet weights and without its classification head.
base_model = tf.keras.applications.ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=IMG_SHAPE,
)
# Freeze the backbone so its weights are not updated during training.
base_model.trainable = False

We set base_model.trainable = False to disable training the ResNet50 model. We only use the ResNet50 model to get our features, and build a neural network on top of those features:

In [78]:
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

# Preprocessing -> frozen ResNet50 features -> small dense classifier.
model = tf.keras.Sequential([
    tf.keras.layers.Lambda(preprocess_input, name='preprocessing', input_shape=IMG_SHAPE),
    base_model,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    # One output unit per class; softmax turns the outputs into probabilities.
    tf.keras.layers.Dense(n_classes, activation='softmax'),
])

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
preprocessing (Lambda)       (None, 224, 224, 3)       0         
_________________________________________________________________
resnet50 (Functional)        (None, 7, 7, 2048)        23587712  
_________________________________________________________________
flatten_4 (Flatten)          (None, 100352)            0         
_________________________________________________________________
dense_8 (Dense)              (None, 1024)              102761472 
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 3)                 3075      
Total params: 126,352,259
Trainable params: 102,764,547
Non-trainable params: 23,587,712
_______________________________

# Compile the model
Compiling the model is rather similar to BERT. In this step, we can specify the optimizer (how the parameters are optimized), the loss (how we calculate the error) and the learning rate (how fast the model learns). We’ll just go with the vanilla settings. Furthermore, we also specify the EarlyStopping: we evaluate the model each epoch (one iteration through the training set), and when it doesn’t improve its performance on the validation set two times in a row (patience=2), we’ll stop the training early, and return the best model.

In [79]:
# Stop training once the validation loss has not improved for `patience`
# consecutive epochs, and roll back to the weights of the best epoch.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=2,
        verbose=0,
        mode='min',
        baseline=None,
        restore_best_weights=True,
    )
]

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Categorical cross-entropy matches the one-hot labels (label_mode='categorical').
loss = tf.keras.losses.CategoricalCrossentropy()

# `metrics` is documented as a list; a bare string is only tolerated by some
# Keras versions, so pass a list.
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model

In [83]:
# Train on the training split, validating after every epoch; the callbacks
# may end the run before all three epochs are done.
history = model.fit(train_ds, validation_data=val_ds, epochs=3, callbacks=callbacks)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Evaluate the performance

In [129]:
# Predicted class index per test image.
predictions = model.predict(test_ds, verbose=1)
y_pred = np.argmax(predictions, axis=1)

# True class index per test image, recovered from the one-hot label batches.
# The loop variable is deliberately not called `labels`, which holds the class names.
y_true = np.argmax(
    np.concatenate([batch_labels.numpy() for _, batch_labels in test_ds]),
    axis=1,
)

# Passing `labels` explicitly keeps target_names aligned with the class
# indices even when a class is absent from both y_true and y_pred.
print(classification_report(
    y_true,
    y_pred,
    labels=list(range(len(labels))),
    target_names=labels,
))

              precision    recall  f1-score   support

        meme       1.00      0.57      0.73         7
    original       1.00      1.00      1.00        23
       other       0.83      1.00      0.91        15

    accuracy                           0.93        45
   macro avg       0.94      0.86      0.88        45
weighted avg       0.94      0.93      0.93        45



# Predicting multiple images

In [130]:
def process_image(image_path, single_batch=False):
    """Load an image from disk and convert it to an array.

    Parameters
    ----------
    image_path : path of the image file to load.
    single_batch : when True, a leading axis is added so the result can be
        used as a batch containing this single image.

    Returns
    -------
    The image resized to IMAGE_SIZE as an array, or a tensor with an extra
    leading axis when `single_batch` is True.
    """
    loaded = keras.preprocessing.image.load_img(image_path, target_size=IMAGE_SIZE)
    pixels = keras.preprocessing.image.img_to_array(loaded)
    if not single_batch:
        return pixels
    return tf.expand_dims(pixels, 0)

IMAGE_SIZE = (224, 224)
# NOTE(review): absolute, machine-specific path.
directory2predict = '/Users/jaspervogelzang/Documents/ADS Master/Data Mining/Lab Sessions/unlabeled'
processed_images = []
image_paths = []    # kept in the same order as processed_images
skipped_paths = []  # entries that could not be loaded as an image

# The context manager closes the directory iterator when the loop is done.
with os.scandir(directory2predict) as entries:
    for image_path in entries:
        try:
            processed_image = process_image(image_path.path)
        except Exception:
            # Best effort: skip anything that cannot be loaded as an image,
            # without swallowing KeyboardInterrupt as a bare `except:` would.
            skipped_paths.append(image_path.path)
            continue
        image_paths.append(image_path.path)
        processed_images.append(processed_image)

print('loaded {} images, skipped {} entries'.format(len(image_paths), len(skipped_paths)))



In [131]:
# Stack the per-image arrays into one array and feed it to the model in batches of 64.
processed_images = np.asarray(processed_images)
unbatched = tf.data.Dataset.from_tensor_slices(processed_images)
data = unbatched.batch(64)
probabilities = model.predict(data, verbose=1)



In [132]:
# Most probable class index per image, translated to its class name.
image_classes = probabilities.argmax(axis=1)
image_labels = [labels[class_index] for class_index in image_classes]

# Peek at the first ten predicted labels.
image_labels[:10]

['other',
 'original',
 'original',
 'other',
 'other',
 'original',
 'original',
 'original',
 'original',
 'original']

In [133]:
# Number of images per predicted label, most frequent first.
label_counts = Counter(image_labels)
label_counts.most_common()

[('original', 2064), ('other', 1060), ('meme', 185)]

As we also saved the paths of the images that we predicted, and these paths are in the exact same order as the image labels, we can get the ID (the index) of an image from its path and build a dictionary in which each ID is mapped to the corresponding label. Then we loop over all the indices of the sample: if the index (ID) is in the dictionary we take its label, and otherwise we add None to our dataframe.

In [158]:
def get_id(x):
    """Retrieve the numeric ID from an image path, e.g. 'images/145.jpg' --> 145.

    Only the file name is inspected, so dots in directory names do not
    interfere, and a file name without any extension is handled as well.

    Parameters
    ----------
    x : path of an image whose file name starts with the integer ID.

    Returns
    -------
    The ID as an int.

    Raises
    ------
    ValueError
        If the part of the file name before the first dot is not an integer.
    """
    file_name = os.path.basename(x)
    return int(file_name.split('.', 1)[0])
indices = [get_id(path) for path in image_paths]  # 'images/145.jpg' --> 145
indx2label = dict(zip(indices, image_labels))  # make dictionary mapping the indices to label

# Rows without a predicted image get None. The per-row list is assigned
# directly instead of being stored in `labels`, which holds the class names
# and is still needed when the prediction cells are re-run.
sample['label'] = [indx2label.get(index) for index in sample.index]

In [162]:
# Display the sample, which now carries the predicted `label` column.
sample

Unnamed: 0,filename,photo,page_url,image_url_full,image_url_partial,page_title,iteration,language,labels,scrape_date,correction,label,base_url
15004,D:/react-data/iconic\WarRoom\WarRoom_6\5ada833...,WarRoom,http://www.akdart.com/smoke.html,http://www.akdart.com/images/zebest.png,na,Smoke and mirrors : An atmosphere of disingenu...,6,en=0.9999999989520127,situation room bin laden,2020-06-16 09:34:17,True,original,www.akdart.com
3043,D:/react-data/iconic\WarRoom\WarRoom_2\b6f3916...,WarRoom,https://www.vox.com/2017/4/7/15220544/trump-sy...,na,https://cdn.vox-cdn.com/thumbor/9MwJlD5uYBgeLZ...,Trump brought his economics team to his Syria ...,2,en=0.9999999997154794,osama bin laden raid,2020-06-15 09:21:22,True,original,www.vox.com
5456,D:/react-data/iconic\WarRoom\WarRoom_3\2667e32...,WarRoom,https://nofilmschool.com/tags/interfaces?type=...,https://nofilmschool.com/sites/default/files/s...,na,interfaces | No Film School,3,en=0.16946150595865334,may 2 2011,2020-06-15 14:42:01,True,other,nofilmschool.com
2405,D:/react-data/iconic\WarRoom\WarRoom_2\b44718c...,WarRoom,https://froggybottomblog.com/2014/01/19/decisi...,https://froggybottomblog.files.wordpress.com/2...,na,Décision et politique étrangère à Washington s...,2,fr=1.0,obama raid,2020-06-15 09:18:40,True,original,froggybottomblog.com
13172,D:/react-data/iconic\WarRoom\WarRoom_5\80a5ec9...,WarRoom,https://www.wadaninews24.com/2018/08/14/waxyaa...,https://www.wadaninews24.com/wp-content/upload...,na,Waxyaabaha reebban iyo kuwa banaan ee Qolka Si...,5,sw=0.9956551914793851,obama bin laden death,2020-06-15 18:00:56,True,original,www.wadaninews24.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12834,D:/react-data/iconic\WarRoom\WarRoom_5\6cc1e31...,WarRoom,https://twitter.com/c3educator/media,na,https://pbs.twimg.com/media/EKAQVtbX0Ash9xb.jpg,Media Tweets by Rob Vicario (@c3educator) | Tw...,5,en=0.9999736053279904,situation room osama bin laden,2020-06-15 17:50:03,True,other,twitter.com
14481,D:/react-data/iconic\WarRoom\WarRoom_5\c8c11a5...,WarRoom,https://wasiq1.wordpress.com/2018/05/,na,https://wasiq1.files.wordpress.com/2018/05/228...,<b>May</b> | 2018 | Wasiq1&#39;s Backup Blog,5,fr=0.6400716286587046,may 2 2011,2020-06-15 18:42:32,na,original,wasiq1.wordpress.com
4220,D:/react-data/iconic\WarRoom\WarRoom_3\13549cf...,WarRoom,https://www.cnn.com/2015/05/12/politics/leon-p...,na,https://cdn.cnn.com/cnnnext/dam/assets/1305010...,Leon Panetta denies Seymour Hersh on <b>bin La...,3,it=0.9590388642489416,bin laden situation room,2020-06-15 14:27:02,True,original,www.cnn.com
6245,D:/react-data/iconic\WarRoom\WarRoom_3\3a88c01...,WarRoom,https://www.ne.ch/autorites/DJSC/SCNE/encourag...,x-raw-image:///30c1e9444e9f89c16c50776fb966ac3...,na,Biographie www.lucaforcucci.com - Canton de Ne...,3,ro=0.9612665881690458,situation room obama,2020-06-15 14:58:10,na,,www.ne.ch


In [163]:
#Extract the netloc of the url by using urlsplit
from urllib.parse import urlsplit

# Network location (host) of every page URL, in row order.
base_url = [urlsplit(page).netloc for page in sample.page_url]

sample['base_url'] = base_url
sample

Unnamed: 0,filename,photo,page_url,image_url_full,image_url_partial,page_title,iteration,language,labels,scrape_date,correction,label,base_url
15004,D:/react-data/iconic\WarRoom\WarRoom_6\5ada833...,WarRoom,http://www.akdart.com/smoke.html,http://www.akdart.com/images/zebest.png,na,Smoke and mirrors : An atmosphere of disingenu...,6,en=0.9999999989520127,situation room bin laden,2020-06-16 09:34:17,True,original,www.akdart.com
3043,D:/react-data/iconic\WarRoom\WarRoom_2\b6f3916...,WarRoom,https://www.vox.com/2017/4/7/15220544/trump-sy...,na,https://cdn.vox-cdn.com/thumbor/9MwJlD5uYBgeLZ...,Trump brought his economics team to his Syria ...,2,en=0.9999999997154794,osama bin laden raid,2020-06-15 09:21:22,True,original,www.vox.com
5456,D:/react-data/iconic\WarRoom\WarRoom_3\2667e32...,WarRoom,https://nofilmschool.com/tags/interfaces?type=...,https://nofilmschool.com/sites/default/files/s...,na,interfaces | No Film School,3,en=0.16946150595865334,may 2 2011,2020-06-15 14:42:01,True,other,nofilmschool.com
2405,D:/react-data/iconic\WarRoom\WarRoom_2\b44718c...,WarRoom,https://froggybottomblog.com/2014/01/19/decisi...,https://froggybottomblog.files.wordpress.com/2...,na,Décision et politique étrangère à Washington s...,2,fr=1.0,obama raid,2020-06-15 09:18:40,True,original,froggybottomblog.com
13172,D:/react-data/iconic\WarRoom\WarRoom_5\80a5ec9...,WarRoom,https://www.wadaninews24.com/2018/08/14/waxyaa...,https://www.wadaninews24.com/wp-content/upload...,na,Waxyaabaha reebban iyo kuwa banaan ee Qolka Si...,5,sw=0.9956551914793851,obama bin laden death,2020-06-15 18:00:56,True,original,www.wadaninews24.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12834,D:/react-data/iconic\WarRoom\WarRoom_5\6cc1e31...,WarRoom,https://twitter.com/c3educator/media,na,https://pbs.twimg.com/media/EKAQVtbX0Ash9xb.jpg,Media Tweets by Rob Vicario (@c3educator) | Tw...,5,en=0.9999736053279904,situation room osama bin laden,2020-06-15 17:50:03,True,other,twitter.com
14481,D:/react-data/iconic\WarRoom\WarRoom_5\c8c11a5...,WarRoom,https://wasiq1.wordpress.com/2018/05/,na,https://wasiq1.files.wordpress.com/2018/05/228...,<b>May</b> | 2018 | Wasiq1&#39;s Backup Blog,5,fr=0.6400716286587046,may 2 2011,2020-06-15 18:42:32,na,original,wasiq1.wordpress.com
4220,D:/react-data/iconic\WarRoom\WarRoom_3\13549cf...,WarRoom,https://www.cnn.com/2015/05/12/politics/leon-p...,na,https://cdn.cnn.com/cnnnext/dam/assets/1305010...,Leon Panetta denies Seymour Hersh on <b>bin La...,3,it=0.9590388642489416,bin laden situation room,2020-06-15 14:27:02,True,original,www.cnn.com
6245,D:/react-data/iconic\WarRoom\WarRoom_3\3a88c01...,WarRoom,https://www.ne.ch/autorites/DJSC/SCNE/encourag...,x-raw-image:///30c1e9444e9f89c16c50776fb966ac3...,na,Biographie www.lucaforcucci.com - Canton de Ne...,3,ro=0.9612665881690458,situation room obama,2020-06-15 14:58:10,na,,www.ne.ch


In [166]:
# Ten most frequent websites among the images labelled 'meme'
Counter(sample.loc[sample['label'] == 'meme', 'base_url']).most_common(10)

[('me.me', 10),
 ('twitter.com', 9),
 ('www.pinterest.com', 7),
 ('knowyourmeme.com', 6),
 ('astrologymemes.com', 5),
 ('imgur.com', 5),
 ('www.amazon.com', 3),
 ('awwmemes.com', 3),
 ('www.psychologytoday.com', 3),
 ('esmemes.com', 3)]

In [167]:
# Ten most frequent websites among the images labelled 'original'
Counter(sample.loc[sample['label'] == 'original', 'base_url']).most_common(10)

[('twitter.com', 85),
 ('knowyourmeme.com', 26),
 ('www.pinterest.com', 26),
 ('www.cnn.com', 23),
 ('www.dailymail.co.uk', 19),
 ('www.youtube.com', 18),
 ('www.bbc.com', 18),
 ('fineartamerica.com', 16),
 ('www.nbcnews.com', 16),
 ('www.cbsnews.com', 15)]

In [168]:
# Ten most frequent websites among the images labelled 'other'
Counter(sample.loc[sample['label'] == 'other', 'base_url']).most_common(10)

[('twitter.com', 36),
 ('www.pinterest.com', 28),
 ('fineartamerica.com', 25),
 ('www.youtube.com', 23),
 ('me.me', 14),
 ('knowyourmeme.com', 12),
 ('www.cnn.com', 10),
 ('www.huffpost.com', 9),
 ('awwmemes.com', 9),
 ('www.slideshare.net', 8)]

In [177]:
# Count the predicted labels per language.
# The language column looks like 'en=0.99'; keep the first two characters.
# `.str[:2]` leaves missing values as NaN instead of raising.
sample['language_'] = sample['language'].str[:2]

# Sum the label counts while the frame holds only numeric columns, i.e.
# before reset_index() adds the string column `language_`; summing over a
# string column raises a TypeError on recent pandas versions.
language_type_crosstab = pd.crosstab(sample.language_, sample.label)
language_type_crosstab['total'] = language_type_crosstab.sum(axis=1)
language_type_crosstab = language_type_crosstab.reset_index().dropna()

In [178]:
#Normalize the table
labels = ['meme', 'original', 'other']

# Work on a copy so the raw counts survive and re-running this cell does not
# divide already-normalised shares by the total a second time.
language_type_share = language_type_crosstab.copy()
language_type_share[labels] = language_type_crosstab[labels].div(
    language_type_crosstab['total'], axis=0
)

# Only show languages with more than 50 labelled images, highest meme share first.
language_type_share[language_type_share.total > 50].sort_values('meme', ascending=False)

label,language_,meme,original,other,total
13,en,0.072464,0.59025,0.337286,1518
36,ko,0.06383,0.691489,0.244681,94
15,es,0.046729,0.728972,0.224299,214
11,de,0.037433,0.727273,0.235294,187
20,fr,0.031646,0.664557,0.303797,158
77,zh,0.024272,0.572816,0.402913,206
18,fa,0.016949,0.423729,0.559322,59
31,it,0.015625,0.71875,0.265625,64
1,ar,0.0,0.545455,0.454545,55
73,vi,0.0,0.764706,0.235294,68


# Labeling by date

NOTE: I DO NOT expect you to perform this analysis yourself – this is only for students interested in how you could do such an analysis
Because the dataset doesn't contain the date the website was published (only the date the website was scraped), we have to get this data ourselves. We can do this using the Python package htmldate, which returns, given a url, the date the website was published. This could be sped up with multiprocessing; the plain loop used below is rather slow.

In [4]:
from htmldate import find_date

page_url = sample.page_url
# The sample keeps the row labels of the full dataset, so `page_url[0]` would
# look up the *label* 0 and raise a KeyError unless that row was sampled.
# `.iloc[0]` always takes the first row.
find_date(page_url.iloc[0])

'2011-01-01'

In [None]:
# Look up the date for every page URL, in row order.
url_date = [find_date(url) for url in sample.page_url]

sample['url_date'] = url_date
sample

In [None]:
# Keep only rows that have both a predicted label and a date. The dates are
# stored in `url_date`; the sample has no `date` column.
# `.copy()` makes df_dates an independent frame, so adding a column does not
# trigger a SettingWithCopyWarning.
df_dates = sample[sample.label.notna() & sample.url_date.notna()].copy()
# The first four characters of the date string are taken as the year.
df_dates['year'] = df_dates['url_date'].str[:4]

In [None]:
# Count labels per year, then keep only 2011 (the year the photo was published) and later.
# Years are strings here, so the comparison is a string comparison.
year_by_label = pd.crosstab(df_dates.year, df_dates.label).reset_index()
cross_tabulated = year_by_label[year_by_label['year'] > '2010']

In [None]:
import matplotlib.pyplot as plt

plt.style.use('ggplot')

# Stacked bar chart: one bar per year, one segment per label.
fig, ax = plt.subplots()

# Running height of each bar, so every segment starts on top of the previous one.
bottom = np.zeros(len(cross_tabulated))
for category in ['original', 'meme', 'other']:
    counts = cross_tabulated[category].to_numpy()
    ax.bar(cross_tabulated.year, counts, bottom=bottom, label=category)
    bottom = bottom + counts

ax.set(
    xlabel='year',
    ylabel='number of images',
    title='Predicted labels per year',
)
ax.legend();