# UCLA model without visual attributes applied to the ITU + UCLA set

This notebook runs the original UCLA network, without visual attributes, on the combination of the ITU and UCLA datasets.

In [1]:
from protestDB import cursor
import pandas as pd
from lib import analysis_utils as au
from matplotlib import pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Loading up the data

In [2]:
# Open a cursor on the protest database.
pc = cursor.ProtestCursor()

# Fetch the labelled + non-protest images of each source (UCLA first, then ITU),
# both with label_source="combined".
imgs_UCLA, imgs_ITU = (
    pc.getLabelledImagesAndNonProtest(source=source_name, label_source="combined")
    for source_name in ("UCLA", "Luca Rossi - ECB")
)

# Report how many rows each query returned.
for n_images, description in (
    (len(imgs_UCLA), " UCLA images, both protest and non protest"),
    (len(imgs_ITU), " ITU images, both protest and non protest"),
):
    print(n_images, description)


40720  UCLA images, both protest and non protest
3726  ITU images, both protest and non protest


In [3]:
# For each source, count the rows flagged protest / non-protest and the rows
# whose "luca rossi" flag is set (reported as "from ITU dataset").
for heading, frame in (("UCLA has ", imgs_UCLA), ("ITU has ", imgs_ITU)):
    n_protest = len(frame[frame["protest"] == True])
    n_non_protest = len(frame[frame["protest"] == False])
    n_from_itu = len(frame[frame["luca rossi"] == True])
    print(heading, n_protest, "protest images, ", n_non_protest,
          "non protest images and  ", n_from_itu, " are from ITU dataset")

UCLA has  11646 protest images,  29074 non protest images and   0  are from ITU dataset
ITU has  0 protest images,  3726 non protest images and   3726  are from ITU dataset


### Preprocessing

In [4]:
# Mark every ITU image that carries a (non-null) label as a protest image.
# The boolean mask is kept in `indx_ITU_protest`; `imgs_ITU` is updated in place.
indx_ITU_protest = imgs_ITU["label"].notna()
imgs_ITU.loc[indx_ITU_protest, "protest"] = True

In [5]:
# Same per-source summary as before, re-run after the ITU "protest" flags were set:
# rows flagged protest / non-protest and rows whose "luca rossi" flag is set.
for heading, frame in (("UCLA has ", imgs_UCLA), ("ITU has ", imgs_ITU)):
    n_protest = len(frame[frame["protest"] == True])
    n_non_protest = len(frame[frame["protest"] == False])
    n_from_itu = len(frame[frame["luca rossi"] == True])
    print(heading, n_protest, "protest images, ", n_non_protest,
          "non protest images and  ", n_from_itu, " are from ITU dataset")

UCLA has  11646 protest images,  29074 non protest images and   0  are from ITU dataset
ITU has  1000 protest images,  2726 non protest images and   3726  are from ITU dataset


In [6]:
# Stack the UCLA and ITU frames into one dataset (UCLA rows first, original
# index labels preserved). pd.concat is used instead of DataFrame.append,
# which was deprecated in pandas 1.4 and removed in pandas 2.0.
df_combined = pd.concat([imgs_UCLA, imgs_ITU])
len(df_combined)

44446

### Preprocessing: column selection and violence-score filtering

In [7]:
# Build a four-column working copy of the combined data and rename
# the "label" column to "violence".
cols = ["name", "label", "protest", "luca rossi"]
imgs = df_combined.loc[:, cols].rename(columns={"label": "violence"})
imgs

Unnamed: 0_level_0,name,violence,protest,luca rossi
imageHASH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
030d92ca8d1f4b6b,test-00000.jpg,0.217776,True,False
c7c6cecccc6ae6a4,test-00001.jpg,,False,False
94c4c6443cbc9c14,test-00002.jpg,0.127544,True,False
f2f068703a38e869,test-00003.jpg,,False,False
83a2860706c6e5e4,test-00004.jpg,,False,False
1890151c5859c8f8,test-00005.jpg,,False,False
7f5b59cbdc543c2c,test-00006.jpg,,False,False
6264e0c0a0c0a0d0,test-00007.jpg,,False,False
eee6e6e6c68f95b5,test-00008.jpg,,False,False
2f2d252159d9898c,test-00009.jpg,,False,False


In [8]:
# Sanity check: a violence score and the protest flag should always go together.
# Both printed counts are expected to be 0.
has_violence_score = imgs["violence"].notnull()

# Rows without a score that are nevertheless flagged as protest.
imgs_no_violence_labels = imgs[~has_violence_score]
unscored_protest = imgs_no_violence_labels[imgs_no_violence_labels["protest"] == True]
print(len(unscored_protest))

# Rows with a score that are not flagged as protest.
imgs_violence_label = imgs[has_violence_score]
scored_non_protest = imgs_violence_label[imgs_violence_label["protest"] == False]
print(len(scored_non_protest))

0
0


In [9]:
# Drop images whose violence score is above the cut point, then normalise the
# remaining scores and mark unscored images with -1.
cutpoint = 0.43

# Scored images at or below the cut point. Rows with a NaN score never satisfy
# the comparison, so they are not selected here. The filter reads `cutpoint`
# (previously the literal 0.43 was repeated, so editing the variable had no effect).
imgs_lower_cutpoint = imgs[imgs["violence"] <= cutpoint].copy()

# Add back every non-protest image. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
imgs_filtered = pd.concat([imgs_lower_cutpoint, imgs[imgs["protest"] == False]])

# Remember which rows have no violence score before normalising.
indx_non_violence = imgs_filtered['violence'].isnull()

# Normalise the 'violence' column with the project helper
# (presumably min-max scaling — confirm against au.minMax).
imgs_filtered = au.minMax(imgs_filtered, 'violence')

# Mask the rows that had no violence score with the sentinel value -1.
imgs_filtered.loc[indx_non_violence, "violence"] = -1

df = imgs_filtered
print(len(imgs) - len(df), " images were removed")

125  images were removed


In [10]:
# Summarise the filtered frame: rows flagged protest / non-protest and rows
# whose "luca rossi" flag is set (reported as "from ITU dataset").
n_protest = len(df[df["protest"] == True])
n_non_protest = len(df[df["protest"] == False])
n_from_itu = len(df[df["luca rossi"] == True])
print("df has ", n_protest, "protest images, ", n_non_protest,
      "non protest images and  ", n_from_itu, " are from ITU dataset")

df has  12521 protest images,  31800 non protest images and   3609  are from ITU dataset


In [11]:
# Display the filtered frame. At this point "violence" holds the normalised
# score, with -1 for rows that had no violence score.
df

Unnamed: 0_level_0,name,violence,protest,luca rossi
imageHASH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
030d92ca8d1f4b6b,test-00000.jpg,0.508508,True,False
94c4c6443cbc9c14,test-00002.jpg,0.297815,True,False
0b2cf29e92aa294f,test-00010.jpg,0.448857,True,False
160335f46456562e,test-00013.jpg,0.262511,True,False
6a22e2626c464360,test-00020.jpg,0.299641,True,False
7864656be2a2260b,test-00028.jpg,0.408410,True,False
a494c469ac8e8ec4,test-00029.jpg,0.461789,True,False
94d0f7b39ebce000,test-00030.jpg,0.261996,True,False
195acace4e4b65c4,test-00037.jpg,0.454666,True,False
2620a8cad6d292b6,test-00047.jpg,0.357541,True,False


### Divide the data

In [12]:
# Cross-validation settings: number of folds and the seed passed to the splitter.
n_folds, seed = 5, 300

In [13]:
# Split df into folds with the project helper, passing the fold count, the
# "protest" column name and the seed (presumably stratifying on "protest" —
# confirm against au.getKSplitsStratified).
k_folds = au.getKSplitsStratified(df, n_folds, "protest", seed)

# Report the composition of every fold.
for fold in k_folds:
    n_protest = len(fold[fold["protest"] == True])
    n_non_protest = len(fold[fold["protest"] == False])
    n_from_itu = len(fold[fold["luca rossi"] == True])
    print("kfold has ", n_protest, "protest images, ", n_non_protest,
          "non protest images and  ", n_from_itu, " are from ITU dataset")

kfold has  2504 protest images,  6360 non protest images and   733  are from ITU dataset
kfold has  2504 protest images,  6360 non protest images and   726  are from ITU dataset
kfold has  2504 protest images,  6360 non protest images and   705  are from ITU dataset
kfold has  2504 protest images,  6360 non protest images and   701  are from ITU dataset
kfold has  2505 protest images,  6360 non protest images and   744  are from ITU dataset


In [25]:
# Display the first fold (k_folds[0]) for a visual check of its contents.
k_folds[0]

Unnamed: 0_level_0,name,violence,protest
imageHASH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
87534dcfcecedaea,train-32039.jpg,0.908603,True
f3636173f0e86b4f,f3636173f0e86b4f.jpeg,0.000000,False
eceb49c8cc654d8d,train-29168.jpg,0.820274,True
7070788d950d0cc0,test-03689.jpg,0.000000,False
ac8ce4e44b3678ac,train-20995.jpg,0.000000,False
e2bcb3996fab9346,train-01676.jpg,0.000000,False
4565248c8c9cd8ca,train-09974.jpg,0.770185,True
eb26daf2c32f2646,train-20798.jpg,0.000000,False
945a8592484470e4,train-18306.jpg,0.000000,False
63b0302dad5d4ccd,train-11820.jpg,0.800928,True


In [26]:
# Fold 0 is held out for validation and fold 1 for testing; both are copied so
# later edits do not touch the frames stored in k_folds.
validation_df = k_folds[0].copy()
test_df = k_folds[1].copy()

# Folds 2, 3 and 4 form the training set. pd.concat replaces the chained
# DataFrame.append calls (removed in pandas 2.0) and returns a new frame, so
# the per-fold .copy() calls are no longer needed.
train_df = pd.concat([k_folds[2], k_folds[3], k_folds[4]])
print("train set has ", len(train_df) , " images. validation has ", len(validation_df), " images and test has ", len(test_df))

train set has  26595  images. validation has  8863  images and test has  8863


batch_size = 32