In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg19 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [2]:
df_goss_raw = pd.read_csv("df_gosscop.csv")

In [3]:
df_goss_raw.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,text,title,filtered_img2,label,dataset,text_combined,clean_text_combined,idx
0,0,0,0,"Those heels were cute, but they didn't last lo...",Selena Gomez Goes Barefoot On The Street After...,https://web.archive.org/web/20180705092151im_/...,0,1,Selena Gomez Goes Barefoot On The Street After...,Selena Gomez Goes Barefoot On The Street After...,0
1,1,2,2,Play video content TMZ.com\n\nJessica Simpson ...,Jessica Simpson Stumbles Out of Hubby's Birthd...,https://imagez.tmz.com/image/44/16by9/2017/09/...,0,1,Jessica Simpson Stumbles Out of Hubby's Birthd...,Jessica Simpson Stumbles Out of Hubby's Birthd...,2
2,2,3,3,After years of rapping about food (most famous...,Kanye West Is Reportedly Opening a Restaurant ...,https://imgix.bustle.com/wmag/2018/07/18/5b4fb...,0,1,Kanye West Is Reportedly Opening a Restaurant ...,Kanye West Is Reportedly Opening a Restaurant ...,3
3,3,5,5,It’s that time of the week again when the cele...,Is Kanye West Heading to Rehab?,https://web.archive.org/web/20170304210340im_/...,0,1,Is Kanye West Heading to Rehab? It’s that time...,Is Kanye West Heading to Rehab? It’s that time...,5
4,4,6,6,"Kate Middleton and her husband Prince William,...",Pregnant Kate Middleton Hit With Cocaine Bombs...,https://media.radaronline.com/brand-img/JuVUVy...,0,1,Pregnant Kate Middleton Hit With Cocaine Bombs...,Pregnant Kate Middleton Hit With Cocaine Bombs...,6


In [None]:
df_goss_raw.loc[df_goss_raw['Unnamed: 0'] == 6536] #this is the same index used in the text embeddings

In [4]:
df_goss_raw['img_path'] = df_goss_raw['idx'].apply(lambda x:str(x)+'.jpg')
df = df_goss_raw[['Unnamed: 0','img_path','label']]
df

Unnamed: 0.1,Unnamed: 0,img_path,label
0,0,0.jpg,0
1,1,2.jpg,0
2,2,3.jpg,0
3,3,5.jpg,0
4,4,6.jpg,0
...,...,...,...
13021,13269,15478.jpg,1
13022,13270,15479.jpg,1
13023,13271,15480.jpg,1
13024,13272,15481.jpg,1


In [5]:
df = df.rename(columns={'Unnamed: 0': 'idx'})

In [6]:
df

Unnamed: 0,idx,img_path,label
0,0,0.jpg,0
1,1,2.jpg,0
2,2,3.jpg,0
3,3,5.jpg,0
4,4,6.jpg,0
...,...,...,...
13021,13269,15478.jpg,1
13022,13270,15479.jpg,1
13023,13271,15480.jpg,1
13024,13272,15481.jpg,1


In [7]:
ratio_train = 0.8
ratio_val = 0.1
ratio_test = 0.1

# Produces test split.
remaining, test = train_test_split(df, test_size=ratio_test, random_state=214)

# Adjusts val ratio, w.r.t. remaining dataset.
ratio_remaining = 1 - ratio_test
ratio_val_adjusted = ratio_val / ratio_remaining

# Produces train and val splits.
train, val, = train_test_split(remaining, test_size=ratio_val_adjusted, random_state=214)

In [8]:
print(train.shape, val.shape, test.shape)

(10420, 3) (1303, 3) (1303, 3)


In [9]:
train.head()

Unnamed: 0,idx,img_path,label
1902,1934,2449.jpg,0
1386,1408,1827.jpg,0
2539,2589,3222.jpg,0
5649,5778,6901.jpg,1
2869,2922,3597.jpg,0


In [10]:
val.head()

Unnamed: 0,idx,img_path,label
190,192,262.jpg,0
5266,5378,6435.jpg,1
11875,12103,14218.jpg,1
11325,11547,13573.jpg,1
7021,7177,8511.jpg,1


In [11]:
train['label'].value_counts()

1    8090
0    2330
Name: label, dtype: int64

In [12]:
val['label'].value_counts()

1    1011
0     292
Name: label, dtype: int64

In [13]:
test['label'].value_counts()

1    1019
0     284
Name: label, dtype: int64

In [14]:
#Load the pretrained model
vgg19_model = VGG19(weights='imagenet', include_top=False, input_shape=(224,224,3))

In [15]:
vgg19_model.summary()

Model: "vgg19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [16]:
def extract_features(batch, model):
    path = 'C:/Users/labca/Documents/Dissertation - Fake News/images/gosscop_img/'+str(batch)
    img = image.load_img(path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    
    features = model.predict(x)
    return features

In [17]:
train['img_features'] =  train['img_path'].apply(lambda x:extract_features(x,vgg19_model))











































































































































A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['img_features'] =  train['img_path'].apply(lambda x:extract_features(x,vgg19_model))


In [18]:
val['img_features'] =  val['img_path'].apply(lambda x:extract_features(x,vgg19_model))



















A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val['img_features'] =  val['img_path'].apply(lambda x:extract_features(x,vgg19_model))


In [19]:
test['img_features'] =  test['img_path'].apply(lambda x:extract_features(x,vgg19_model))



















A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['img_features'] =  test['img_path'].apply(lambda x:extract_features(x,vgg19_model))


In [20]:
df_train = pd.DataFrame(train, columns=['img_features','label','idx'])
df_val = pd.DataFrame(val, columns=['img_features','label','idx'])
df_test = pd.DataFrame(test, columns=['img_features','label','idx'])

In [21]:
df_train.head()

Unnamed: 0,img_features,label,idx
1902,[[[[ 0. 0. 0. 0. ...,0,1934
1386,[[[[0.0000000e+00 0.0000000e+00 0.0000000e+00 ...,0,1408
2539,[[[[ 0. 0. 0. 0. ...,0,2589
5649,[[[[0.00000000e+00 0.00000000e+00 0.00000000e+...,1,5778
2869,[[[[ 0. 0. 0. 0. ...,0,2922


In [22]:
df_train.to_pickle('C:/Users/labca/Documents/Dissertation - Fake News/Embeddings/df_train_vgg19.pkl')

In [23]:
df_val.to_pickle('C:/Users/labca/Documents/Dissertation - Fake News/Embeddings/df_val_vgg19.pkl')

In [24]:
df_test.to_pickle('C:/Users/labca/Documents/Dissertation - Fake News/Embeddings/df_test_vgg19.pkl')