In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.models import Model
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [2]:
df_goss_raw = pd.read_csv("df_gosscop.csv")

In [3]:
df_goss_raw.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,text,title,filtered_img2,label,dataset,text_combined,clean_text_combined,idx
0,0,0,0,"Those heels were cute, but they didn't last lo...",Selena Gomez Goes Barefoot On The Street After...,https://web.archive.org/web/20180705092151im_/...,0,1,Selena Gomez Goes Barefoot On The Street After...,Selena Gomez Goes Barefoot On The Street After...,0
1,1,2,2,Play video content TMZ.com\n\nJessica Simpson ...,Jessica Simpson Stumbles Out of Hubby's Birthd...,https://imagez.tmz.com/image/44/16by9/2017/09/...,0,1,Jessica Simpson Stumbles Out of Hubby's Birthd...,Jessica Simpson Stumbles Out of Hubby's Birthd...,2
2,2,3,3,After years of rapping about food (most famous...,Kanye West Is Reportedly Opening a Restaurant ...,https://imgix.bustle.com/wmag/2018/07/18/5b4fb...,0,1,Kanye West Is Reportedly Opening a Restaurant ...,Kanye West Is Reportedly Opening a Restaurant ...,3
3,3,5,5,It’s that time of the week again when the cele...,Is Kanye West Heading to Rehab?,https://web.archive.org/web/20170304210340im_/...,0,1,Is Kanye West Heading to Rehab? It’s that time...,Is Kanye West Heading to Rehab? It’s that time...,5
4,4,6,6,"Kate Middleton and her husband Prince William,...",Pregnant Kate Middleton Hit With Cocaine Bombs...,https://media.radaronline.com/brand-img/JuVUVy...,0,1,Pregnant Kate Middleton Hit With Cocaine Bombs...,Pregnant Kate Middleton Hit With Cocaine Bombs...,6


In [4]:
df_goss_raw.loc[df_goss_raw['Unnamed: 0'] == 6536] #this is the same index used in the text embeddings

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,text,title,filtered_img2,label,dataset,text_combined,clean_text_combined,idx
6390,6536,7782,7782,"The journalist called the job ""a dream come tr...",Abby Huntsman to join 'The View' as new co-host,https://s.abcnews.com/images/GMA/abby-huntsman...,1,1,Abby Huntsman to join 'The View' as new co-hos...,Abby Huntsman to join 'The View' as new co-hos...,7782


In [5]:
df_goss_raw['img_path'] = df_goss_raw['idx'].apply(lambda x:str(x)+'.jpg')
df = df_goss_raw[['Unnamed: 0','img_path','label']]
df

Unnamed: 0.1,Unnamed: 0,img_path,label
0,0,0.jpg,0
1,1,2.jpg,0
2,2,3.jpg,0
3,3,5.jpg,0
4,4,6.jpg,0
...,...,...,...
13021,13269,15478.jpg,1
13022,13270,15479.jpg,1
13023,13271,15480.jpg,1
13024,13272,15481.jpg,1


In [6]:
df = df.rename(columns={'Unnamed: 0': 'idx'})

In [7]:
df

Unnamed: 0,idx,img_path,label
0,0,0.jpg,0
1,1,2.jpg,0
2,2,3.jpg,0
3,3,5.jpg,0
4,4,6.jpg,0
...,...,...,...
13021,13269,15478.jpg,1
13022,13270,15479.jpg,1
13023,13271,15480.jpg,1
13024,13272,15481.jpg,1


In [10]:
ratio_train = 0.8
ratio_val = 0.1
ratio_test = 0.1

# Produces test split.
remaining, test = train_test_split(df, test_size=ratio_test, random_state=214)

# Adjusts val ratio, w.r.t. remaining dataset.
ratio_remaining = 1 - ratio_test
ratio_val_adjusted = ratio_val / ratio_remaining

# Produces train and val splits.
train, val, = train_test_split(remaining, test_size=ratio_val_adjusted, random_state=214)

In [11]:
print(train.shape, val.shape, test.shape)

(10420, 3) (1303, 3) (1303, 3)


In [12]:
train.head()

Unnamed: 0,idx,img_path,label
1902,1934,2449.jpg,0
1386,1408,1827.jpg,0
2539,2589,3222.jpg,0
5649,5778,6901.jpg,1
2869,2922,3597.jpg,0


In [13]:
val.head()

Unnamed: 0,idx,img_path,label
190,192,262.jpg,0
5266,5378,6435.jpg,1
11875,12103,14218.jpg,1
11325,11547,13573.jpg,1
7021,7177,8511.jpg,1


In [14]:
train['label'].value_counts()

1    8090
0    2330
Name: label, dtype: int64

In [15]:
val['label'].value_counts()

1    1011
0     292
Name: label, dtype: int64

In [16]:
test['label'].value_counts()

1    1019
0     284
Name: label, dtype: int64

In [17]:
#Load the pretrained model
xception_model = Xception(weights='imagenet')
model = Model(inputs=xception_model.input, outputs=xception_model.get_layer(xception_model.layers[-2].name).output)

In [18]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 299, 299, 3  0           []                               
                                )]                                                                
                                                                                                  
 block1_conv1 (Conv2D)          (None, 149, 149, 32  864         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 block1_conv1_bn (BatchNormaliz  (None, 149, 149, 32  128        ['block1_conv1[0][0]']           
 ation)                         )                                                             

 block4_sepconv1_bn (BatchNorma  (None, 37, 37, 728)  2912       ['block4_sepconv1[0][0]']        
 lization)                                                                                        
                                                                                                  
 block4_sepconv2_act (Activatio  (None, 37, 37, 728)  0          ['block4_sepconv1_bn[0][0]']     
 n)                                                                                               
                                                                                                  
 block4_sepconv2 (SeparableConv  (None, 37, 37, 728)  536536     ['block4_sepconv2_act[0][0]']    
 2D)                                                                                              
                                                                                                  
 block4_sepconv2_bn (BatchNorma  (None, 37, 37, 728)  2912       ['block4_sepconv2[0][0]']        
 lization)

 n)                                                                                               
                                                                                                  
 block7_sepconv1 (SeparableConv  (None, 19, 19, 728)  536536     ['block7_sepconv1_act[0][0]']    
 2D)                                                                                              
                                                                                                  
 block7_sepconv1_bn (BatchNorma  (None, 19, 19, 728)  2912       ['block7_sepconv1[0][0]']        
 lization)                                                                                        
                                                                                                  
 block7_sepconv2_act (Activatio  (None, 19, 19, 728)  0          ['block7_sepconv1_bn[0][0]']     
 n)                                                                                               
          

 block9_sepconv3_bn (BatchNorma  (None, 19, 19, 728)  2912       ['block9_sepconv3[0][0]']        
 lization)                                                                                        
                                                                                                  
 add_7 (Add)                    (None, 19, 19, 728)  0           ['block9_sepconv3_bn[0][0]',     
                                                                  'add_6[0][0]']                  
                                                                                                  
 block10_sepconv1_act (Activati  (None, 19, 19, 728)  0          ['add_7[0][0]']                  
 on)                                                                                              
                                                                                                  
 block10_sepconv1 (SeparableCon  (None, 19, 19, 728)  536536     ['block10_sepconv1_act[0][0]']   
 v2D)     

                                                                                                  
 block12_sepconv3_act (Activati  (None, 19, 19, 728)  0          ['block12_sepconv2_bn[0][0]']    
 on)                                                                                              
                                                                                                  
 block12_sepconv3 (SeparableCon  (None, 19, 19, 728)  536536     ['block12_sepconv3_act[0][0]']   
 v2D)                                                                                             
                                                                                                  
 block12_sepconv3_bn (BatchNorm  (None, 19, 19, 728)  2912       ['block12_sepconv3[0][0]']       
 alization)                                                                                       
                                                                                                  
 add_10 (A

In [19]:
def extract_features(batch, model):
    path = 'C:/Users/labca/Documents/Dissertation - Fake News/images/gosscop_img/'+str(batch)
    img = image.load_img(path, target_size=(299, 299))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    
    features = model.predict(x)
    return features

In [20]:
train['img_features'] =  train['img_path'].apply(lambda x:extract_features(x,model))













































































A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['img_features'] =  train['img_path'].apply(lambda x:extract_features(x,model))


In [21]:
val['img_features'] =  val['img_path'].apply(lambda x:extract_features(x,model))











A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val['img_features'] =  val['img_path'].apply(lambda x:extract_features(x,model))


In [22]:
test['img_features'] =  test['img_path'].apply(lambda x:extract_features(x,model))













A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['img_features'] =  test['img_path'].apply(lambda x:extract_features(x,model))


In [23]:
df_train = pd.DataFrame(train, columns=['img_features','label','idx'])
df_val = pd.DataFrame(val, columns=['img_features','label','idx'])
df_test = pd.DataFrame(test, columns=['img_features','label','idx'])

In [24]:
df_train.head()

Unnamed: 0,img_features,label,idx
1902,"[[0.06194809, 0.057009038, 0.0064003034, 0.008...",0,1934
1386,"[[0.15035054, 0.16874047, 0.12524872, 0.003353...",0,1408
2539,"[[0.14565009, 0.09997965, 0.5346557, 0.0057487...",0,2589
5649,"[[0.32312977, 0.29112035, 0.20511524, 0.093824...",1,5778
2869,"[[0.0056584068, 0.05872748, 0.00397057, 0.2490...",0,2922


In [25]:
df_train_1= pd.DataFrame(train, columns=['img_features','label','idx'])
df_test_1 = pd.DataFrame(test, columns=['img_features','label','idx'])

In [26]:
df_train_1.head()

Unnamed: 0,img_features,label,idx
1902,"[[0.06194809, 0.057009038, 0.0064003034, 0.008...",0,1934
1386,"[[0.15035054, 0.16874047, 0.12524872, 0.003353...",0,1408
2539,"[[0.14565009, 0.09997965, 0.5346557, 0.0057487...",0,2589
5649,"[[0.32312977, 0.29112035, 0.20511524, 0.093824...",1,5778
2869,"[[0.0056584068, 0.05872748, 0.00397057, 0.2490...",0,2922


In [27]:
df_train.to_pickle('C:/Users/labca/Documents/Dissertation - Fake News/Embeddings/df_train_xception.pkl')

In [28]:
df_val.to_pickle('C:/Users/labca/Documents/Dissertation - Fake News/Embeddings/df_val_xception.pkl')

In [29]:
df_test.to_pickle('C:/Users/labca/Documents/Dissertation - Fake News/Embeddings/df_test_xception.pkl')

In [31]:
df_train = pd.DataFrame(train, columns=['img_features','label','idx','img_path'])
df_val = pd.DataFrame(val, columns=['img_features','label','idx','img_path'])
df_test = pd.DataFrame(test, columns=['img_features','label','idx','img_path'])

In [32]:
df_train.head()

Unnamed: 0,img_features,label,idx,img_path
1902,"[[0.06194809, 0.057009038, 0.0064003034, 0.008...",0,1934,2449.jpg
1386,"[[0.15035054, 0.16874047, 0.12524872, 0.003353...",0,1408,1827.jpg
2539,"[[0.14565009, 0.09997965, 0.5346557, 0.0057487...",0,2589,3222.jpg
5649,"[[0.32312977, 0.29112035, 0.20511524, 0.093824...",1,5778,6901.jpg
2869,"[[0.0056584068, 0.05872748, 0.00397057, 0.2490...",0,2922,3597.jpg


In [33]:
df_train.to_csv('C:/Users/labca/Documents/Dissertation - Fake News/df_train_base.csv')
df_val.to_csv('C:/Users/labca/Documents/Dissertation - Fake News/df_val_base.csv')
df_test.to_csv('C:/Users/labca/Documents/Dissertation - Fake News/df_test_base.csv')