In [1]:
%tensorflow_version 2.x

#importing libraries
from keras.utils.vis_utils import plot_model
from keras.models import Model
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Input, Activation
from keras.layers import Flatten, BatchNormalization, Concatenate, add
from keras.layers import Embedding, Dropout, Conv1D, MaxPooling1D, Conv2D
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.layers.merge import concatenate
import tensorflow as tf
import os


#Loading the data in a dataframe
df = pd.read_csv('drive/My Drive/The_Research/all_data_refined.csv')
df = df.drop(['emotion'], axis = 1)
docs = df['text']

#test
for i in range(len(docs)):
  if docs[i] == "": print(i)

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)

# pad documents to a max length
wordlen = max(df['word_count'])
max_length = wordlen # Change this if needed
print("Max_Length: ", max_length)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')


# load the whole embedding into memory
embeddings_index = dict()
f = open('drive/My Drive/glove_data/glove.6B/glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()


# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector


#load the labels
type = df['type']
labels = []
for types in type:
  if types == 'real':
    labels.append(1)
  elif types == 'fake':
    labels.append(0)


arr = df[df.columns[8: 23]]

listt = list()
for i in range(len(df)):
  listt.append(i)


X_train_1 = list()
X_train_2 = list()
X_test_1 = list()
X_test_2 = list()


X_train, X_test, y_train, y_test = train_test_split(listt, labels, test_size=0.33)

for i in X_train:
  X_train_1.append(padded_docs[i])
  X_train_2.append(arr.iloc[i])

for i in X_test:
  X_test_1.append(padded_docs[i])
  X_test_2.append(arr.iloc[i])


# define Implicit
inputs1 = Input(shape=(max_length,))
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)(inputs1)
a = (Dropout(0.5))(e)
b = (Conv1D(filters=10, kernel_size=(4)))(a)
c=MaxPooling1D(pool_size=2)(b)
d=Flatten()(c)
f=Dense(128)(d)
g=(BatchNormalization())(f)
z = Activation('relu')
h=(Dropout(0.8))(g)

# define Explicit
inputs2 = Input(shape=(15,))
q=(Dense(128))(inputs2)
r=(BatchNormalization())(q)
u=Activation('relu')(r)

print("h: ", h)
print("u: ", u)

merged = concatenate([h, u])
dense1 = Dense(10, activation='relu')(merged)
outputs = Dense(1, activation='sigmoid')(dense1)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summarize the model
print("____________________")
print(model.summary())
print("____________________")

plot_model(model, show_shapes=True, to_file='multichannelbeta.png')

# fit the model
print("Fitting")
model.fit([X_train_1, X_train_2], array(y_train), epochs=10, verbose=1, batch_size=16)
print("Fitted")


model_json = model.to_json()
with open("textModel.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("textModel.h5")
print("Saved model to disk")


# evaluate the model
print("____________________")
loss, accuracy = model.evaluate([X_test_1, X_test_2] , array(y_test), verbose=1, batch_size=16)
print('Accuracy: %f' % (accuracy*100))

print("____________________")
output = model.predict([X_test_1, X_test_2])
print(output)


def finalOutputWithDelta(delta = 0.0):
  for i in range(len(output)):
    if output[i] >= (0.5 + delta):
      output[i] = 1
    elif output[i] < (0.5 + delta):
      output[i] = 0
  error = 0
  correct = 0
  for i in range(len(output)):
    error = error + ((output[i] - y_test[i]) ** 2)
    if output[i] == y_test[i]: correct+=1

  print("error: ", error)
  print(output)
  print(correct/6605)
finalOutputWithDelta()

Using TensorFlow backend.


Max_Length:  24177
h:  Tensor("dropout_2/cond/Identity:0", shape=(None, 128), dtype=float32)
u:  Tensor("activation_2/Relu:0", shape=(None, 128), dtype=float32)
____________________
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 24177)        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 24177, 100)   16347600    input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 24177, 100)   0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 

In [0]:
# Loading the model:
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
import numpy
import os

# load json and create model
json_file = open('textModel.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("textModel.h5")
print("Loaded model from disk")

loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics= ['accuracy', 'mse'])
score = loaded_model.evaluate([X_test_1, X_test_2] , array(y_test), verbose=1)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

Loaded model from disk
accuracy: 87.57%


In [12]:
arr.head()

Unnamed: 0,word_count,average_word_count,exclamation_count,capital_count,question_count,negation_count,fpp_count,capital_words,anger,disgust,fear,joy,sadness,surprise,trust
0,1082,12.729412,0,230,2,0,3,0,0.004245,0.002355,0.014836,0.408354,0.033534,0.408217,0.080312
1,344,10.117647,0,67,0,0,0,0,0.0259,0.002831,0.063273,0.306524,0.019827,0.326676,0.10239
2,1090,16.268657,1,189,1,0,2,3,0.019803,0.010692,0.024147,0.580804,0.083064,0.219682,0.040171
3,1302,13.151515,0,283,0,0,3,13,0.010901,0.001291,0.023162,0.688297,0.004354,0.086323,0.117578
4,518,14.0,0,115,0,0,0,0,0.02037,0.002589,0.040738,0.293498,0.016402,0.01618,0.609104


In [52]:
arr.iloc[2]

word_count            1090.000000
average_word_count      16.268657
exclamation_count        1.000000
capital_count          189.000000
question_count           1.000000
negation_count           0.000000
fpp_count                2.000000
capital_words            3.000000
anger                    0.019803
disgust                  0.010692
fear                     0.024147
joy                      0.580804
sadness                  0.083064
surprise                 0.219682
trust                    0.040171
Name: 2, dtype: float64

In [50]:
count = 0
for i in range(len(X_test_2)):
  for j in range(len(X_train_2)):
    if X_test_2[i].name == X_train_2[j].name: 
      print("Test: ", i, "Train: ", j)
    else: 
      count +=1

print(count)

print(X_test == X_train)

100150056
False


In [47]:
X_train_2[0].name

3198

20015

In [0]:
m2 = Model()
m2.load
m2.plot_model(m2, show_shapes=True, to_file='calssifier_news.png')
plot_model(model, show_shapes=True, to_file='multichannelbeta.png')