In [None]:
from gensim.models import Word2Vec

import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

%tensorflow_version 2.x

In [0]:
my_df = pd.read_csv('processedData/Cleaned_Data.csv',index_col=0)


my_df.head()


my_df.dropna(inplace=True)
my_df.reset_index(drop=True,inplace=True)
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10864 entries, 0 to 10863
Data columns (total 3 columns):
TweetId    10864 non-null int64
Tweet      10864 non-null object
Label      10864 non-null int64
dtypes: int64(2), object(1)
memory usage: 254.8+ KB


In [0]:
SEED = 2000

X = my_df['Tweet']
Y = my_df['Label']
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(X, Y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

In [0]:
print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),(len(x_train[y_train == 0]) / (len(x_train)*1.))*100,(len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),(len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,(len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),(len(x_test[y_test == 0]) / (len(x_test)*1.))*100,(len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

Train set has total 10646 entries with 56.96% negative, 43.04% positive
Validation set has total 109 entries with 59.63% negative, 40.37% positive
Test set has total 109 entries with 56.88% negative, 43.12% positive


In [0]:

w2v_sg_model = Word2Vec.load('saved_models/tweets_word2vec_sg.model')
w2v_cbow_model = Word2Vec.load('saved_models/tweets_word2vec_cbow.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
len(w2v_sg_model.wv.vocab.keys())

8538

In [0]:
embeddings_index = {}
for w in w2v_sg_model.wv.vocab.keys():
    embeddings_index[w] = np.append(w2v_sg_model.wv[w],w2v_cbow_model.wv[w])
print('Found %s word vectors.' % len(embeddings_index))

Found 8538 word vectors.


In [0]:
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

In [0]:
import io
import json
tokenizer_json = tokenizer.to_json()
with io.open('saved_models/tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [0]:
len(tokenizer.word_index)

18041

In [0]:
for x in x_train[:5]:
    print(x)

brother crying cause thunder lmao
grill school function grills going pretty much forest fire inbetween Made work
berggruenInst Berggruen Institute member dambisamoyo problems infrastructure ProSyn
electrocute somebody thank
Suspect latest theatre attack psychological issues


In [0]:
sequences[:5]

[[788, 1268, 131, 161, 905],
 [1348, 66, 3676, 4738, 21, 491, 70, 47, 1, 6979, 197, 56],
 [6980, 6981, 3677, 1613, 6982, 1187, 4739, 3678],
 [408, 1269, 492],
 [364, 77, 1759, 20, 3016, 421]]

In [0]:
length = []
for x in x_train:
    length.append(len(x.split()))

In [0]:
#maxlength=max(length)
maxlength=25
print(maxlength)

25


In [0]:
x_train_seq = pad_sequences(sequences, maxlen=maxlength,padding='post')
print('Shape of data tensor:', x_train_seq.shape)

Shape of data tensor: (10646, 25)


In [0]:
x_train_seq[:5]

array([[ 788, 1268,  131,  161,  905,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [1348,   66, 3676, 4738,   21,  491,   70,   47,    1, 6979,  197,
          56,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [6980, 6981, 3677, 1613, 6982, 1187, 4739, 3678,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [ 408, 1269,  492,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [ 364,   77, 1759,   20, 3016,  421,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]], dtype=int32)

In [0]:

sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=maxlength)

In [0]:
sequences_test = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(sequences_test, maxlen=maxlength)

## **Custom weight embedding**

In [0]:
num_words = 100000
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [0]:
print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.25561056  0.19907857 -0.16941679 ... -1.09632814 -0.11889576
   0.56917977]
 [-0.61784023  0.41687176 -0.93566668 ... -0.81025845  0.06349801
   0.09247321]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


 Cnn model

In [0]:
seed = 7

from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model

from keras.models import load_model


In [0]:
model_cnn_01 = Sequential()
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=maxlength, trainable=True)
model_cnn_01.add(e)
model_cnn_01.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_01.add(GlobalMaxPooling1D())
model_cnn_01.add(Dense(256, activation='relu'))
model_cnn_01.add(Dense(1, activation='sigmoid'))

model_cnn_01.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn_01.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)

model_cnn_01.summary()

Train on 10646 samples, validate on 109 samples
Epoch 1/5
 - 75s - loss: 0.5105 - acc: 0.7563 - val_loss: 0.4296 - val_acc: 0.8165
Epoch 2/5
 - 74s - loss: 0.3330 - acc: 0.8608 - val_loss: 0.4799 - val_acc: 0.7890
Epoch 3/5
 - 74s - loss: 0.1504 - acc: 0.9456 - val_loss: 0.5897 - val_acc: 0.7156
Epoch 4/5
 - 75s - loss: 0.0941 - acc: 0.9651 - val_loss: 0.6586 - val_acc: 0.7890
Epoch 5/5
 - 74s - loss: 0.0768 - acc: 0.9711 - val_loss: 0.6417 - val_acc: 0.7248
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 200)           20000000  
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 24, 100)           40100     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 100)               0         
_____________________________________________________

In [0]:
model_cnn_01.evaluate(x=x_test_seq, y=y_test)



[0.5865811109542847, 0.7339449563157667]

using bigram and tri fouth grams


In [0]:

tweet_input = Input(shape=(maxlength,), dtype='int32')

tweet_encoder = Embedding(100000, 200, weights=[embedding_matrix], input_length=maxlength, trainable=True)(tweet_input)
bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
bigram_branch = GlobalMaxPooling1D()(bigram_branch)
trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
trigram_branch = GlobalMaxPooling1D()(trigram_branch)
fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 25, 200)      20000000    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 24, 100)      40100       embedding_4[0][0]                
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 23, 100)      60100       embedding_4[0][0]                
____________________________________________________________________________________________

In [0]:

filepath="saved_models/CNN_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

model.fit(x_train_seq, y_train, batch_size=32, epochs=5,
                     validation_data=(x_val_seq, y_validation), callbacks = [checkpoint])

Train on 10646 samples, validate on 109 samples
Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.81651, saving model to gdrive/My Drive/Text_classifier/saved_models/CNN_best_weights.01-0.8165.hdf5
Epoch 2/5

Epoch 00002: val_acc did not improve from 0.81651
Epoch 3/5

Epoch 00003: val_acc did not improve from 0.81651
Epoch 4/5

Epoch 00004: val_acc did not improve from 0.81651
Epoch 5/5

Epoch 00005: val_acc did not improve from 0.81651


<keras.callbacks.History at 0x7ff9b80b81d0>

In [0]:
loaded_CNN_model = load_model('saved_models/CNN_best_weights.01-0.8165.hdf5')
loaded_CNN_model.evaluate(x=x_test_seq, y=y_test)



[0.398078240385843, 0.8440366988882012]

## **Google vector**

In [0]:
from gensim.models.keyedvectors import KeyedVectors

googlenews = KeyedVectors.load_word2vec_format('processedData/GoogleNews-vectors-negative300.bin', binary=True)

print('---')
len(googlenews.wv.vocab.keys())


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


---


  


3000000

In [0]:
embeddings_index_google = {}


for w in googlenews.wv.vocab.keys():
    embeddings_index_google[w] = googlenews.wv[w]
print('Found %s word vectors.' % len(embeddings_index_google))



  after removing the cwd from sys.path.
  """


Found 3000000 word vectors.


In [0]:

num_words = 100000
embedding_matrix_google = np.zeros((num_words, 300))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector_google = embeddings_index_google.get(word)
    if embedding_vector_google is not None:
        embedding_matrix_google[i] = embedding_vector_google


In [0]:
print(embedding_matrix_google)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.35546875  0.18359375  0.14941406 ...  0.04980469 -0.22265625
   0.00405884]
 [-0.13867188  0.04370117 -0.13085938 ...  0.08251953  0.22949219
   0.05932617]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [0]:

tweet_input_google = Input(shape=(maxlength,), dtype='int32')

tweet_encoder_google = Embedding(100000, 300, weights=[embedding_matrix_google], input_length=maxlength, trainable=True)(tweet_input_google)
bigram_branch_google = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder_google)
bigram_branch_google = GlobalMaxPooling1D()(bigram_branch_google)
trigram_branch_google = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder_google)
trigram_branch_google = GlobalMaxPooling1D()(trigram_branch_google)
fourgram_branch_google = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder_google)
fourgram_branch_google = GlobalMaxPooling1D()(fourgram_branch_google)
merged_google = concatenate([bigram_branch_google, trigram_branch_google, fourgram_branch_google], axis=1)

merged_google = Dense(256, activation='relu')(merged_google)
merged_google = Dropout(0.2)(merged_google)
merged_google = Dense(1)(merged_google)
output_google = Activation('sigmoid')(merged_google)
model_google = Model(inputs=[tweet_input_google], outputs=[output_google])
model_google.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model_google.summary()

In [0]:

filepath="saved_models/CNN_Google_news.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

model_google.fit(x_train_seq, y_train, batch_size=32, epochs=5,
                     validation_data=(x_val_seq, y_validation), callbacks = [checkpoint])



Train on 10646 samples, validate on 109 samples
Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.84404, saving model to gdrive/My Drive/Text_classifier/saved_models/CNN_Google_news.01-0.8440.hdf5
Epoch 2/5

Epoch 00002: val_acc did not improve from 0.84404
Epoch 3/5

Epoch 00003: val_acc did not improve from 0.84404
Epoch 4/5

Epoch 00004: val_acc did not improve from 0.84404
Epoch 5/5

Epoch 00005: val_acc did not improve from 0.84404


<keras.callbacks.History at 0x7f726ae61da0>

In [0]:
loaded_CNN_model_google = load_model('saved_models/CNN_Google_news.01-0.8440.hdf5')
loaded_CNN_model_google.evaluate(x=x_test_seq, y=y_test)



[0.34185645339685844, 0.8623853216477491]

# **Twitter vector**

In [0]:
from gensim.models.keyedvectors import KeyedVectors

googlenews = KeyedVectors.load_word2vec_format('processedData/glove_twitter_27B_200d.bin')

print('---')
len(googlenews.wv.vocab.keys())


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


---


  


1193514

In [0]:
embeddings_index_twitter = {}


for w in googlenews.wv.vocab.keys():
    embeddings_index_twitter[w] = googlenews.wv[w]
print('Found %s word vectors.' % len(embeddings_index_twitter))



  after removing the cwd from sys.path.
  """


Found 1193514 word vectors.


In [0]:

num_words = 100000
embedding_matrix_twitter = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector_twitter = embeddings_index_twitter.get(word)
    if embedding_vector_twitter is not None:
        embedding_matrix_twitter[i] = embedding_vector_twitter


In [0]:
print(embedding_matrix_twitter)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.48187    -0.29707    -0.77723002 ... -0.96424001  0.065379
   0.20376   ]
 [ 0.18068001  0.37165001 -0.93690002 ...  0.069149   -0.77416003
   0.05999   ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [0]:

tweet_input_twitter = Input(shape=(maxlength,), dtype='int32')

tweet_encoder_twitter = Embedding(100000, 200, weights=[embedding_matrix_twitter], input_length=maxlength, trainable=True)(tweet_input_twitter)
bigram_branch_twitter = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder_twitter)
bigram_branch_twitter = GlobalMaxPooling1D()(bigram_branch_twitter)
trigram_branch_twitter = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder_twitter)
trigram_branch_twitter = GlobalMaxPooling1D()(trigram_branch_twitter)
fourgram_branch_twitter = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder_twitter)
fourgram_branch_twitter = GlobalMaxPooling1D()(fourgram_branch_twitter)
merged_twitter = concatenate([bigram_branch_twitter, trigram_branch_twitter, fourgram_branch_twitter], axis=1)

merged_twitter = Dense(256, activation='relu')(merged_twitter)
merged_twitter = Dropout(0.2)(merged_twitter)
merged_twitter = Dense(1)(merged_twitter)
output_twitter = Activation('sigmoid')(merged_twitter)
model_twitter = Model(inputs=[tweet_input_twitter], outputs=[output_twitter])
model_twitter.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model_twitter.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 25, 200)      20000000    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 24, 100)      40100       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 23, 100)      60100       embedding_2[0][0]                
____________________________________________________________________________________________

In [0]:

filepath="saved_models/CNN_glove_twitter.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

model_twitter.fit(x_train_seq, y_train, batch_size=32, epochs=5,
                     validation_data=(x_val_seq, y_validation), callbacks = [checkpoint])

Train on 10646 samples, validate on 109 samples
Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.83486, saving model to gdrive/My Drive/Text_classifier/saved_models/CNN_glove_twitter.01-0.8349.hdf5
Epoch 2/5

Epoch 00002: val_acc did not improve from 0.83486
Epoch 3/5

Epoch 00003: val_acc improved from 0.83486 to 0.83486, saving model to gdrive/My Drive/Text_classifier/saved_models/CNN_glove_twitter.03-0.8349.hdf5
Epoch 4/5

Epoch 00004: val_acc did not improve from 0.83486
Epoch 5/5

Epoch 00005: val_acc did not improve from 0.83486


<keras.callbacks.History at 0x7f7279ab95c0>

In [0]:
loaded_CNN_model = load_model('saved_models/CNN_glove_twitter.03-0.8349.hdf5')
loaded_CNN_model.evaluate(x=x_test_seq, y=y_test)



[0.46489426466303135, 0.8165137620147215]