In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be accessed through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy


In [None]:
# Work from the project folder on the mounted Drive; the relative paths used
# by later cells (dataset, GloVe file, checkpoint) are resolved against it.
%cd /content/drive/MyDrive/Training/DTS_Tensorflow/demo/

/content/drive/MyDrive/Training/DTS_Tensorflow/demo


In [None]:
# One-time setup, left commented out: download the GloVe Twitter embeddings archive.
# !wget https://nlp.stanford.edu/data/glove.twitter.27B.zip

In [None]:
# One-time setup, left commented out: extract the downloaded GloVe archive.
# !unzip glove.twitter.27B.zip

In [None]:
# One-time setup, left commented out: delete the archive once it has been extracted.
# !rm glove.twitter.27B.zip

In [None]:
# Load the tweet dataset from the current working directory.
df = pd.read_csv(
    'tweets.csv',
)

In [None]:
# Preview the raw rows before any text cleaning is applied.
df.head(n=20)

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0
5,5,ablaze,OC,"If this child was Chinese, this tweet would ha...",0
6,6,ablaze,"London, England",Several houses have been set ablaze in Ngemsib...,1
7,7,ablaze,Bharat,Asansol: A BJP office in Salanpur village was ...,1
8,8,ablaze,"Accra, Ghana","National Security Minister, Kan Dapaah's side ...",0
9,9,ablaze,Searching,This creature who’s soul is no longer clarent ...,0


In [None]:
def remove_URL(text):
    """Replace every http/https URL in `text` with the placeholder ' httpsmark '.

    A URL is taken to be 'http://' or 'https://' followed by a run of
    non-whitespace characters.
    """
    return re.sub(r'https?://\S+', r' httpsmark ', text)


def remove_html(text):
    """Strip anything that looks like an HTML tag ('<' ... '>') from `text`.

    The match is non-greedy, so each tag is removed individually and the text
    between tags is kept.
    """
    return re.sub(r'<.*?>', r'', text)


def remove_atsymbol(text):
    """Replace each @mention in `text` with the placeholder ' atsymbol '.

    A mention is '@' followed by a run of non-whitespace characters, so any
    punctuation glued to the handle is swallowed along with it.
    """
    return re.sub(r'@\S+', r' atsymbol ', text)


def remove_hashtag(text):
    """Replace every '#' character in `text` with the word ' hashtag '.

    Only the symbol is replaced; the tag text that follows it is kept.
    """
    return re.sub(r'#', r' hashtag ', text)


def remove_exclamation(text):
    """Replace every '!' in `text` with the word ' exclamation '."""
    return re.sub(r'!', r' exclamation ', text)


def remove_question(text):
    """Replace every '?' in `text` with the word ' question '.

    Args:
        text: The string to process.

    Returns:
        A copy of `text` in which each question mark has been replaced by
        ' question ' (with surrounding spaces).
    """
    # '?' is a regex quantifier; unescaped it makes re.compile raise
    # "nothing to repeat", so it must be escaped to match a literal '?'.
    question = re.compile(r'\?')
    return question.sub(r' question ', text)


def remove_punc(text):
    """Delete every ASCII punctuation character (string.punctuation) from `text`.

    Non-ASCII punctuation such as typographic quotes is not in
    string.punctuation and is therefore left in place.
    """
    # A translation table that maps a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)


def remove_number(text):
    """Replace each run of digits in `text` with the word ' number '.

    Consecutive digits collapse into a single placeholder.
    """
    return re.sub(r'\d+', r' number ', text)


def remove_emoji(string):
    """Replace each run of emoji/symbol characters with the word ' emoji '.

    Consecutive matching characters collapse into a single placeholder.

    Note: the parameter name shadows the stdlib `string` module, but only
    inside this function.
    NOTE(review): several ranges are very broad (U+24C2-U+1F251 and
    U+10000-U+10FFFF), so a lot of non-emoji, non-Latin text is replaced as
    well. Confirm that this is intended.
    """
    char_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        u"\U00002500-\U00002BEF",  # box drawing through misc. symbols and arrows
        u"\U00002702-\U000027B0",  # dingbats
        u"\U00002702-\U000027B0",  # (duplicate of the previous range, kept as-is)
        u"\U000024C2-\U0001F251",
        u"\U0001f926-\U0001f937",
        u"\U00010000-\U0010ffff",  # every supplementary-plane code point
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u200d",  # zero-width joiner
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\ufe0f",  # variation selector-16
        u"\u3030",
    )
    emoji_run = re.compile("[" + "".join(char_ranges) + "]+", flags=re.UNICODE)
    return emoji_run.sub(r' emoji ', string)

In [None]:

# Normalise the tweet text in place: lowercase first, then run each cleaning
# step over the column in turn. Order matters: URLs, HTML tags, mentions,
# '#' and '!' are turned into word placeholders before remove_punc strips the
# remaining punctuation. remove_question is not part of this pipeline, so '?'
# is simply dropped by remove_punc.
df['text'] = df['text'].str.lower()
for cleaning_step in (
    remove_URL,
    remove_html,
    remove_atsymbol,
    remove_hashtag,
    remove_exclamation,
    remove_punc,
    remove_number,
    remove_emoji,
):
    df['text'] = df['text'].apply(cleaning_step)

In [None]:
# Preview the rows again to inspect the result of the text cleaning.
df.head(n=20)

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,communal violence in bhainsa telangana stones ...,1
1,1,ablaze,,telangana section number has been imposed in...,1
2,2,ablaze,New York City,arsonist sets cars ablaze at dealership https...,1
3,3,ablaze,"Morgantown, WV",arsonist sets cars ablaze at dealership https...,1
4,4,ablaze,,lord jesus your love brings freedom and pardon...,0
5,5,ablaze,OC,if this child was chinese this tweet would hav...,0
6,6,ablaze,"London, England",several houses have been set ablaze in ngemsib...,1
7,7,ablaze,Bharat,asansol a bjp office in salanpur village was s...,1
8,8,ablaze,"Accra, Ghana",national security minister kan dapaahs side ch...,0
9,9,ablaze,Searching,this creature who’s soul is no longer clarent ...,0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    random_state=42,
)

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2

In [None]:
# Tokenize words
# Tokenize words
# NOTE(review): the vocabulary is fitted on the train AND test texts together.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pd.concat([X_train, X_test], axis=0))

# Map each tweet to a list of integer word ids.
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to length 280; longer ones lose tokens from the front.
# X_train / X_test are rebound from text Series to padded integer arrays here.
X_train, X_test = (
    pad_sequences(seqs, maxlen=280, truncating='pre')
    for seqs in (sequences_train, sequences_test)
)
# sequences_val = tokenizer.texts_to_sequences(X_val)
# X_val = pad_sequences(sequences_val, maxlen=280, truncating='pre')

# One more than the number of known words.
vocabSize = len(tokenizer.index_word) + 1
print(f"Vocabulary size = {vocabSize}")

Vocabulary size = 24103


In [None]:
# Read GloVe embeddings and build the embedding matrix for our vocabulary.
path_to_glove_file = 'glove.twitter.27B.200d.txt'
num_tokens = vocabSize
embedding_dim = 200
hits = 0
misses = 0
embeddings_index = {}

# Read word vectors: each line is "<word> <v1> <v2> ... <vN>".
# The encoding is given explicitly so parsing does not depend on the
# platform's default locale encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
print("Found %s word vectors." % len(embeddings_index))


# Assign word vectors to our dictionary/vocabulary.
# Rows start as zeros, so any word without a usable GloVe vector (and row 0,
# which no word in word_index maps to) keeps an all-zero embedding.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    # The shape check skips malformed entries (e.g. a line whose token itself
    # contained whitespace), which would otherwise raise on assignment.
    if embedding_vector is not None and embedding_vector.shape == (embedding_dim,):
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Found 1193514 word vectors.
Converted 16715 words (7387 misses)


In [None]:
# Build neural network architecture
# Build neural network architecture
adam = Adam(learning_rate=0.001)

model = Sequential()
# Frozen embedding layer initialised from the pre-computed embedding_matrix.
model.add(
    Embedding(
        vocabSize,
        200,
        weights=[embedding_matrix],
        trainable=False,
        input_length=280,
    )
)
# A single bidirectional LSTM reads the sequence in both directions.
model.add(Bidirectional(LSTM(64, recurrent_dropout=0.4)))
model.add(Dense(32, activation='relu'))
# Two-way softmax over the integer labels in the `target` column.
model.add(Dense(2, activation='softmax'))

model.compile(
    optimizer=adam,
    loss=SparseCategoricalCrossentropy(),
    metrics=[SparseCategoricalAccuracy()],
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 280, 200)          4820600   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              135680    
 l)                                                              
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dense_1 (Dense)             (None, 2)                 66        
                                                                 
Total params: 4,960,474
Trainable params: 139,874
Non-trainable params: 4,820,600
_________________________________________________________________


In [None]:
#Callback


callback = [
    # Stop once validation accuracy has failed to improve by at least 0.01
    # for 5 consecutive epochs, then roll back to the best weights seen.
    EarlyStopping(
        monitor='val_sparse_categorical_accuracy',
        min_delta=0.01,
        patience=5,
        restore_best_weights=True,
    ),
    # Keep only the weights of the epoch with the highest validation accuracy.
    ModelCheckpoint(
        filepath='checkpoint_model/',
        monitor='val_sparse_categorical_accuracy',
        mode='max',
        save_best_only=True,
        save_weights_only=True,
    ),
]

In [None]:
# Fit model
# Train for up to 20 epochs. validation_split=0.2 makes Keras hold out part of
# the training arrays for validation, which produces the val_* metric that the
# callbacks monitor.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=20,
    validation_split=0.2,
    callbacks=callback,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

In [None]:
# Restore the best weights written by ModelCheckpoint. The path must be exactly
# the checkpoint prefix passed as `filepath` there ('checkpoint_model/', with
# the trailing slash); without the slash the checkpoint is not found.
# NOTE(review): assumes the TF-checkpoint weight format of TF 2.x Keras - confirm
# if the notebook is moved to Keras 3.
model.load_weights('checkpoint_model/')

In [None]:
# Report loss and accuracy on the held-out test split. X_val / y_val are never
# created (the separate validation split is commented out), so evaluating on
# them raised NameError.
model.evaluate(X_test, y_test, verbose=1)

In [None]:
# Run the trained model on a few hand-written example tweets.
sentences = [
             'aircraft crash in airport today',
             'Sky News showing the true disaster of these fires with houses being burnt to the ground, many dry months plus this extreme heat clearly a factor. ',
             "What am I supposed to do in this massive sea? I feel suffocated by this yearning sensation but at times my will feels like it's fading so I push past to survive these disasters but I'm afraid to just end it."
            ]


def _clean_for_inference(text):
    """Apply the same cleaning, in the same order, as was applied to df['text']."""
    text = text.lower()
    for step in (remove_URL, remove_html, remove_atsymbol, remove_hashtag,
                 remove_exclamation, remove_punc, remove_number, remove_emoji):
        text = step(text)
    return text


for sentence in sentences:
    print(sentence)
    # The former `sentence.replace('[^a-zA-Z ]', '')` was a literal (non-regex)
    # replace and therefore did nothing; the input is now cleaned exactly as
    # the training text was.
    cleaned = _clean_for_inference(sentence)
    print(cleaned)
    encoded = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(encoded, maxlen=280, truncating='pre')
    # Predict once and reuse the result for both the raw output and its maximum.
    res = model.predict(padded)
    proba = np.max(res)
    print(f"{res} : {proba}\n")