In [0]:
import pandas as pd
import re

import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

#remove comment on first execution
nltk.download('stopwords') 
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
stop_words = stopwords.words('english')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [0]:

def pattern_replacement(pattern, tweet, replacement):
    r = re.findall('"' + pattern + '[\w]*"', tweet)
    for i in r:
        tweet = re.sub(i, replacement, tweet)
    return tweet;

def preprocess_data(tweet):
    tweet = pattern_replacement('@', tweet, '')

    #.. Removing
    tweet = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', tweet, flags=re.MULTILINE)

    #...Removing non-alphabets from the tweet
    r = re.findall("[^A-Za-z]", tweet)
    for i in r:
        tweet = tweet.replace(i, " ")

    
    #...Removing preposiitons, conjuctions and pronouns from the tweet
    words = nltk.word_tokenize(tweet)
    tags = nltk.pos_tag(words)
    '''
    PRP	Personal pronoun
    DT	Determiner
    CC	Coordinating conjunction
    IN	Preposition or subordinating conjunction
    PRP$	Possessive pronoun
    VBP	Verb, non-3rd person singular present
    NNP	Proper noun, singular
    VBZ	Verb, 3rd person singular present
    VB	Verb, base form
    MD	Modal
    RB	Adverb
    VBD	Verb, past tense
    WP	Wh-pronoun
    CD	Cardinal number
    WRB	Wh-adverb
    WDT	Wh-determiner
    '''    
    
    #del_tags = ['PRP','DT','CC', 'IN', 'PRP$', 'VBP', 'NNP', 'VBZ', 'VB', 'MD', 'RB', 'VBD', 'WP', 'CD', 'WRB', 'WDT']
    del_tags = ['PRP','DT','CC', 'IN', 'PRP$', 'VBP', 'MD', 'WP', 'CD', 'WRB', 'WDT']

    new_tags = []
    for ord_pair in tags:
        if ord_pair[1] not in del_tags and len(ord_pair[0]) > 3:
            new_tags.append(ord_pair[0])

    #...Removing Stopwords from the tweet
    new_tags = [w for w in new_tags if not w in stop_words]

    tweet = " ".join(new_tags)
    
    return tweet


In [0]:
import io
import json
from keras.preprocessing.text import tokenizer_from_json
from keras.preprocessing.sequence import pad_sequences

with open('saved_models/tokenizer.json') as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

maxlength=25


Using TensorFlow backend.


In [0]:
from tensorflow.keras.models import load_model

loaded_CNN_model = load_model('saved_models/CNN_best_weights.01-0.8165.hdf5')



In [0]:
import pandas as pd

df=pd.read_csv('saved_models/streamedtweets.csv',index_col=None)

for i in range(len(df)):
    print(df.Tweet[i])
    prepro=[]
    prepro.append(preprocess_data(df.Tweet[i]))
    print(prepro)
    seq = tokenizer.texts_to_sequences(prepro)
    print(seq)
    pad_seq = pad_sequences(seq, maxlen=maxlength,padding='post')
    print(pad_seq)
    pred = loaded_CNN_model.predict(pad_seq)
    print(pred)
    if(pred>0.5):
        print('Informative')
    else:
        print('Non Informative')
    print('--------------------------')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['Fadleen Kennedy hearing proposal Ventress death retconned']
[[9649, 1738, 7863, 50]]
[[9649 1738 7863   50    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]
[[0.22085156]]
Non Informative
--------------------------
Shes sooooooo pretty to me.
['Shes sooooooo pretty']
[[4986, 491]]
[[4986  491    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]
[[0.1112962]]
Non Informative
--------------------------
@Bur7on @koltonfn neither of you killed me. lmao kolton griefed my dead body on some school shooter shit
['koltonfn killed lmao kolton griefed dead body school shooter shit']
[[35, 905, 31, 10, 66, 2571, 84]]
[[  35  905   31   10   66 2571   84    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]
[[0.23566873]]
Non Informative
--------------------------
Death 