In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer #stemmer

import re
import string
from bs4 import BeautifulSoup
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import warnings
warnings.filterwarnings("ignore")  

import sys
import time

from sklearn.feature_extraction.text import CountVectorizer #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer #For TF-IDF
from gensim.models import Word2Vec                          #For Word2Vec

from sklearn.model_selection import train_test_split
from keras.utils import np_utils

###

import itertools
import os

from sklearn.preprocessing import LabelBinarizer, LabelEncoder

from keras.models import Sequential, Model, load_model

from keras.callbacks import ModelCheckpoint, EarlyStopping


from keras.layers import Input, Dense, Activation, Dropout, LSTM, Flatten
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras import utils

Using TensorFlow backend.


In [0]:
def load_model(file_path):
  """Load a saved Keras model from ``file_path``.

  Thin wrapper around ``keras.models.load_model``. Note that this definition
  shadows the ``load_model`` name imported from ``keras.models`` in the
  notebook's import cell.

  Args:
    file_path: Location of the saved model, passed through unchanged.

  Returns:
    The object returned by ``keras.models.load_model(file_path)``.
  """
  return keras.models.load_model(file_path)

def predict_class(input_x, model):
  """Run ``model.predict`` on ``input_x`` and return its output unchanged.

  Despite the name, this returns the raw prediction scores, not class
  indices: the prediction cell of this notebook prints the result's shape
  and applies ``np.argmax`` to it itself. The return value is therefore
  kept as-is; only the previously computed-and-discarded argmax was removed.

  Args:
    input_x: Features in whatever form ``model.predict`` accepts.
    model: Any object exposing a ``predict(input_x)`` method.

  Returns:
    Whatever ``model.predict(input_x)`` returns.
  """
  return model.predict(input_x)

# Case-insensitive pattern used by stripTagsAndUris to find URI-like
# substrings in free text. A match starts at a word boundary with one of:
#   * "http://" or "https://",
#   * "www" (optionally followed by up to three digits) and a dot,
#   * a domain-like token with a 2-4 letter suffix followed by "/".
# The body is one or more runs of non-space, non-bracket characters or
# parenthesised groups (which may themselves contain one nested parenthesised
# group). The match must end either with such a parenthesised group or with a
# character that is not whitespace, common punctuation or a quote, so a
# trailing "." or "," after a link is not swallowed.
uri_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

def stripTagsAndUris(x):
    """Return the plain text of ``x`` with <code> blocks and URIs removed.

    ``x`` is parsed as HTML with BeautifulSoup's ``html.parser``. Every
    ``<code>`` element is dropped together with its content, the remaining
    text is extracted, and every substring matching ``uri_re`` is deleted.

    Args:
        x: HTML or plain-text string. Any falsy value (``""``, ``None``)
            short-circuits to an empty string.

    Returns:
        The cleaned text, or ``""`` when ``x`` is falsy.
    """
    if not x:
        return ""

    soup = BeautifulSoup(x, "html.parser")

    # ``soup.code`` only exposes the FIRST <code> tag, so a single
    # ``decompose()`` would leave later code blocks in the text. Re-query after
    # each removal; this also copes with nested <code> tags, because the outer
    # tag is found (and destroyed, with its children) first.
    code_tag = soup.find("code")
    while code_tag is not None:
        code_tag.decompose()
        code_tag = soup.find("code")

    # Get all the text out of the html, then strip out all URIs.
    text = soup.get_text()
    return re.sub(uri_re, "", text)

def removePunctuation(x):
    """Lowercase ``x`` and replace non-ASCII characters and punctuation with spaces.

    Each offending character becomes exactly one space, so the result has the
    same length as the input and runs of punctuation yield runs of spaces.

    Args:
        x: Input string.

    Returns:
        The lowercased string with every non-ASCII character and every
        character of ``string.punctuation`` replaced by ``" "``.
    """
    # Lowercasing all words
    x = x.lower()
    # Removing non ASCII chars
    x = re.sub(r'[^\x00-\x7f]', r' ', x)
    # string.punctuation contains regex-significant characters (\ ] ^ -).
    # Spliced unescaped into a character class, its "\]" is read as an escaped
    # bracket, so a literal backslash was never replaced. re.escape makes
    # every punctuation character a literal member of the class.
    return re.sub("[" + re.escape(string.punctuation) + "]", " ", x)

# English Snowball stemmer and English stop-word set, created once and shared
# by every call to stemAndRemoveStopwords.
snow = nltk.stem.SnowballStemmer('english')
stops = set(stopwords.words("english"))
def stemAndRemoveStopwords(x):
    """Drop stop words from ``x`` and stem whatever is left.

    ``x`` is split on whitespace; tokens found in ``stops`` are discarded
    (the comparison is exact, so it happens before stemming), the rest are
    passed through ``snow.stem`` and re-joined with single spaces.

    Args:
        x: Whitespace-separated text.

    Returns:
        A single-space-joined string of the stemmed, non-stop-word tokens.
    """
    kept_stems = []
    for token in x.split():
        if token in stops:
            continue
        kept_stems.append(snow.stem(token))
    return " ".join(kept_stems)
    
def remove_pattern(input_txt, pattern):
    """Delete every substring of ``input_txt`` that matches ``pattern``.

    The pattern is applied directly with a single ``re.sub``. The previous
    implementation collected the matches with ``re.findall`` and then used
    each matched *text* as a new regex, which
      * failed to remove matches containing regex metacharacters
        (e.g. a matched ``"$5"`` was re-read as "end of string, then 5"),
      * let a short match eat the prefix of a longer one
        (``"@ab @abc"`` left a stray ``"c"``), and
      * crashed for patterns with more than one capture group, because
        ``findall`` then yields tuples.

    Args:
        input_txt: Text to clean.
        pattern: Regular expression (string or compiled) describing what to
            remove.

    Returns:
        ``input_txt`` with all non-overlapping matches of ``pattern`` removed.
    """
    return re.sub(pattern, '', input_txt)
  
def preprocess(df):
    """Clean the ``content`` column of ``df`` step by step.

    The steps, in order:
      1. remove ``@``-prefixed handles (``@`` followed by word characters),
      2. strip HTML tags, <code> blocks and URIs (``stripTagsAndUris``),
      3. lowercase and blank out punctuation / non-ASCII (``removePunctuation``),
      4. drop stop words and stem (``stemAndRemoveStopwords``).

    The column is overwritten on the frame that was passed in, and that same
    frame is returned, so ``df = preprocess(df)`` and a bare ``preprocess(df)``
    have the same effect on ``df``.

    Args:
        df: DataFrame with a string-valued ``content`` column.

    Returns:
        The same ``df`` object, with ``content`` cleaned.
    """
    # Series.map is used instead of np.vectorize: np.vectorize raises on a
    # size-0 input unless ``otypes`` is given, so an empty frame used to crash.
    # The pattern is a raw string because "\w" is an invalid escape sequence
    # in a normal string literal.
    df["content"] = df["content"].map(lambda text: remove_pattern(text, r"@[\w]*"))
    df["content"] = df["content"].map(stripTagsAndUris)
    df["content"] = df["content"].map(removePunctuation)
    df["content"] = df["content"].map(stemAndRemoveStopwords)
    return df
  
def insert_text(input_text, dataframe):
    """Return a copy of ``dataframe`` with one extra row holding ``input_text``.

    The new row sets only the ``content`` column; any other column of
    ``dataframe`` is left missing (NaN) for that row. The index of the result
    is renumbered from 0. ``dataframe`` itself is not modified.

    Args:
        input_text: Value to store in the new row's ``content`` column.
        dataframe: Frame to extend.

    Returns:
        A new DataFrame consisting of ``dataframe`` followed by the new row.
    """
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent.
    new_row = pd.DataFrame([{'content': input_text}])
    return pd.concat([dataframe, new_row], ignore_index=True)

In [4]:
# Build a small demo frame from a handful of text lines, then print it before
# and after running the preprocessing pipeline.
df = pd.DataFrame(columns=['content', 'sentiment'])
print(df)
sample_lines = (
    "I see trees of green, red roses too",
    "I see them bloom for me and you",
    "And I think to myself what a wonderful world",
    "I see skies of blue and clouds of white",
    "The bright blessed day, the dark sacred night",
    "And I think to myself what a wonderful world",
)
# One row per line, appended in the order listed above.
for line in sample_lines:
    df = insert_text(line, df)
print(df)
print()
df = preprocess(df)
print(df)

Empty DataFrame
Columns: [content, sentiment]
Index: []
                                         content sentiment
0            I see trees of green, red roses too       NaN
1                I see them bloom for me and you       NaN
2   And I think to myself what a wonderful world       NaN
3        I see skies of blue and clouds of white       NaN
4  The bright blessed day, the dark sacred night       NaN
5   And I think to myself what a wonderful world       NaN

                            content sentiment
0           see tree green red rose       NaN
1                         see bloom       NaN
2                think wonder world       NaN
3          see sky blue cloud white       NaN
4  bright bless day dark sacr night       NaN
5                think wonder world       NaN


In [35]:
# Score the preprocessed texts: load the model stored in model-ngram.h5 and the
# vectorizer pickled in tfidf.pickle, vectorize the `content` column and print
# the model's per-row outputs together with the argmax of each row.
MODELS_DIR = "../content/drive/My Drive/Colab Notebooks/models"

model = load_model(MODELS_DIR + "/model-ngram.h5")
print(df['content'][0])
test_text = df['content']

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load vectorizer files from a trusted source.
# The `with` block closes the file handle, which was previously left open.
with open(MODELS_DIR + "/tfidf.pickle", 'rb') as picklefile:
    tfidfvectorizer = pickle.load(picklefile)

# Cast the column values to unicode strings before handing them to the vectorizer.
x_text = tfidfvectorizer.transform(test_text.values.astype('U'))

predictions = predict_class(x_text, model)
print(predictions.shape)
print(test_text, "\n", predictions)
print(np.argmax(predictions, axis=1))

see tree green red rose
(6, 4)
0             see tree green red rose
1                           see bloom
2                  think wonder world
3            see sky blue cloud white
4    bright bless day dark sacr night
5                  think wonder world
Name: content, dtype: object 
 [[0.1112662  0.36074355 0.2946531  0.23333721]
 [0.13024132 0.23749384 0.4181508  0.21411401]
 [0.04955224 0.47362596 0.28467306 0.19214877]
 [0.05905165 0.13261427 0.22041218 0.587922  ]
 [0.01125506 0.6135496  0.13459261 0.24060275]
 [0.04955224 0.47362596 0.28467304 0.19214876]]
[1 2 1 3 1 1]
