In [58]:
import re
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from collections import Counter

In [59]:
def remove_URL(text):
    """Return ``text`` with web links removed.

    A link is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character; each match is replaced
    by the empty string, so surrounding spaces are left as they were.
    """
    return re.sub(r"https?://\S+|www\.\S+", r"", text)

def remove_html(text):
    """Return ``text`` with every ``<...>`` tag deleted.

    The match is non-greedy, so each tag is removed individually and the
    text between tags is kept. A lone ``<`` with no closing ``>`` on the
    same line is left untouched.
    """
    return re.sub(r"<.*?>", r"", text)

def remove_emoji(string):
    """Return ``string`` with every run of characters in the ranges below removed.

    Note that the parameter name shadows the stdlib ``string`` module inside
    this function; the module is not used here.

    NOTE(review): the final range runs from U+24C2 all the way to U+1F251,
    so it also matches everything in between (CJK ideographs, for example)
    and makes the dingbat and flag ranges redundant. Confirm this is intended.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    return re.sub("[" + code_point_ranges + "]+", r"", string, flags=re.UNICODE)

def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping an ordinal to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# English stop words from NLTK, stored as a set; lowercase_remove_stopwords()
# tests each lower-cased token for membership in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# available locally — confirm it is downloaded before running.
stop = set(stopwords.words("english"))

def lowercase_remove_stopwords(text):
    """Lower-case ``text`` and drop every token found in the module-level ``stop`` set.

    Tokens are split on whitespace and re-joined with single spaces, so runs
    of whitespace in the input collapse to one space in the result.
    """
    lowered_tokens = (token.lower() for token in text.split())
    return " ".join(token for token in lowered_tokens if token not in stop)

In [60]:
def counter_word(text):
    """Count whitespace-separated tokens across all entries of ``text``.

    ``text`` must expose ``.values`` yielding strings (the notebook passes a
    pandas Series). Returns a ``collections.Counter`` mapping token -> count.
    """
    tally = Counter()
    for sentence in text.values:
        tally.update(sentence.split())
    return tally

In [62]:
# Read each line of the file as one string (sep='|' keeps the whole line in a
# single column), then split on the first three commas only, giving at most
# four fields per row — presumably because the tweet text itself may contain
# commas (TODO confirm against the file).
txt_data = pd.read_csv('Sentiment Analysis Dataset.csv', sep='|', names=['col1'])
# `n` must be passed by keyword: pandas 2.x rejects it as a positional argument.
txt_data = txt_data.col1.str.split(',', n=3, expand=True)

# The first row holds the header; promote it to column names, drop it from the
# data and renumber the remaining rows from 0.
txt_data.columns = list(txt_data.iloc[0])
txt_data = txt_data.drop(0).reset_index(drop=True)

# Keep only the tweet text and its sentiment label under the names the rest of
# the notebook uses.
dataset = txt_data[['SentimentText', 'Sentiment']].rename(
    columns={'SentimentText': 'text', 'Sentiment': 'target'}
)

In [83]:
# Wrap `dataset` in the preprocessing helper and clean its text column.
# preprocess_data() rewrites the column of the wrapped frame in place.
text = NLP_preprocess(dataset)
text.preprocess_data()

# Tokenizing and padding are done in the following cell; this cell ends on the
# cleaned frame so that it is what gets displayed (a bare expression in the
# middle of a cell shows nothing).
text.df

Unnamed: 0,text,target
0,sad apl friend,0
1,missed new moon trailer,0
2,omg already 730,1
3,omgaga im sooo im gunna cry ive dentist since ...,0
4,think mi bf cheating tt,0
...,...,...
1577833,zzzzzz finally night tweeters,1
1577834,zzzzzzz sleep well people,1
1577835,zzzzzzzzzz wait homework,0
1577836,zzzzzzzzzzzzz meh,0


In [84]:
# Create the tokenizer first: tokenize_and_pad() calls methods on
# text.tokenizer, which is None until set_tokenizer() has run.
text.set_tokenizer()
# Fit on the cleaned text, convert it to index sequences and pad them.
df_padded = text.tokenize_and_pad()

In [85]:
# Print the padded sequences returned by tokenize_and_pad().
print(df_padded)

[[    46 250167    167 ...      0      0      0]
 [   158     24    677 ...      0      0      0]
 [   138    110   3338 ...      0      0      0]
 ...
 [ 27573     67    549 ...      0      0      0]
 [ 51290   1902      0 ...      0      0      0]
 [150320     48      0 ...      0      0      0]]


In [82]:
class NLP_preprocess(object):
    """Clean a text column of a DataFrame and turn it into padded index sequences.

    The methods rely on names defined elsewhere in the notebook: the
    ``remove_URL`` / ``remove_html`` / ``remove_emoji`` / ``remove_punct`` /
    ``lowercase_remove_stopwords`` cleaners, ``counter_word``, and the Keras
    ``Tokenizer`` and ``pad_sequences`` helpers. They are looked up when a
    method is called, not when the class is defined.
    """

    def __init__(self, df):
        """Store ``df`` (expected to hold the text column to process)."""
        # Stored by reference, not copied: preprocess_data() therefore also
        # rewrites the caller's DataFrame.
        self.df = df
        # Every sequence is padded or truncated to this many tokens.
        self.max_length = 20
        # Set by set_tokenizer(); None until then.
        self.tokenizer = None

    def preprocess_data(self, field="text"):
        """Apply the cleaning functions to ``self.df[field]``, one after another.

        URLs and HTML tags are removed before punctuation is stripped; the
        column is overwritten with the cleaned text after each step.
        """
        cleaning_steps = (
            remove_URL,
            remove_html,
            remove_emoji,
            remove_punct,
            lowercase_remove_stopwords,
        )
        for clean in cleaning_steps:
            self.df[field] = self.df[field].map(clean)

    def set_tokenizer(self, field="text"):
        """Create a fresh, unfitted tokenizer sized to ``self.df[field]``.

        ``num_words`` is the number of distinct whitespace-separated tokens in
        the column. The vocabulary is taken from this instance's own frame
        rather than from a notebook-level global, so the class works on
        whatever DataFrame it was given.
        """
        vocabulary = counter_word(self.df[field])
        self.tokenizer = Tokenizer(num_words=len(vocabulary))

    def tokenize_and_pad(self, df=None, field="text", fit=True):
        """Convert a text column to integer sequences of length ``max_length``.

        Parameters
        ----------
        df : DataFrame, optional
            Frame to read ``field`` from. Defaults to ``self.df``.
        field : str
            Name of the text column.
        fit : bool
            When True (the default, matching the previous behaviour) the
            tokenizer is fitted on the selected texts before converting them.
            Pass False to reuse an already fitted vocabulary, e.g. for a
            held-out set.

        Returns
        -------
        Whatever ``pad_sequences`` returns for the converted texts, padded and
        truncated at the end ("post") to ``self.max_length``.
        """
        if self.tokenizer is None:
            # Allow calling this method without an explicit set_tokenizer().
            self.set_tokenizer(field)

        # Identity test: `df == None` on a DataFrame compares element-wise and
        # cannot be used as an `if` condition.
        texts = self.df[field] if df is None else df[field]

        if fit:
            self.tokenizer.fit_on_texts(texts)

        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(
            sequences, maxlen=self.max_length, padding="post", truncating="post"
        )

In [75]:
# NOTE(review): `train` is not defined in any cell visible in this notebook —
# confirm where it comes from before relying on Restart & Run All.
text = train.text  # NOTE(review): rebinds `text`, which another cell uses for the NLP_preprocess instance
counter = counter_word(text)
# Vocabulary size: number of distinct whitespace-separated tokens in the text.
num_words = len(counter)

# Max number of words in a sequence
max_length = 20

# Display the full token -> count mapping.
counter

Counter({'sad': 16493,
         'apl': 1,
         'friend': 6454,
         'missed': 6948,
         'new': 22523,
         'moon': 1310,
         'trailer': 651,
         'omg': 5422,
         'already': 9314,
         '730': 188,
         'omgaga': 1,
         'im': 120675,
         'sooo': 3307,
         'gunna': 713,
         'cry': 2654,
         'ive': 15982,
         'dentist': 685,
         'since': 5969,
         '11': 1073,
         'suposed': 16,
         '2': 22593,
         'get': 52597,
         'crown': 85,
         'put': 5186,
         '30mins': 51,
         'think': 28397,
         'mi': 514,
         'bf': 695,
         'cheating': 139,
         'tt': 295,
         'worry': 1854,
         'much': 23013,
         'juuuuuuuuuuuuuuuuussssst': 1,
         'chillin': 1076,
         'sunny': 2329,
         'work': 34132,
         'tomorrow': 17360,
         'handed': 127,
         'uniform': 95,
         'today': 32872,
         'miss': 23236,
         'hmmmm': 271,
      

In [76]:
# Positional 80/20 split: the first 80% of rows form the training set and the
# remaining rows the test set. The rows are taken in their existing order
# (no shuffling is done here).
train_size = int(len(train) * 0.8)

train_sentences, test_sentences = train.text[:train_size], train.text[train_size:]
train_labels, test_labels = train.target[:train_size], train.target[train_size:]

In [77]:
from keras.preprocessing.text import Tokenizer

# Build the vocabulary from the training sentences only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_sentences)

# Cast both label splits to float before they are handed to the model.
train_labels, test_labels = train_labels.astype(float), test_labels.astype(float)

In [78]:
# Word -> integer index mapping held by the fitted tokenizer.
word_index = tokenizer.word_index
# Display the whole mapping.
word_index

{'im': 1,
 'good': 2,
 'get': 3,
 'like': 4,
 'u': 5,
 'dont': 6,
 'lol': 7,
 'day': 8,
 'love': 9,
 'know': 10,
 'go': 11,
 'cant': 12,
 'going': 13,
 'thanks': 14,
 'got': 15,
 'one': 16,
 'time': 17,
 'work': 18,
 'see': 19,
 'back': 20,
 'well': 21,
 'today': 22,
 'really': 23,
 'think': 24,
 'thats': 25,
 'hope': 26,
 'haha': 27,
 'oh': 28,
 'amp': 29,
 'great': 30,
 'still': 31,
 'ill': 32,
 'much': 33,
 'want': 34,
 '2': 35,
 'night': 36,
 'sorry': 37,
 'new': 38,
 'miss': 39,
 'would': 40,
 'didnt': 41,
 'need': 42,
 'home': 43,
 'yeah': 44,
 'morning': 45,
 'fun': 46,
 'youre': 47,
 'last': 48,
 'twitter': 49,
 'come': 50,
 'right': 51,
 'happy': 52,
 'though': 53,
 'hey': 54,
 'feel': 55,
 'make': 56,
 'wish': 57,
 'bad': 58,
 'tomorrow': 59,
 'nice': 60,
 'better': 61,
 'yes': 62,
 'wait': 63,
 'sad': 64,
 'ive': 65,
 'thank': 66,
 'could': 67,
 'getting': 68,
 'way': 69,
 'gonna': 70,
 'awesome': 71,
 'sleep': 72,
 'even': 73,
 'tonight': 74,
 'soon': 75,
 'sure': 76,
 'say

In [79]:
# Convert every training sentence into a sequence of word indices using the
# tokenizer fitted above.
train_sequences = tokenizer.texts_to_sequences(train_sentences)

In [80]:
from keras.preprocessing.sequence import pad_sequences

# Pad or truncate (both at the end, "post") every sequence to `max_length`
# tokens. Train and test use the same `max_length` constant so the two
# matrices always share one width.
train_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding="post", truncating="post"
)
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(
    test_sequences, maxlen=max_length, padding="post", truncating="post"
)

In [81]:
# Display the padded training sequences.
train_padded

array([[    64, 176202,    174, ...,      0,      0,      0],
       [   147,     38,    869, ...,      0,      0,      0],
       [   184,    106,   3802, ...,      0,      0,      0],
       ...,
       [134396,    489,    166, ...,      0,      0,      0],
       [557395,   2632,     44, ...,      0,      0,      0],
       [557397, 557398, 557399, ...,      0,      0,      0]])

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

# Binary classifier over padded word-index sequences:
# 8-dimensional embedding -> 16-unit LSTM -> single sigmoid output.
model = Sequential(
    [
        Embedding(num_words, 8, input_length=max_length),
        LSTM(16, dropout=0.1),
        Dense(1, activation="sigmoid"),
    ]
)

optimizer = Adam(learning_rate=3e-4)

model.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train for 20 epochs on the padded training sequences. The test split is
# passed as validation data, so the validation metrics recorded in `history`
# are measured on that same split.
history = model.fit(
    train_padded, train_labels, epochs=20, validation_data=(test_padded, test_labels),
)

In [None]:
# Persist the trained model.
# NOTE(review): this is a hard-coded, user-specific absolute Windows path, so
# the cell will not work on another machine as written — consider a path
# relative to the notebook or a configurable output directory.
model.save('C:/Users/Konst/Desktop/PythonNotebooks')