In [36]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras import layers
import time

In [2]:
df=pd.read_csv('data/sample.csv')

In [4]:
def data_cleaning(text_list): 
    stopwords_rem=False
    stopwords_en=stopwords.words('english')
    lemmatizer=WordNetLemmatizer()
    tokenizer=TweetTokenizer()
    reconstructed_list=[]
    for each_text in text_list: 
        lemmatized_tokens=[]
        tokens=tokenizer.tokenize(each_text.lower())
        pos_tags=pos_tag(tokens)
        for each_token, tag in pos_tags: 
            if tag.startswith('NN'): 
                pos='n'
            elif tag.startswith('VB'): 
                pos='v'
            else: 
                pos='a'
            lemmatized_token=lemmatizer.lemmatize(each_token, pos)
            if stopwords_rem: # False 
                if lemmatized_token not in stopwords_en: 
                    lemmatized_tokens.append(lemmatized_token)
            else: 
                lemmatized_tokens.append(lemmatized_token)
        reconstructed_list.append(' '.join(lemmatized_tokens))
    return reconstructed_list

In [33]:
demo_text=['Today is a good day', 'Today is a bad day', 'Today is ok']
demo_text=data_cleaning(demo_text)
tokenizer=Tokenizer()
tokenizer.fit_on_texts(demo_text)
display(tokenizer.word_index)
demo_ary=pad_sequences(tokenizer.texts_to_sequences(demo_text))
# word_index=tokenizer.word_index
# pad_sequences(demo_ary, maxlen=5)
demo_ary

{'today': 1, 'be': 2, 'a': 3, 'day': 4, 'good': 5, 'bad': 6, 'ok': 7}

array([[1, 2, 3, 5, 4],
       [1, 2, 3, 6, 4],
       [0, 0, 1, 2, 7]], dtype=int32)

In [51]:
X=df['text']#.sample(200)
y=df['label']#.sample(200)

X=data_cleaning(X)
X_train, X_test, y_train, y_test=train_test_split(X, y, train_size=.9)

tokenizer=Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size=len(tokenizer.word_index)+1
print(f'Vocab Size: {vocab_size}')
X_train=pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=40)
X_test=pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=40)
y_train=to_categorical(y_train)
y_test=to_categorical(y_test)

Vocab Size: 1009


In [53]:
current_time=time.time()
model=Sequential()
model.add(layers.Embedding(input_dim=vocab_size,\
                           output_dim=100,\
                           input_length=40))
model.add(layers.Bidirectional(layers.LSTM(128)))
model.add(layers.Dense(2,activation='softmax'))
model.compile(optimizer='adam',\
              loss='categorical_crossentropy',\
              metrics=['accuracy'])

model.fit(X_train,\
          y_train,\
          batch_size=256,\
          epochs=5,\
          validation_data=(X_test,y_test))
print(f'Time to train: {time.time()-current_time}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Time to train: 20.981783866882324


In [32]:
demo_df=pd.DataFrame(demo_ary)
demo_df['text']=demo_text
demo_df

Unnamed: 0,0,1,2,3,4,text
0,1,2,3,5,4,today be a good day
1,1,2,3,6,4,today be a bad day
2,0,0,1,2,7,today be ok


In [15]:
pad_sequences(tokenizer.texts_to_sequences(['Today is a good day']))

array([[1, 3, 5, 4]], dtype=int32)

In [None]:
y_train=to_categorical(y_train)
y_test=to_categorical(y_test)