In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
# from keras.layers.normalization import BatchNormalization
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# English stop-word list from NLTK's stopwords corpus.
# NOTE(review): stop_words is not referenced by any of the visible cells — confirm it is still needed.
stop_words = stopwords.words('english')
import tensorflow as tf
# Show which TensorFlow version the kernel is running.
print(tf.__version__)

In [None]:
import nltk
# Fetch the 'omw-1.4' resource through NLTK's downloader.
# NOTE(review): word_tokenize and WordNetLemmatizer are used further down; presumably
# their data (punkt / wordnet) is already installed in this environment — confirm.
nltk.download('omw-1.4')

In [None]:
# Read the training and development splits from the dataset's CSV files.
train, dev = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/dep-nlp/train.csv', '../input/dep-nlp/dev.csv')
)

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Preview the first rows of the development split.
dev.head()

In [None]:
# Discard the original 'id' column from both splits; a fresh id is added after merging.
train = train.drop(columns=['id'])
dev = dev.drop(columns=['id'])

In [None]:
# Merge the train and dev splits into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the rows
# keep the index labels of their source frames, so labels repeat and any
# label-based lookup or alignment on df becomes ambiguous.
frames = [train, dev]
df = pd.concat(frames, ignore_index=True)

In [None]:
# Preview the first rows of the merged frame.
df.head()

In [None]:
# Add a running row number as the first column, then preview the result.
df.insert(loc=0, column='id', value=range(len(df)))
df.head()

In [None]:
# Show the distinct values of the label column.
df.label.unique()

In [None]:
# Give every distinct label an integer id, numbered in the order unique() returns them.
possible_labels = df.label.unique()
label_dict = {label: position for position, label in enumerate(possible_labels)}
print(label_dict)
# df['label'] = df.label.replace(label_dict)
# dev_clean['label'] = dev_clean.label.replace(label_dict)


In [None]:
# Summarise the merged frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Normalise case: replace the text column with its lower-cased version.
df['text']=df['text'].str.lower()

In [None]:
# lemmatization
def getLemmText(text):
    """Lemmatize each token of ``text`` and return them joined by single spaces.

    The text is split with ``word_tokenize`` and every token is passed through
    ``WordNetLemmatizer.lemmatize`` using its default arguments.
    """
    pieces = word_tokenize(text)
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(piece) for piece in pieces)

# Replace every document in the text column with its lemmatized form.
df['text'] = [getLemmText(document) for document in df['text']]

In [None]:
# stemming
def getStemmText(text):
    """Stem each token of ``text`` and return them joined by single spaces.

    The text is split with ``word_tokenize`` and every token is reduced with
    ``PorterStemmer.stem``.
    """
    pieces = word_tokenize(text)
    porter = PorterStemmer()
    return ' '.join(porter.stem(piece) for piece in pieces)

# Replace every document in the text column with its stemmed form.
df['text'] = [getStemmText(document) for document in df['text']]

In [None]:
# Hold out 33% of the rows for evaluation; the seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    df['text'],
    df['label'],
    random_state=53,
    test_size=0.33,
)

# Report both split sizes, then show the training labels.
print(xtrain.shape, xtest.shape, sep='\n')
print(ytrain)

In [None]:
# Hyper-parameters for the tokenizer, padding and the first model.
# Width of the embedding vectors; also reused as the LSTM/Dense unit count in the
# first model. Reassigned to 768 later, in the stacked-embedding model cell.
EMBEDDING_DIMENSION = 64
# Passed to Tokenizer as num_words.
VOCABULARY_SIZE = 2000
# Passed to pad_sequences as maxlen.
MAX_LENGTH = 100
# Passed to Tokenizer as oov_token.
OOV_TOK = '<OOV>'
# Passed to pad_sequences as truncating / padding respectively.
TRUNCATE_TYPE = 'post'
PADDING_TYPE = 'post'

In [None]:
# Build the word index from the TRAINING texts only. Fitting on xtest as well
# would leak held-out vocabulary into preprocessing; words that only occur in
# the test split are mapped to OOV_TOK when the texts are converted to sequences.
tokenizer = Tokenizer(num_words=VOCABULARY_SIZE, oov_token=OOV_TOK)
tokenizer.fit_on_texts(list(xtrain))

In [None]:
# Turn both splits into lists of token ids using the fitted tokenizer.
xtrain_sequences, xtest_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (xtrain, xtest)
)
word_index = tokenizer.word_index
print('Vocabulary size:', len(word_index))
# Display the first ten word -> id entries of the index.
dict(list(word_index.items())[:10])

In [None]:
# Show the token-id sequence of one training example (position 100).
print(xtrain_sequences[100])

In [None]:
# Pad/truncate every sequence to MAX_LENGTH using the configured modes.
xtrain_pad, xtest_pad = (
    sequence.pad_sequences(
        token_ids,
        maxlen=MAX_LENGTH,
        padding=PADDING_TYPE,
        truncating=TRUNCATE_TYPE,
    )
    for token_ids in (xtrain_sequences, xtest_sequences)
)
# Compare the raw and padded length of the first example, then show a padded row.
print(len(xtrain_sequences[0]))
print(len(xtrain_pad[0]))
print(xtrain_pad[100])

In [None]:
# Encode the string labels as integer class ids with the label_dict built
# earlier in the notebook (ids start at 0).
# A Keras Tokenizer is not suitable for this: its indices start at 1 (0 is
# reserved), which falls outside the 0..2 range that Dense(3) with
# sparse_categorical_crossentropy expects, and a label made of several words
# would be split into several tokens, giving rows of unequal length.
training_label_seq = ytrain.map(label_dict).to_numpy()
test_label_seq = ytest.map(label_dict).to_numpy()
# Show the first three encoded labels and the overall shape.
print(training_label_seq[0])
print(training_label_seq[1])
print(training_label_seq[2])
print(training_label_seq.shape)

In [None]:
# Invert the tokenizer's word -> id mapping so ids can be turned back into words.
reverse_word_index = {token_id: word for word, token_id in word_index.items()}
def decode_article(text):
    """Turn an iterable of token ids back into a space-separated string.

    Ids that have no entry in ``reverse_word_index`` are rendered as '?'.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)
# Decode one padded training row (position 11) back to words as a sanity check.
print(decode_article(xtrain_pad[11]))

In [None]:
# Baseline network: trainable embedding -> bidirectional LSTM -> two dense
# layers with dropout -> 3-way softmax.
model = Sequential([
    Embedding(len(word_index) + 1, EMBEDDING_DIMENSION),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(EMBEDDING_DIMENSION, dropout=0.3, recurrent_dropout=0.3)),
    Dense(EMBEDDING_DIMENSION, activation='relu'),
    Dropout(0.8),
    Dense(EMBEDDING_DIMENSION, activation='relu'),
    Dropout(0.8),
    Dense(3),
    Activation('softmax'),
])
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# training_label_seq

In [None]:
# Display the container type of the padded training sequences.
type(xtrain_pad)
# Disabled experiment: casting the inputs and labels to float32 arrays.
# xtrain_pad = np.asarray(xtrain_pad).astype(np.float32)
# xtest_pad = np.asarray(xtest_pad).astype(np.float32)
# training_label_seq = np.asarray(training_label_seq).astype(np.float32)
# test_label_seq = np.asarray(test_label_seq).astype(np.float32)

In [None]:
# Rebuild both label containers as an array of per-row arrays.
training_label_seq = np.array(list(map(np.array, training_label_seq)))
test_label_seq = np.array(list(map(np.array, test_label_seq)))

In [None]:
# Convert the padded inputs and the encoded labels to int64 tensors.
xtrain_pad, xtest_pad, training_label_seq, test_label_seq = (
    tf.convert_to_tensor(values, dtype=tf.int64)
    for values in (xtrain_pad, xtest_pad, training_label_seq, test_label_seq)
)

In [None]:
# training_label_seq = np.asarray(training_label_seq).astype(np.float32)
# test_label_seq = np.asarray(test_label_seq).astype(np.float32)

In [None]:
# Train the baseline model, validating on the held-out split after each epoch.
num_epochs = 10
history = model.fit(
    xtrain_pad,
    training_label_seq,
    validation_data=(xtest_pad, test_label_seq),
    epochs=num_epochs,
    verbose=2,
)

In [None]:
# Load the precomputed features and their labels from the working directory,
# then hold out 30% of the rows with a fixed seed.
embeddings, labels = np.load("stacked.npy"), np.load("labels.npy")
x_train, x_test, y_train, y_test = train_test_split(
    embeddings,
    labels,
    random_state=42,
    test_size=0.3,
)

In [None]:
# proposed BiLSTM model: bidirectional LSTM -> dropout -> two dense layers with
# dropout -> 3-way softmax.
# NOTE: this reassigns the EMBEDDING_DIMENSION constant defined earlier (64 -> 768).
EMBEDDING_DIMENSION = 768

model1 = Sequential()
model1.add(Bidirectional(LSTM(EMBEDDING_DIMENSION)))
# The LSTM above does not return sequences, so its output is 2-D
# (batch, features). SpatialDropout1D only accepts 3-D input and would raise
# when the model is built; plain Dropout at the same rate is the 2-D equivalent.
model1.add(Dropout(0.7))
model1.add(Dense(64, activation='relu'))
model1.add(Dropout(0.4))
model1.add(Dense(16, activation='relu'))
model1.add(Dropout(0.4))
model1.add(Dense(3))
model1.add(Activation('softmax'))
model1.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [None]:
# Train the stacked-embedding BiLSTM (model1).
# The fit call must target model1 — the model that is evaluated afterwards —
# rather than the earlier baseline `model`, and the early-stopping callback has
# to be passed to fit() to have any effect.
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=3)
history = model1.fit(x_train,
                     y_train,
                     epochs=num_epochs,
                     validation_data=(x_test, y_test),
                     callbacks=[early_stop],
                     verbose=2)

In [None]:
# predict() returns one probability per class for every sample; the sklearn
# metrics need hard class labels, so take the most probable class per row.
pred = model1.predict(x_test)
pred_labels = np.argmax(pred, axis=1)
print("Accuracy for stacked embeddings (without balancing)")
# Model Accuracy: how often is the classifier correct?
# sklearn metrics take (y_true, y_pred) in that order; the order matters for the
# per-class precision/recall in the classification report.
print("Accuracy:", metrics.accuracy_score(y_test, pred_labels))
print(classification_report(y_test, pred_labels))