In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter

In [None]:
import tensorflow as tf

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional, SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.utils.np_utils import to_categorical
print(tf.__version__)

In [None]:
# Base folder on the mounted Drive; later cells append relative file paths to this string.
path = "/content/drive/MyDrive/dep-bin/"

In [None]:
# Read the original dataset; the next cell looks at its `target` distribution.
data = pd.read_csv(path + 'data/final.csv')
data.head(n=5)

In [None]:
# Count how many rows carry each value of the `target` column.
data['target'].value_counts()

In [None]:
# Read the preprocessed dataset used for the text models below.
df = pd.read_csv(path + 'clean-data/data-preprocess.csv')
df.head(n=5)

In [None]:
# Keep only the text and label columns.
df = df.loc[:, ['text', 'label']]
df.head(n=5)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.utils import pad_sequences
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, SpatialDropout1D, Bidirectional
%matplotlib inline

In [None]:
# 70/30 split with a fixed seed; the text column is cast to str before splitting.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'].astype(str),
    df['label'],
    test_size=0.3,
    random_state=42,
)


# Using only Dense layer

In [None]:
# Text pre-processing settings shared by the tokenizer and padding cells
vocab_size = 5000        # passed to the tokenizer as num_words
oov_tok = '<OOV>'        # placeholder token for out-of-vocabulary words
max_len = 250            # every sequence is padded/truncated to this length
padding_type = 'post'    # pad at the end of a sequence
trunc_type = 'post'      # truncate from the end of a sequence

In [None]:
# Word-level tokenizer capped at `vocab_size` words, with `oov_tok` standing in
# for unseen words. It is fitted on the training texts only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok, char_level=False)
tokenizer.fit_on_texts(x_train)

In [None]:
# Training texts -> integer sequences -> fixed-length matrix of width max_len.
training_sequences = tokenizer.texts_to_sequences(x_train)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# Test texts go through the tokenizer fitted on the training texts, then the same padding.
testing_sequences = tokenizer.texts_to_sequences(x_test)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)


In [None]:
# Report the dimensions of the two padded matrices.
print('Shape of training tensor: ', np.shape(training_padded))
print('Shape of testing tensor: ', np.shape(testing_padded))

In [None]:
# Hyper-parameters for the dense baseline
vocab_size = 5000
embedding_dim = 16
drop_value = 0.2
n_dense = 24  # NOTE(review): not referenced by the layers below, which use 200 and 50 units

# Embedding -> average over the sequence axis -> two ReLU layers with dropout -> sigmoid output
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(200, activation='relu'),
    Dropout(drop_value),
    Dense(50, activation='relu'),
    Dropout(drop_value),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
num_epochs = 30
# Stop training once val_loss has gone 3 epochs without improving.
early_stop = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(training_padded,
                    y_train,
                    epochs=num_epochs,
                    validation_data=(testing_padded, y_test),
                    # The callback has no effect unless it is handed to fit().
                    callbacks=[early_stop],
                    verbose=2)

In [None]:
# Score the dense model on both splits; index 1 of each result is the accuracy metric.
train_dense_results = model.evaluate(
    training_padded, np.asarray(y_train), batch_size=256, verbose=2
)
valid_dense_results = model.evaluate(
    testing_padded, np.asarray(y_test), batch_size=256, verbose=2
)
print(f'Train accuracy: {train_dense_results[1]*100:0.2f}')
print(f'Valid accuracy: {valid_dense_results[1]*100:0.2f}')

# LSTM layer

In [None]:
# Hyper-parameters for the LSTM model
n_lstm = 128  # NOTE(review): not referenced below; the LSTM layer is created with 200 units
drop_lstm = 0.2

# Embedding -> spatial dropout -> single LSTM (last state only) -> dropout -> sigmoid output
model1 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    SpatialDropout1D(drop_lstm),
    LSTM(200, return_sequences=False),
    Dropout(drop_lstm),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy loss, Adam optimizer, accuracy metric.
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
num_epochs = 30
# Stop training once val_loss has gone 2 epochs without improving.
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model1.fit(training_padded,
                     y_train,
                     epochs=num_epochs,
                     validation_data=(testing_padded, y_test),
                     # The callback has no effect unless it is handed to fit().
                     callbacks=[early_stop],
                     verbose=2)

# LSTM without an embedding layer (precomputed embeddings loaded from mental-bert-emb.npy)

In [None]:
# Load the precomputed embedding array and its matching label array from .npy files.
# NOTE(review): the reshape cell further down assumes 768 values per sample — confirm against the saved array.
emb = np.load(path+'emb/mental-bert-emb.npy')
label = np.load(path+'emb/mental-bert-label.npy')

In [None]:
# 75/25 split of the embedding arrays with a fixed seed.
# This rebinds y_train / y_test, which previously held the labels of the text split.
X_train, X_test, y_train, y_test = train_test_split(
    emb,
    label,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Sizes for the models that consume precomputed embeddings
embedding_dim = 768
vocab_size = 5000

In [None]:
# LSTM classifier fed directly with precomputed embedding vectors (no Embedding layer).
model = Sequential()

model.add(LSTM(embedding_dim))
model.add(Dense(1, activation='sigmoid'))

# A Sequential model with no declared input shape has no weights yet, and
# summary() raises on an unbuilt model. Build it for the (1, embedding_dim)
# sequences that the reshape cell produces before asking for the summary.
model.build(input_shape=(None, 1, embedding_dim))
model.summary()

In [None]:
# `lr` is a deprecated alias of `learning_rate`, and current Keras optimizers
# reject the legacy `decay` argument, so only the learning rate is set here.
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(
    # `model` ends in a single sigmoid unit, so the matching loss is binary
    # cross-entropy; sparse categorical cross-entropy needs one output per class.
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [None]:
# NOTE(review): the fit() calls further down pass epochs=30 directly, so this value
# is only referenced by the commented-out call below.
num_epochs = 10
# history = model.fit(train_emb, train_label, epochs=num_epochs, validation_data=(dev_emb, dev_label), verbose=1)


In [None]:
# One-hot encode the labels into two columns.
# Run this cell once per split: it rebinds y_train / y_test in place.
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

In [None]:
# Inspect the shape of the training label array.
y_train.shape

In [None]:
# Insert a length-1 middle axis so every sample becomes a one-step sequence of 768 values.
# Change 768 if the embedding width differs.
X_train = X_train.reshape(-1, 1, 768)
X_test = X_test.reshape(-1, 1, 768)

In [None]:
# Inspect the shape of the training feature array.
X_train.shape

In [None]:
# LSTM classifier with a two-unit output, trained with categorical cross-entropy.
model2 = Sequential()
model2.add(LSTM(768, dropout=0.7, recurrent_dropout=0.7))
# The two classes are mutually exclusive and the loss is categorical
# cross-entropy, so the output layer uses softmax (scores sum to 1)
# instead of two independent sigmoids.
model2.add(Dense(2, activation='softmax'))
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Summarise model2, the model defined in the cell above. It has no declared
# input shape, so it is built from the training tensor's per-sample shape first.
# summary() prints the table itself and returns None, so it is not wrapped in print().
model2.build(input_shape=(None,) + tuple(X_train.shape[1:]))
model2.summary()

In [None]:
# `model` ends in a single sigmoid unit, i.e. one value per sample, so it is
# trained on integer class ids recovered from the one-hot label matrices
# rather than on the two-column matrices themselves.
history = model.fit(
    X_train,
    np.argmax(y_train, axis=-1),
    epochs=30,
    batch_size=100,
    validation_data=(X_test, np.argmax(y_test, axis=-1)),
)

In [None]:
# Train model2 for 30 epochs in batches of 100, validating on the held-out split.
history = model2.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=100,
    epochs=30,
)

# BiLSTM

In [None]:
# Bidirectional LSTM classifier with a two-unit output, trained with categorical cross-entropy.
# Note: this rebinds `model1`, which earlier held the Embedding + LSTM text model.
model1 = Sequential()
model1.add(Bidirectional(LSTM(768, dropout=0.7, recurrent_dropout=0.7)))
# The two classes are mutually exclusive and the loss is categorical
# cross-entropy, so the output layer uses softmax (scores sum to 1)
# instead of two independent sigmoids.
model1.add(Dense(2, activation='softmax'))
model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Train the BiLSTM for 30 epochs in batches of 100, validating on the held-out split.
history = model1.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=100,
    epochs=30,
)

In [None]:
# Per-class output scores of the BiLSTM model for the held-out split.
pred = model1.predict(X_test)

In [None]:
# Display the raw model outputs.
pred

In [None]:
# Turn each row of scores into a one-hot row marking its highest-scoring class.
idx = pred.argmax(axis=-1)
# Compare every column index with the row's winning index, then cast True/False to 1.0/0.0.
pred = (np.arange(pred.shape[-1]) == idx[:, None]).astype(float)

In [None]:
# Display the one-hot predictions.
pred

In [None]:
print("Accuracy for stacked embeddings (without balancing)")
# Model Accuracy: how often is the classifier correct?
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is symmetric,
# but the per-class precision and recall in classification_report are swapped
# when the arguments are reversed, so the ground truth goes first.
print("Accuracy:", metrics.accuracy_score(y_test, pred))
print(classification_report(y_test, pred))