In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import keras
from keras.metrics import categorical_accuracy
from keras import layers
import tensorflow_datasets as tfds

from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, balanced_accuracy_score

import datetime
import os

from preprocess_functions import preprocess_dataframe
import nltk
from tqdm.notebook import tqdm
tqdm.pandas()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\iason\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\iason\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\iason\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\iason\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already

In [None]:
# Notebook-wide rendering limits: show up to 100 rows per frame and up to
# 300 characters per cell before pandas truncates the output.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 300)

In [None]:
# df = preprocess_dataframe('../data/train_gr/train.csv')

In [None]:
# Replace every non-alphabetic character (anything outside a-z / A-Z) with a space, then collapse whitespace runs
# df['clean'] = (df.clean.str.replace(r'[^a-zA-Z]', ' ', regex=True)
#                 .str.replace(r'\s+', ' ', regex=True))

In [None]:
# df['tokens'] = df.tokens.apply(eval)

In [None]:
# pos_tagged = df.tokens.apply(nltk.pos_tag)

In [None]:
# keep only adjectives, nouns and adverbs
# pos = pos_tagged.apply(lambda tags: [tag[0] for tag in tags if tag[1].startswith(('JJ', 'NN', 'RB'))])


In [None]:
# df['tokens'] = pos

In [None]:
# df['from_tokens'] = df.tokens + df.emoticon

In [None]:
# df['from_tokens'] = df.from_tokens.apply(lambda x: ' '.join(x))

In [None]:
# df['from_tokens'] = (df.from_tokens.str.replace(r'[^a-zA-Z]', ' ', regex=True)
#                 .str.replace(r'\s+', ' ', regex=True))

In [None]:
# df.to_parquet('../data/train_pos.parquet')

In [None]:
# Load the cached training frame; the commented-out preprocessing cells in
# this notebook write to this same path.
df = pd.read_parquet(path='../data/train_pos.parquet')

In [None]:
# Build the text-vectorization layer (vocabulary capped at 20000 tokens) and
# learn its vocabulary from the `from_tokens` column.
# NOTE(review): the vocabulary is adapted on the whole frame, i.e. before the
# empty-string rows are dropped and before the train/test split further down,
# so held-out text also contributes to the vocabulary — confirm this is intended.
text_column = df["from_tokens"]
encoder = tf.keras.layers.TextVectorization(max_tokens=20000)
encoder.adapt(text_column.values)

In [None]:
# Inspect the vocabulary size reported by the adapted encoder (cell output).
encoder.vocabulary_size()

In [None]:
# Discard rows whose `from_tokens` text is exactly the empty string.
is_blank = df["from_tokens"] == ''
df = df.loc[~is_blank]

In [None]:
# Show (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Model inputs come from `from_tokens`; the target is `user_suggestion`.
X, y = (df[column].values for column in ("from_tokens", "user_suggestion"))

In [None]:

# Hold out 10% of the rows, stratified on the target, with a fixed seed so
# the split is reproducible.
split_options = dict(test_size=0.10, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Architecture: raw text -> encoder (token ids) -> 64-d embedding ->
# bidirectional LSTM (sequence output) -> bidirectional GRU -> three ReLU
# dense layers with dropout -> single sigmoid unit.
keras_layers = tf.keras.layers

embedding = keras_layers.Embedding(
    input_dim=encoder.vocabulary_size(),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True,
)

recurrent_stack = [
    keras_layers.Bidirectional(keras_layers.LSTM(64, return_sequences=True)),
    keras_layers.Dropout(0.5),
    keras_layers.Bidirectional(keras_layers.GRU(64)),
    keras_layers.Dropout(0.4),
]

# (units, dropout rate) for each hidden dense layer, in order.
dense_head = []
for units, drop_rate in ((128, 0.5), (64, 0.5), (64, 0.2)):
    dense_head.append(keras_layers.Dense(units, activation='relu'))
    dense_head.append(keras_layers.Dropout(drop_rate))

output_layer = keras_layers.Dense(1, activation='sigmoid')

model = tf.keras.Sequential([encoder, embedding, *recurrent_stack, *dense_head, output_layer])

# The output layer already applies a sigmoid, so the loss receives
# probabilities rather than logits.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)

In [None]:
# Stop training when validation accuracy has not improved by at least 0.005
# for 5 consecutive epochs. This previously monitored the *training* metric
# ('accuracy'), which keeps rising while a model overfits and ignores the
# validation data passed to `fit`; it now tracks the same validation metric
# as the learning-rate schedule below.
callback_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    min_delta=0.005,
)

# Multiply the learning rate by 0.8 whenever validation accuracy has not
# improved by at least 0.01 for 2 consecutive epochs.
learning_drop = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_accuracy",
    factor=0.80,
    patience=2,
    verbose=0,
    mode="auto",
    min_delta=0.01,
    cooldown=0,
    min_lr=0,
)

# model_dir = "models/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5'

# checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
#     model_dir, monitor='val_loss', verbose=0, save_best_only=True,
#     save_weights_only=False, mode='auto', save_freq='epoch',
#     options=None
# )


In [None]:
# weights = dict(1/df.outcome.value_counts())
# NOTE(review): with epochs=2, neither callback's patience (2 for the LR
# schedule, 5 for early stopping) can elapse, so they have no effect at this
# setting. The held-out test split is also used as validation data here and
# for the final evaluation — confirm that is acceptable.
fit_options = dict(
    validation_data=(X_test, y_test),
    epochs=2,
    callbacks=[learning_drop, callback_stop],
)
model.fit(X_train, y_train, **fit_options)

In [None]:
# Threshold the model's sigmoid outputs at 0.5, then let np.squeeze drop any
# length-1 axes from the resulting boolean array.
probabilities = model.predict(X_test)
y_pred = np.squeeze(probabilities > 0.5)

In [None]:
# Plot the confusion matrix of held-out labels against thresholded predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)