In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import keras
from keras.metrics import categorical_accuracy
from keras import layers
import tensorflow_datasets as tfds

from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, balanced_accuracy_score

import datetime
import os

from preprocess_functions import preprocess_dataframe

In [None]:
# Notebook display settings: render up to 100 rows and up to 300 characters
# per column value when pandas shows a frame.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 300)

In [None]:
# Read the training CSV into `df`; later cells use its `clean` and
# `user_suggestion` columns.
train_csv_path = '../data/train_gr/train_clean.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Keep ASCII letters only: every other character is turned into a space, then
# each run of whitespace is collapsed to a single space.
letters_only = df['clean'].str.replace(r'[^a-zA-Z]', ' ', regex=True)
df['clean'] = letters_only.str.replace(r'\s+', ' ', regex=True)

In [None]:
# Text-to-token-id layer whose vocabulary is limited to 10,000 entries and is
# learned from the `clean` column.
# NOTE(review): adapt() runs on the full frame, before the train/test split made
# later in the notebook, so held-out text also contributes to the vocabulary.
encoder = tf.keras.layers.TextVectorization(max_tokens=10_000)
encoder.adapt(df['clean'].values)

In [None]:
# Display how many tokens the adapted encoder holds (bounded by max_tokens=10000);
# the model cell passes the same value to the Embedding layer as input_dim.
encoder.vocabulary_size()

In [None]:
# Model inputs are the cleaned text values; targets are the `user_suggestion` labels.
X, y = df['clean'].values, df['user_suggestion'].values

In [None]:

# Hold out 10% of the rows, stratified on the label, with a fixed seed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    stratify=y,
    random_state=42,
)

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and dropout masks
# are repeatable across "Restart & Run All". The data split is already seeded;
# without this the model itself was not. (GPU kernels may still introduce
# small run-to-run differences.)
tf.keras.utils.set_random_seed(42)

# Binary text classifier:
# raw strings -> token ids -> 64-d embeddings -> bidirectional LSTM ->
# two Dense(64)+Dropout(0.4) stages -> one sigmoid unit.
model = tf.keras.Sequential([
    # Vectorization lives inside the model, so fit/predict take raw strings.
    encoder,
    tf.keras.layers.Embedding(
        input_dim=encoder.vocabulary_size(),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# The last layer already applies a sigmoid, so the loss receives probabilities
# (from_logits=False).
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              metrics=['accuracy'])

In [None]:
# Stop training once validation accuracy has not improved by at least 0.005
# for 5 consecutive epochs. Monitoring `val_accuracy` (rather than the training
# `accuracy`) keeps this callback consistent with the LR schedule below and
# lets it react to overfitting, where training accuracy keeps rising.
callback_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    min_delta=0.005,
)

# Multiply the learning rate by 0.9 whenever validation accuracy has not
# improved by at least 0.01 for 2 consecutive epochs.
learning_drop = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_accuracy",
    factor=0.90,
    patience=2,
    verbose=0,
    mode="auto",
    min_delta=0.01,
    cooldown=0,
    min_lr=0,
)


In [None]:
# Train for at most 10 epochs with the LR-reduction and early-stopping callbacks.
# NOTE(review): validation_data is the same held-out split that is scored in the
# evaluation cells, and the LR schedule monitors val_accuracy, so the final
# metrics are not computed on data the training loop never saw.
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_test, y_test),
    callbacks=[learning_drop, callback_stop],
)

In [None]:
# Threshold the predicted probabilities at 0.5 and drop the trailing unit axis
# so the predictions line up with y_test.
predicted_proba = model.predict(X_test)
y_pred = np.squeeze(predicted_proba > 0.5)

In [None]:
# Plot the confusion matrix of the thresholded predictions on the held-out split.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)