In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
dataset = pd.read_csv('label_only_numeric_exported_data.csv')
sentences = dataset['StringData'].astype(str).tolist()
labels = dataset['Labels'].tolist()

labels = np.array(labels)

labels = dataset['Labels'].apply(lambda x: 1 if x > 0 else 0).tolist()
labels = np.array(labels)

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
padded_data = pad_sequences(sequences, padding='post', truncating='post')

In [4]:
vocab_size = len(word_index) + 1
embedding_dim = 16
max_length = padded_data.shape[1]

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [5]:
model.fit(padded_data, labels, epochs=10, validation_split=0.2)

loss, accuracy = model.evaluate(padded_data, labels)
print(f"Loss: {loss}, Accuracy: {accuracy}")

Epoch 1/10
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.5272 - loss: 0.6914 - val_accuracy: 0.6329 - val_loss: 0.6365
Epoch 2/10
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.6755 - loss: 0.6026 - val_accuracy: 0.6353 - val_loss: 0.6235
Epoch 3/10
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.7654 - loss: 0.4794 - val_accuracy: 0.6414 - val_loss: 0.6447
Epoch 4/10
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8084 - loss: 0.4138 - val_accuracy: 0.6910 - val_loss: 0.6246
Epoch 5/10
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8336 - loss: 0.3630 - val_accuracy: 0.6053 - val_loss: 0.7879
Epoch 6/10
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8495 - loss: 0.3285 - val_accuracy: 0.6966 - val_loss: 0.6779
Epoch 7/10
[1m1

In [6]:
def predict_user_input(user_sentences):
    user_sequences = tokenizer.texts_to_sequences(user_sentences)
    user_padded = pad_sequences(user_sequences, padding='post', truncating='post', maxlen=max_length)
    predictions = model.predict(user_padded)
    predicted_labels = [1 if p > 0.5 else 0 for p in predictions]
    return predicted_labels

In [7]:
user_input = input("Enter a sentence: ")
user_sentences = [user_input]  # Ensure user input is a list of strings
predicted_labels = predict_user_input(user_sentences)
print(f"Predicted Labels: {predicted_labels}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
Predicted Labels: [0]
