In [3]:
import json

# Load the data from the "admin.jsonl" file
data = []
with open("admin.jsonl", "r") as file:
    for line in file:
        data.append(json.loads(line))

# Find points without a "cat" label
points_without_cat = []
for entry in data:
    if "cats" not in entry or not entry["cats"]:
        points_without_cat.append(entry)

# Print the points without a "cat" label
for entry in points_without_cat:
    print("Point ID:", entry["id"])
    print("Text:", entry["text"])
    print("Entities:", entry["entities"])
    print("Comments:", entry.get("Comments", []))  # Use .get() to handle missing "Comments" key
    print()


In [9]:
import json
import numpy as np
from sklearn.model_selection import train_test_split

# Load the data from the "admin.jsonl" file
data = []
with open("admin.jsonl", "r") as file:
    for line in file:
        data.append(json.loads(line))

# Separate features (email text and entities) and labels (cats) from the data
features = [entry["text"] for entry in data]
entities = [entry["entities"] for entry in data]
labels = [entry["cats"] for entry in data]

# Split the data into training (80%), validation (10%), and test (10%) sets
# First, split the data into training and the remaining data
X_train, X_remaining, y_train, y_remaining, entities_train, entities_remaining = train_test_split(
    features, labels, entities, test_size=0.2, random_state=42, stratify=labels
)

# Then, split the remaining data into validation and test sets (equal split)
X_val, X_test, y_val, y_test, entities_val, entities_test = train_test_split(
    X_remaining, y_remaining, entities_remaining, test_size=0.5, random_state=42, stratify=y_remaining
)

# Now you have the following splits:
# X_train, y_train, entities_train: Training set (80%)
# X_val, y_val, entities_val: Validation set (10%)
# X_test, y_test, entities_test: Test set (10%)

# Optionally, you can save these splits into separate files for future use
# For example:
with open("train.jsonl", "w") as file:
    for text, cats, entities in zip(X_train, y_train, entities_train):
        entry = {
            "text": text,
            "cats": cats,
            "entities": entities
        }
        file.write(json.dumps(entry) + "\n")

with open("val.jsonl", "w") as file:
    for text, cats, entities in zip(X_val, y_val, entities_val):
        entry = {
            "text": text,
            "cats": cats,
            "entities": entities
        }
        file.write(json.dumps(entry) + "\n")

with open("test.jsonl", "w") as file:
    for text, cats, entities in zip(X_test, y_test, entities_test):
        entry = {
            "text": text,
            "cats": cats,
            "entities": entities
        }
        file.write(json.dumps(entry) + "\n")


In [14]:
import json
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the spaCy NER model
nlp = spacy.load("en_core_web_sm")

# Load the data from the "admin.jsonl" file
data = []
with open("admin.jsonl", "r") as file:
    for line in file:
        data.append(json.loads(line))

# Extract text, entities, and categories from the data
texts = [entry["text"] for entry in data]
entities = [entry["entities"] for entry in data]
categories = [entry["cats"][0] if entry["cats"] else "other" for entry in data]

# Extract labeled entities from the text using spaCy NER
extracted_entities = []
for text, ents in zip(texts, entities):
    doc = nlp(text)
    extracted_entities.append([ent.label_ for ent in doc.ents])

# Combine extracted entities with the original text
combined_texts = [" ".join([text] + entities) for text, entities in zip(texts, extracted_entities)]

# Encode categories into numerical values
category_mapping = {
    "report_request": 0,
    "email_list_edit": 1,
    "report_issue": 2,
    "report_cancel": 3,
    "other": 4
}
labels = [category_mapping[cat] for cat in categories]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(combined_texts, labels, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a logistic regression classifier
classifier = LogisticRegression()
classifier.fit(X_train_vectorized, y_train)

# Evaluate the model
y_pred = classifier.predict(X_test_vectorized)
print(classification_report(y_test, y_pred, target_names=category_mapping.keys()))


                 precision    recall  f1-score   support

 report_request       0.81      0.57      0.67        53
email_list_edit       1.00      0.21      0.35        28
   report_issue       0.46      0.88      0.60        69
  report_cancel       1.00      0.25      0.40        12
          other       0.71      0.39      0.51        38

       accuracy                           0.57       200
      macro avg       0.80      0.46      0.51       200
   weighted avg       0.71      0.57      0.56       200



In [27]:
print(extracted_entities)

[['PERSON', 'DATE', 'ORG', 'DATE', 'TIME', 'DATE', 'TIME', 'DATE', 'ORG', 'PERSON', 'ORG', 'ORG', 'DATE'], ['PERSON', 'ORG', 'ORG', 'DATE', 'PERSON', 'CARDINAL', 'PERSON', 'DATE', 'DATE', 'PERSON', 'PERSON', 'PERSON', 'TIME', 'DATE', 'ORG', 'DATE', 'ORG'], ['PERSON', 'ORG', 'ORG', 'CARDINAL', 'GPE', 'ORG', 'ORG', 'GPE', 'GPE', 'ORG'], ['ORG', 'ORG', 'ORG', 'DATE', 'DATE', 'ORG', 'CARDINAL', 'GPE', 'ORG', 'ORG', 'GPE', 'GPE', 'ORG'], ['PERSON', 'PERSON', 'PRODUCT', 'CARDINAL', 'PERSON', 'PRODUCT', 'PERSON', 'DATE', 'TIME', 'PERSON', 'PERSON', 'ORG', 'PERSON', 'ORG'], ['GPE', 'LAW', 'DATE', 'PERSON', 'PRODUCT', 'CARDINAL', 'PERSON', 'PRODUCT', 'PERSON', 'DATE', 'TIME', 'PERSON', 'PERSON', 'ORG', 'PERSON', 'ORG'], ['TIME', 'ORG', 'PERSON', 'ORG', 'ORG', 'CARDINAL', 'CARDINAL', 'GPE', 'DATE', 'ORG'], ['DATE', 'ORG'], ['PERSON', 'ORG', 'DATE', 'ORG', 'CARDINAL', 'ORG', 'CARDINAL', 'GPE', 'DATE', 'ORG'], ['PERSON', 'ORG', 'GPE', 'ORG', 'CARDINAL', 'CARDINAL', 'PERSON', 'DATE', 'ORDINAL', 'PE

In [24]:
import json
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

# Rest of the code remains the same

# Extract text, entities, and categories from the data
texts = [entry["text"] for entry in data]
entities = [entry["entities"] for entry in data]
categories = [entry["cats"][0] if entry["cats"] else "other" for entry in data]

# Function to replace characters within the entity spans with the entity label
def replace_entities_with_labels(text, entities):
    entities_map = {}
    for start, end, label in entities:
        entities_map[(start, end)] = label
    text_with_labels = ""
    current_position = 0
    for i, char in enumerate(text):
        if (current_position, i) in entities_map:
            text_with_labels += entities_map[(current_position, i)]
            current_position = i + 1
    text_with_labels += text[current_position:]
    return text_with_labels

# Combine extracted entities with the original text
combined_texts = [replace_entities_with_labels(text, entities_list) for text, entities_list in zip(texts, entities)]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(combined_texts, categories, test_size=0.2, random_state=42)

# Rest of the code remains the same
category_mapping = {
    "report_request": 0,
    "email_list_edit": 1,
    "report_issue": 2,
    "report_cancel": 3,
    "other": 4
}

# Convert categories to numerical values
y_train = np.array([category_mapping[label] for label in y_train])
y_test = np.array([category_mapping[label] for label in y_test])

# Rest of the code remains the same

# Convert texts to sequences using the Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1
# Pad sequences to have the same length
X_train_padded = pad_sequences(X_train_sequences, padding='post')
X_test_padded = pad_sequences(X_test_sequences, padding='post')
from keras.layers import LSTM
# Rest of the code remains the same
# Calculate the maximum sequence length
max_length = max(len(sequence) for sequence in X_train_padded)

# Create the neural network model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length))
model.add(LSTM(128))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(category_mapping), activation='softmax'))

# Rest of the code remains the same


# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_split=0.1)
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_padded, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


Epoch 1/10


2023-07-26 11:21:51.252801: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-26 11:21:51.254214: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-26 11:21:51.255463: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-07-26 11:21:58.834077: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-26 11:21:58.835391: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-26 11:21:58.836618: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2023-07-26 11:22:56.460233: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-26 11:22:56.461498: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-26 11:22:56.462770: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Test Loss: 1.4966347217559814
Test Accuracy: 0.3449999988079071


In [40]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Load data from admin.jsonl
data = []
with open("admin.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        entry = json.loads(line)
        text = entry["text"]
        entities = entry.get("entities", [])
        category = entry["cats"][0] if entry["cats"] else "other"

        # Extract entities and combine them with the original text
        entities_text = " ".join([text[s:e] for s, e, _ in entities])
        combined_text = text + " " + entities_text

        data.append({"text": combined_text, "category": category})

# Convert the data into a DataFrame
df = pd.DataFrame(data)

# Define the category mapping
category_mapping = {
    "report_request": 0,
    "email_list_edit": 1,
    "report_issue": 2,
    "report_cancel": 3,
    "other": 4
}

# Map the categories to integers
df["category"] = df["category"].map(category_mapping)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["category"], test_size=0.2, random_state=42)

# Vectorize the combined texts using TF-IDF or CountVectorizer
vectorizer = TfidfVectorizer()  # You can also use CountVectorizer if you prefer
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

# Evaluate the model
y_pred = model.predict(X_test_vectorized)
print(classification_report(y_test, y_pred, target_names=category_mapping.keys()))

# Save the trained model to a file
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(model, 'logistic_regression_model.pkl')


                 precision    recall  f1-score   support

 report_request       0.89      0.64      0.75        53
email_list_edit       0.90      0.32      0.47        28
   report_issue       0.54      0.96      0.69        69
  report_cancel       1.00      0.33      0.50        12
          other       0.81      0.55      0.66        38

       accuracy                           0.67       200
      macro avg       0.83      0.56      0.61       200
   weighted avg       0.76      0.67      0.66       200



['logistic_regression_model.pkl']

In [49]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the trained model
model = joblib.load('logistic_regression_model.pkl')  # Replace 'logistic_regression_model.pkl' with the actual filename

# Load the category mapping
category_mapping = {
    "report_request": 0,
    "email_list_edit": 1,
    "report_issue": 2,
    "report_cancel": 3,
    "other": 4
}

# Sample text to test the model
sample_text = "thanks"

# Load the vectorizer used during training
vectorizer = joblib.load('tfidf_vectorizer.pkl')  # Replace 'tfidf_vectorizer.pkl' with the actual filename of the vectorizer

# Vectorize the sample text using the same vectorizer used during training
sample_text_vectorized = vectorizer.transform([sample_text])

# Make predictions on the sample text
predicted_category = model.predict(sample_text_vectorized)[0]
predicted_category_label = [k for k, v in category_mapping.items() if v == predicted_category][0]

print(f"Predicted Category: {predicted_category_label}")


Predicted Category: other
