In [3]:
import json

# Load the data from the "admin.jsonl" file
data = []
with open("admin.jsonl", "r") as file:
    for line in file:
        data.append(json.loads(line))

# Find points without a "cat" label
points_without_cat = []
for entry in data:
    if "cats" not in entry or not entry["cats"]:
        points_without_cat.append(entry)

# Print the points without a "cat" label
for entry in points_without_cat:
    print("Point ID:", entry["id"])
    print("Text:", entry["text"])
    print("Entities:", entry["entities"])
    print("Comments:", entry.get("Comments", []))  # Use .get() to handle missing "Comments" key
    print()


In [9]:
import json
import numpy as np
from sklearn.model_selection import train_test_split

# Load the data from the "admin.jsonl" file
data = []
with open("admin.jsonl", "r") as file:
    for line in file:
        data.append(json.loads(line))

# Separate features (email text and entities) and labels (cats) from the data
features = [entry["text"] for entry in data]
entities = [entry["entities"] for entry in data]
labels = [entry["cats"] for entry in data]

# Split the data into training (80%), validation (10%), and test (10%) sets
# First, split the data into training and the remaining data
X_train, X_remaining, y_train, y_remaining, entities_train, entities_remaining = train_test_split(
    features, labels, entities, test_size=0.2, random_state=42, stratify=labels
)

# Then, split the remaining data into validation and test sets (equal split)
X_val, X_test, y_val, y_test, entities_val, entities_test = train_test_split(
    X_remaining, y_remaining, entities_remaining, test_size=0.5, random_state=42, stratify=y_remaining
)

# Now you have the following splits:
# X_train, y_train, entities_train: Training set (80%)
# X_val, y_val, entities_val: Validation set (10%)
# X_test, y_test, entities_test: Test set (10%)

# Optionally, you can save these splits into separate files for future use
# For example:
with open("train.jsonl", "w") as file:
    for text, cats, entities in zip(X_train, y_train, entities_train):
        entry = {
            "text": text,
            "cats": cats,
            "entities": entities
        }
        file.write(json.dumps(entry) + "\n")

with open("val.jsonl", "w") as file:
    for text, cats, entities in zip(X_val, y_val, entities_val):
        entry = {
            "text": text,
            "cats": cats,
            "entities": entities
        }
        file.write(json.dumps(entry) + "\n")

with open("test.jsonl", "w") as file:
    for text, cats, entities in zip(X_test, y_test, entities_test):
        entry = {
            "text": text,
            "cats": cats,
            "entities": entities
        }
        file.write(json.dumps(entry) + "\n")


In [14]:
import json
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the spaCy NER model
nlp = spacy.load("en_core_web_sm")

# Load the data from the "admin.jsonl" file
data = []
with open("admin.jsonl", "r") as file:
    for line in file:
        data.append(json.loads(line))

# Extract text, entities, and categories from the data
texts = [entry["text"] for entry in data]
entities = [entry["entities"] for entry in data]
categories = [entry["cats"][0] if entry["cats"] else "other" for entry in data]

# Extract labeled entities from the text using spaCy NER
extracted_entities = []
for text, ents in zip(texts, entities):
    doc = nlp(text)
    extracted_entities.append([ent.label_ for ent in doc.ents])

# Combine extracted entities with the original text
combined_texts = [" ".join([text] + entities) for text, entities in zip(texts, extracted_entities)]

# Encode categories into numerical values
category_mapping = {
    "report_request": 0,
    "email_list_edit": 1,
    "report_issue": 2,
    "report_cancel": 3,
    "other": 4
}
labels = [category_mapping[cat] for cat in categories]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(combined_texts, labels, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a logistic regression classifier
classifier = LogisticRegression()
classifier.fit(X_train_vectorized, y_train)

# Evaluate the model
y_pred = classifier.predict(X_test_vectorized)
print(classification_report(y_test, y_pred, target_names=category_mapping.keys()))


                 precision    recall  f1-score   support

 report_request       0.81      0.57      0.67        53
email_list_edit       1.00      0.21      0.35        28
   report_issue       0.46      0.88      0.60        69
  report_cancel       1.00      0.25      0.40        12
          other       0.71      0.39      0.51        38

       accuracy                           0.57       200
      macro avg       0.80      0.46      0.51       200
   weighted avg       0.71      0.57      0.56       200



In [40]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Load data from admin.jsonl
data = []
with open("admin.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        entry = json.loads(line)
        text = entry["text"]
        entities = entry.get("entities", [])
        category = entry["cats"][0] if entry["cats"] else "other"

        # Extract entities and combine them with the original text
        entities_text = " ".join([text[s:e] for s, e, _ in entities])
        combined_text = text + " " + entities_text

        data.append({"text": combined_text, "category": category})

# Convert the data into a DataFrame
df = pd.DataFrame(data)

# Define the category mapping
category_mapping = {
    "report_request": 0,
    "email_list_edit": 1,
    "report_issue": 2,
    "report_cancel": 3,
    "other": 4
}

# Map the categories to integers
df["category"] = df["category"].map(category_mapping)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["category"], test_size=0.2, random_state=42)

# Vectorize the combined texts using TF-IDF or CountVectorizer
vectorizer = TfidfVectorizer()  # You can also use CountVectorizer if you prefer
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

# Evaluate the model
y_pred = model.predict(X_test_vectorized)
print(classification_report(y_test, y_pred, target_names=category_mapping.keys()))

# Save the trained model to a file
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(model, 'logistic_regression_model.pkl')


                 precision    recall  f1-score   support

 report_request       0.89      0.64      0.75        53
email_list_edit       0.90      0.32      0.47        28
   report_issue       0.54      0.96      0.69        69
  report_cancel       1.00      0.33      0.50        12
          other       0.81      0.55      0.66        38

       accuracy                           0.67       200
      macro avg       0.83      0.56      0.61       200
   weighted avg       0.76      0.67      0.66       200



['logistic_regression_model.pkl']

In [38]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import classification_report
import joblib

# Load the data from admin.jsonl
data = []
with open("admin.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        entry = json.loads(line)
        data.append(entry)

# Convert the data into a DataFrame
df = pd.DataFrame(data)

# Extract the entity labels
def extract_entity_labels(entities):
    return [entity[2] for entity in entities]

df['entity_labels'] = df['entities'].apply(extract_entity_labels)

# Combine the identified entities with the original text
df['combined_text'] = df.apply(lambda row: " ".join([row['text'][start:end] for start, end, _ in row['entities']]), axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["combined_text"], df["entity_labels"], test_size=0.2, random_state=42)

# Train the Entity Classification Model
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

mlb = MultiLabelBinarizer()
y_train_transformed = mlb.fit_transform(y_train)
y_test_transformed = mlb.transform(y_test)

base_classifier = LogisticRegression(max_iter=1000)
multi_label_classifier = MultiOutputClassifier(base_classifier)
multi_label_classifier.fit(X_train_vectorized, y_train_transformed)

# Convert the "cats" field into a flat list of category labels
category_labels = [label for sublist in df["cats"] for label in sublist]

# Train the Email Category Classification Model
category_vectorizer = TfidfVectorizer()
X_train_category_vectorized = category_vectorizer.fit_transform(df["text"])
category_encoder = LabelEncoder()
y_train_category_encoded = category_encoder.fit_transform(category_labels)

category_model = LogisticRegression(max_iter=1000)
category_model.fit(X_train_category_vectorized, y_train_category_encoded)

# Save the trained models to files
joblib.dump(vectorizer, "entity_vectorizer.pkl")
joblib.dump(multi_label_classifier, "entity_classification_model.pkl")
joblib.dump(category_vectorizer, "category_vectorizer.pkl")
joblib.dump(category_encoder, "category_encoder.pkl")
joblib.dump(category_model, "category_classification_model.pkl")

# Sample text to test the model
sample_text = "Hi Mercury, Please send me a list of appraisers that cover the following counties in Kansas. Jackson Johnson Clay Platt Buchanan I need appraiser names, addresses and tier ratings. Let me know if you need more information. Thanks! Tracie Draper (Mortgage) Appraisal Desk, Lead Email: tracie.draper@primelending.com Website: primelending.com Phone: (972) 738-7739 PrimeLending NMLS#: 13649. Equal Housing Lender 18111 Preston Road, Suite 900, Dallas, TX 75252 \t\t \t\t \t\t\t\t ________________________________ PrimeLending, A PlainsCapital Company NMLS # 13649, Equal Housing Lender. "

# Use the entity classification model to predict entities in the sample text
sample_text_vectorized = vectorizer.transform([sample_text])
predicted_labels = multi_label_classifier.predict(sample_text_vectorized)
predicted_entities = mlb.inverse_transform(predicted_labels)

# Use the predicted entities to predict the email category
sample_text_category_vectorized = category_vectorizer.transform([sample_text])
predicted_category_encoded = category_model.predict(sample_text_category_vectorized)
predicted_category_encoded = predicted_category_encoded.reshape(-1, 1)
predicted_category = category_encoder.inverse_transform(predicted_category_encoded)

print("Sample Text:", sample_text)
print("Predicted Entities:", predicted_entities)
print("Predicted Email Category:", predicted_category)

# Classification Report for Entity Classification Model
y_pred_entities = multi_label_classifier.predict(X_test_vectorized)
target_names_entities = mlb.classes_
print("Classification Report for Entity Classification Model:")
print(classification_report(y_test_transformed, y_pred_entities, target_names=target_names_entities))

# Classification Report for Email Category Classification Model
y_pred_category = category_model.predict(X_train_category_vectorized)
target_names_category = category_encoder.classes_
print("Classification Report for Email Category Classification Model:")
print(classification_report(y_train_category_encoded, y_pred_category, target_names=target_names_category))


Sample Text: Hi Mercury, Please send me a list of appraisers that cover the following counties in Kansas. Jackson Johnson Clay Platt Buchanan I need appraiser names, addresses and tier ratings. Let me know if you need more information. Thanks! Tracie Draper (Mortgage) Appraisal Desk, Lead Email: tracie.draper@primelending.com Website: primelending.com Phone: (972) 738-7739 PrimeLending NMLS#: 13649. Equal Housing Lender 18111 Preston Road, Suite 900, Dallas, TX 75252 		 		 				 ________________________________ PrimeLending, A PlainsCapital Company NMLS # 13649, Equal Housing Lender. 
Predicted Entities: [('client',)]
Predicted Email Category: ['report_request']
Classification Report for Entity Classification Model:
                  precision    recall  f1-score   support

      add_emails       0.89      0.35      0.50        23
      add_fields       0.93      0.39      0.55        33
       attention       0.00      0.00      0.00         0
   change_fields       0.00      0.00     

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
import json

# Load the data from admin.jsonl
data = []
with open("admin.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        data.append(json.loads(line))

# Create a list to store the extracted entities
entities_data = []

# Extract the entities and their corresponding entity labels
for _, row in enumerate(data):
    text = row["text"]
    entities = row["entities"]
    for start, end, entity_label in entities:
        entity_text = text[start:end]
        entities_data.append({"text": entity_text, "category": entity_label})

# Save the entity data to a separate file (e.g., entities_data.jsonl)
with open("entities_data.jsonl", "w", encoding="utf-8") as output_file:
    for entity in entities_data:
        output_file.write(json.dumps(entity, ensure_ascii=False) + "\n")


In [24]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import joblib

# Load the data from admin.jsonl
data = []
with open("admin.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        entry = json.loads(line)
        data.append(entry)

# Convert the data into a DataFrame
df = pd.DataFrame(data)

# Extract the entity labels
def extract_entity_labels(entities):
    return [entity[2] for entity in entities]

df['entity_labels'] = df['entities'].apply(extract_entity_labels)

# Combine the identified entities with the original text
df['combined_text'] = df.apply(lambda row: " ".join([row['text'][start:end] for start, end, _ in row['entities']]), axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["combined_text"], df["entity_labels"], test_size=0.2, random_state=42)

# Train the Entity Classification Model
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Use MultiLabelBinarizer to convert the entity labels to binary arrays
binarizer = MultiLabelBinarizer()
y_train_binary = binarizer.fit_transform(y_train)
y_test_binary = binarizer.transform(y_test)

entity_classifier = MultiOutputClassifier(LogisticRegression(max_iter=1000))
entity_classifier.fit(X_train_vectorized, y_train_binary)

# Save the trained model and vectorizer to separate files with different names
joblib.dump(vectorizer, "entity_vectorizer_admin.pkl")
joblib.dump(entity_classifier, "entity_classification_model_admin.pkl")

# Sample text to test the model
sample_text = "I would like to request a new report"
# Use the entity classification model to predict entities in the sample text
sample_text_vectorized = vectorizer.transform([sample_text])
predicted_labels = entity_classifier.predict(sample_text_vectorized)
predicted_entities = binarizer.inverse_transform(predicted_labels)

# Function to get character ranges for each entity

def get_character_ranges(text, entity):
    ranges = []
    start_idx = 0
    words = text.split()
    for i in range(len(words)):
        word = words[i]
        end_idx = start_idx + len(word)
        if word in entity:
            ranges.append((start_idx, end_idx, word))
        start_idx = end_idx + 1
    return ranges


# Use the predicted entities to get character ranges in the sample text
entities_with_ranges = []
for entity in predicted_entities[0]:
    ranges = get_character_ranges(sample_text, entity)
    entities_with_ranges.append((entity, ranges))

# Print the results
print("Sample Text:", sample_text)
for entity, ranges in entities_with_ranges:
    print("Entity:", entity)
    if ranges:
        for start, end, word in ranges:
            print(f"Character Range: {start}-{end} (Character: {word})")
    else:
        print("Character Range: Not Found")


Sample Text: I would like to request a new report
Entity: create_report
Character Range: 24-25 (Character: a)
Character Range: 30-36 (Character: report)
Entity: issue_to_check
Character Range: 13-15 (Character: to)


In [45]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
import joblib

# Load the data from admin.jsonl
data = []
with open("admin.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        entry = json.loads(line)
        data.append(entry)

# Convert the data into a DataFrame
df = pd.DataFrame(data)

# Extract the entity labels
def extract_entity_labels(entities):
    return [entity[2] for entity in entities]

df['entity_labels'] = df['entities'].apply(extract_entity_labels)

# Combine the identified entities with the original text
df['combined_text'] = df.apply(lambda row: " ".join([row['text'][start:end] for start, end, _ in row['entities']]), axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["combined_text"], df["entity_labels"], test_size=0.2, random_state=42)

# Train the Entity Classification Model
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Use MultiLabelBinarizer to convert the entity labels to binary arrays
binarizer = MultiLabelBinarizer()
y_train_binary = binarizer.fit_transform(y_train)
y_test_binary = binarizer.transform(y_test)

entity_classifier = MultiOutputClassifier(LogisticRegression(max_iter=1000))
entity_classifier.fit(X_train_vectorized, y_train_binary)

# Save the trained model and vectorizer to separate files with different names
joblib.dump(vectorizer, "entity_vectorizer_admin.pkl")
joblib.dump(entity_classifier, "entity_classification_model_admin.pkl")

# Load the pre-trained email category classification model and vectorizer
category_vectorizer = joblib.load("category_vectorizer.pkl")
category_model = joblib.load("category_classification_model.pkl")
category_encoder = LabelEncoder()

# Sample text to test the model
sample_text = "I woul dlike to request a new report"

# Use the entity classification model to predict entities in the sample text
sample_text_vectorized = vectorizer.transform([sample_text])
predicted_labels = entity_classifier.predict(sample_text_vectorized)
predicted_entities = binarizer.inverse_transform(predicted_labels)

# Function to get character ranges for each entity
def get_character_ranges(text, entity):
    ranges = []
    start_idx = 0
    words = text.split()
    for i in range(len(words)):
        word = words[i]
        end_idx = start_idx + len(word)
        if word in entity:
            ranges.append((start_idx, end_idx, word))
        start_idx = end_idx + 1
    return ranges

# Use the predicted entities to get character ranges in the sample text
entities_with_ranges = []
for entity in predicted_entities[0]:
    ranges = get_character_ranges(sample_text, entity)
    entities_with_ranges.append((entity, ranges))


# Use the predicted entities to predict the email category
sample_text_category_vectorized = category_vectorizer.transform([sample_text])
predicted_category_encoded = category_model.predict(sample_text_category_vectorized)

# Map the numerical category predictions to human-readable text using category_mapping
category_mapping = {
    0: "report_request",
    1: "email_list_edit",
    2: "report_issue",
    3: "report_cancel",
    4: "other"
}
predicted_category_text = [category_mapping[category] for category in predicted_category_encoded]

# Print the results
print("Sample Text:", sample_text)
for entity, ranges in entities_with_ranges:
    print("Entity:", entity)
    if ranges:
        for start, end, word in ranges:
            print(f"Character Range: {start}-{end} (Character: {word})")
    else:
        print("Character Range: Not Found")

print("Predicted Email Category:", predicted_category_text[0])



Sample Text: I woul dlike to request a new report
Entity: create_report
Character Range: 24-25 (Character: a)
Character Range: 30-36 (Character: report)
Predicted Email Category: other
