In [14]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
import joblib

# Parse admin.jsonl: every line of the file holds one JSON document
with open("admin.jsonl", "r", encoding="utf-8") as jsonl_file:
    data = [json.loads(raw_line) for raw_line in jsonl_file]

# One DataFrame row per parsed record
df = pd.DataFrame(data)

# Extract the entity labels
def extract_entity_labels(entities):
    """Collect the element at index 2 of every item in ``entities``.

    Args:
        entities: Iterable of indexable annotations; presumably
            ``(start, end, label)`` triples, as they are unpacked that way
            when the combined text is built.

    Returns:
        A list with one entry per annotation, in the original order.
    """
    labels = []
    for annotation in entities:
        labels.append(annotation[2])
    return labels

df['entity_labels'] = df['entities'].apply(extract_entity_labels)

# Build the model input by joining the text covered by each annotated entity span.
# NOTE(review): only the entity spans are used here, not the full original text,
# while the sample prediction later in this notebook vectorizes a whole sentence.
# Confirm that this train/inference difference is intended.
df['combined_text'] = df.apply(
    lambda row: " ".join(row['text'][start:end] for start, end, _ in row['entities']),
    axis=1,
)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df["combined_text"], df["entity_labels"], test_size=0.2, random_state=42
)

# TF-IDF features; the vocabulary is fitted on the training split only
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Use MultiLabelBinarizer to convert the entity labels to binary indicator arrays
binarizer = MultiLabelBinarizer()
y_train_binary = binarizer.fit_transform(y_train)
y_test_binary = binarizer.transform(y_test)

# One independent logistic regression per entity label
entity_classifier = MultiOutputClassifier(LogisticRegression(max_iter=1000))
entity_classifier.fit(X_train_vectorized, y_train_binary)

# Persist every fitted artifact needed for inference. The binarizer is saved too:
# the classifier's output columns are positional, so without it a reloaded model's
# predictions cannot be mapped back to entity label names.
joblib.dump(vectorizer, "entity_vectorizer_admin.pkl")
joblib.dump(entity_classifier, "entity_classification_model_admin.pkl")
joblib.dump(binarizer, "entity_binarizer_admin.pkl")

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df["combined_text"], df["cats"], test_size=0.2, random_state=42
)

# Train the Category Classification Model
category_vectorizer = TfidfVectorizer()
X_train_category_vectorized = category_vectorizer.fit_transform(X_train)
X_test_category_vectorized = category_vectorizer.transform(X_test)


def _single_category(cats):
    """Return the only category in ``cats``; raise ValueError if there is not exactly one."""
    labels = list(cats)
    if len(labels) != 1:
        raise ValueError(f"Expected exactly one category per row, got {labels!r}")
    return labels[0]


# Take exactly one label per row. Plainly flattening the "cats" lists would
# silently misalign the labels with the feature rows whenever a row carried
# zero or several categories, so that case is rejected explicitly instead.
y_train_category_labels = [_single_category(cats) for cats in y_train]
y_test_category_labels = [_single_category(cats) for cats in y_test]

# Encode the categorical labels as integers
category_encoder = LabelEncoder()
y_train_category_encoded = category_encoder.fit_transform(y_train_category_labels)
y_test_category_encoded = category_encoder.transform(y_test_category_labels)

category_model = LogisticRegression(max_iter=1000)
category_model.fit(X_train_category_vectorized, y_train_category_encoded)

# Persist every fitted artifact needed for inference. The label encoder is saved
# as well so that integer predictions from a reloaded model can be decoded; the
# model dump stays last so the cell's displayed value is unchanged.
joblib.dump(category_vectorizer, "category_vectorizer_admin.pkl")
joblib.dump(category_encoder, "category_encoder_admin.pkl")
joblib.dump(category_model, "category_classification_model_admin.pkl")








['category_classification_model_admin.pkl']

In [15]:
# Predict the binary label-indicator matrix for both splits
y_train_pred = entity_classifier.predict(X_train_vectorized)
y_test_pred = entity_classifier.predict(X_test_vectorized)

# Decode the indicator matrices back into per-sample label tuples (kept for inspection)
y_train_pred_labels = binarizer.inverse_transform(y_train_pred)
y_test_pred_labels = binarizer.inverse_transform(y_test_pred)

# Some labels are never predicted, which leaves their precision undefined.
# zero_division=0 reports those scores as 0.0 without emitting one
# UndefinedMetricWarning per affected average.
print("Entity Classification Model Report (Train set):")
print(classification_report(y_train_binary, y_train_pred, target_names=binarizer.classes_, zero_division=0))
print("Entity Classification Model Report (Test set):")
print(classification_report(y_test_binary, y_test_pred, target_names=binarizer.classes_, zero_division=0))

Entity Classification Model Report (Train set):
                  precision    recall  f1-score   support

      add_emails       1.00      0.44      0.61        91
      add_fields       0.99      0.50      0.67       131
       attention       0.00      0.00      0.00         1
   change_fields       0.00      0.00      0.00         2
          client       0.98      0.91      0.94       374
   create_report       0.99      0.55      0.71       172
   emails_to_add       1.00      0.42      0.59        78
emails_to_remove       0.00      0.00      0.00        23
   fields_to_add       0.98      0.48      0.64        96
fields_to_change       0.00      0.00      0.00         1
fields_to_remove       0.00      0.00      0.00         6
  issue_to_check       0.93      0.96      0.95       426
   remove_emails       1.00      0.08      0.15        49
   remove_fields       0.00      0.00      0.00         9
   remove_report       1.00      0.40      0.58        52
          urgent       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Score the category model on both splits first, then print one report per split
y_train_category_pred = category_model.predict(X_train_category_vectorized)
y_test_category_pred = category_model.predict(X_test_category_vectorized)

category_report_inputs = (
    ("Category Classification Model Report (Train Set):", y_train_category_encoded, y_train_category_pred),
    ("Category Classification Model Report (Test Set):", y_test_category_encoded, y_test_category_pred),
)
for report_title, true_codes, predicted_codes in category_report_inputs:
    print(report_title)
    print(classification_report(true_codes, predicted_codes, target_names=category_encoder.classes_))


Category Classification Model Report (Train Set):
                 precision    recall  f1-score   support

email_list_edit       0.99      0.89      0.93        87
          other       0.96      0.92      0.94       156
  report_cancel       1.00      0.75      0.86        52
   report_issue       0.88      0.99      0.93       311
 report_request       0.95      0.91      0.93       194

       accuracy                           0.93       800
      macro avg       0.96      0.89      0.92       800
   weighted avg       0.93      0.93      0.93       800

Category Classification Model Report (Test Set):
                 precision    recall  f1-score   support

email_list_edit       0.95      0.75      0.84        28
          other       0.97      0.87      0.92        38
  report_cancel       1.00      0.67      0.80        12
   report_issue       0.68      0.96      0.80        69
 report_request       0.87      0.64      0.74        53

       accuracy                          

In [18]:
# A single example sentence used to smoke-test both models
sample_text = "I would like to request a new report"

# Vectorize the sentence as a one-item batch, predict the label-indicator row,
# and decode it back into entity label names
sample_batch = [sample_text]
sample_text_vectorized = vectorizer.transform(sample_batch)
predicted_labels = entity_classifier.predict(sample_text_vectorized)
predicted_entities = binarizer.inverse_transform(predicted_labels)

# Function to get character ranges for each entity
def get_character_ranges(text, entity):
    """Find the whitespace-separated words of ``text`` that occur in ``entity``.

    Args:
        text: The string to scan.
        entity: Container tested with ``word in entity``. When it is a string
            (such as a predicted label name) this is a substring test, so any
            word that appears inside the string counts as a match.

    Returns:
        A list of ``(start, end, word)`` tuples, in order of appearance, where
        ``text[start:end] == word``.
    """
    ranges = []
    cursor = 0
    for word in text.split():
        # Locate the word in the original text rather than assuming exactly one
        # space between words: split() collapses whitespace runs and drops
        # leading whitespace, which would otherwise skew every later offset.
        start_idx = text.index(word, cursor)
        end_idx = start_idx + len(word)
        if word in entity:
            ranges.append((start_idx, end_idx, word))
        cursor = end_idx
    return ranges

# Predict the email category directly from the vectorized sample text
sample_text_category_vectorized = category_vectorizer.transform([sample_text])
predicted_category_encoded = category_model.predict(sample_text_category_vectorized)
# Map the integer prediction back to its category name
predicted_category_text = category_encoder.inverse_transform(predicted_category_encoded)

print("Sample Text:", sample_text)

# Pair each predicted entity label with the character ranges found for it
entities_with_ranges = [
    (entity, get_character_ranges(sample_text, entity))
    for entity in predicted_entities[0]
]

for entity, ranges in entities_with_ranges:
    print("Entity:", entity)
    if not ranges:
        print("Character Range: Not Found")
        continue
    for start, end, word in ranges:
        print(f"Character Range: {start}-{end} (Character: {word})")

print("Predicted Email Category:", predicted_category_text[0])


Sample Text: I would like to request a new report
Entity: create_report
Character Range: 24-25 (Character: a)
Character Range: 30-36 (Character: report)
Entity: issue_to_check
Character Range: 13-15 (Character: to)
Predicted Email Category: report_request
