In [2]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
import joblib

# Load the data from admin.jsonl
data = []
with open("admin.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        entry = json.loads(line)
        data.append(entry)

# Convert the data into a DataFrame
df = pd.DataFrame(data)

# Extract the entity labels
def extract_entity_labels(entities):
    return [entity[2] for entity in entities]

df['entity_labels'] = df['entities'].apply(extract_entity_labels)

# Combine the identified entities with the original text
df['combined_text'] = df.apply(lambda row: " ".join([row['text'][start:end] for start, end, _ in row['entities']]), axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["combined_text"], df["entity_labels"], test_size=0.2, random_state=42)

# Train the Entity Classification Model
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Use MultiLabelBinarizer to convert the entity labels to binary arrays
binarizer = MultiLabelBinarizer()
y_train_binary = binarizer.fit_transform(y_train)
y_test_binary = binarizer.transform(y_test)

entity_classifier = MultiOutputClassifier(LogisticRegression(max_iter=1000))
entity_classifier.fit(X_train_vectorized, y_train_binary)
# Save the trained model and vectorizer to separate files with different names
joblib.dump(vectorizer, "entity_vectorizer_admin.pkl")
joblib.dump(entity_classifier, "entity_classification_model_admin.pkl")

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df["combined_text"], df["cats"], test_size=0.2, random_state=42)

# Train the Category Classification Model
category_vectorizer = TfidfVectorizer()
X_train_category_vectorized = category_vectorizer.fit_transform(X_train)
X_test_category_vectorized = category_vectorizer.transform(X_test)

# Flatten the lists in the "cats" column and encode the categorical labels
y_train_category_encoded = [item for sublist in y_train for item in sublist]
y_test_category_encoded = [item for sublist in y_test for item in sublist]

category_encoder = LabelEncoder()
y_train_category_encoded = category_encoder.fit_transform(y_train_category_encoded)
y_test_category_encoded = category_encoder.transform(y_test_category_encoded)

category_model = LogisticRegression(max_iter=1000)
category_model.fit(X_train_category_vectorized, y_train_category_encoded)

# Save the trained model and vectorizer to separate files with different names
joblib.dump(category_vectorizer, "category_vectorizer_admin.pkl")
joblib.dump(category_model, "category_classification_model_admin.pkl")








['category_classification_model_admin.pkl']

In [3]:
y_train_pred = entity_classifier.predict(X_train_vectorized)
y_test_pred = entity_classifier.predict(X_test_vectorized)
y_train_pred_labels = binarizer.inverse_transform(y_train_pred)
y_test_pred_labels = binarizer.inverse_transform(y_test_pred)
print("Entity Classification Model Report (Train set):")
print(classification_report(y_train_binary, y_train_pred, target_names=binarizer.classes_))
print("Entity Classification Model Report (Test set):")
print(classification_report(y_test_binary, y_test_pred, target_names=binarizer.classes_))

Entity Classification Model Report (Train set):
                  precision    recall  f1-score   support

      add_emails       1.00      0.44      0.61        91
      add_fields       0.98      0.50      0.66       131
       attention       0.00      0.00      0.00         1
   change_fields       0.00      0.00      0.00         2
          client       0.98      0.91      0.94       374
   create_report       0.99      0.56      0.71       172
   emails_to_add       1.00      0.42      0.59        78
emails_to_remove       0.00      0.00      0.00        23
   fields_to_add       0.98      0.48      0.64        96
fields_to_change       0.00      0.00      0.00         1
fields_to_remove       0.00      0.00      0.00         6
  issue_to_check       0.93      0.96      0.95       426
   remove_emails       1.00      0.08      0.15        49
   remove_fields       0.00      0.00      0.00         9
   remove_report       1.00      0.40      0.58        52
          urgent       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
y_train_category_pred = category_model.predict(X_train_category_vectorized)
print("Category Classification Model Report (Train Set):")
print(classification_report(y_train_category_encoded, y_train_category_pred, target_names=category_encoder.classes_))

# Print classification report for category classification model on the test set
y_test_category_pred = category_model.predict(X_test_category_vectorized)
print("Category Classification Model Report (Test Set):")
print(classification_report(y_test_category_encoded, y_test_category_pred, target_names=category_encoder.classes_))


Category Classification Model Report (Train Set):
                 precision    recall  f1-score   support

email_list_edit       0.99      0.89      0.93        87
          other       0.96      0.92      0.94       156
  report_cancel       1.00      0.75      0.86        52
   report_issue       0.88      0.99      0.93       311
 report_request       0.95      0.91      0.93       194

       accuracy                           0.93       800
      macro avg       0.96      0.89      0.92       800
   weighted avg       0.93      0.93      0.93       800

Category Classification Model Report (Test Set):
                 precision    recall  f1-score   support

email_list_edit       0.95      0.75      0.84        28
          other       0.97      0.87      0.92        38
  report_cancel       1.00      0.67      0.80        12
   report_issue       0.69      0.96      0.80        69
 report_request       0.88      0.66      0.75        53

       accuracy                          

In [5]:
import win32com.client
import re
from dateutil import parser

# Sample text to test the model
sample_text = "Hi Alan, May I request a report for all completed transactions in Illinois from May 1, 2022, through April 30, 2023, with the following columns: File number, property address, date completed, client name, product type, vendor first name, vendor last name, vendor license number, vendor license expiration date, date vendor first completed a report with Service 1st. Thanks Regards, Mark Cassidy, Chief Valuation Officer Service 1st, LLC                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     "
# Use the entity classification model to predict entities in the sample text
sample_text_vectorized = vectorizer.transform([sample_text])
predicted_labels = entity_classifier.predict(sample_text_vectorized)
predicted_entities = binarizer.inverse_transform(predicted_labels)

# Function to get character ranges for each entity
def get_character_ranges(text, entity):
    ranges = []
    start_idx = 0
    words = text.split()
    for i in range(len(words)):
        word = words[i]
        end_idx = start_idx + len(word)
        if word in entity:
            ranges.append((start_idx, end_idx, word))
        start_idx = end_idx + 1
    return ranges

# Use the predicted entities to predict the email category
sample_text_category_vectorized = category_vectorizer.transform([sample_text])
predicted_category_encoded = category_model.predict(sample_text_category_vectorized)
# Use LabelEncoder to convert the numerical category predictions to human-readable text
predicted_category_text = category_encoder.inverse_transform(predicted_category_encoded)
# Print the results                                                                                                                                                     
print("Sample Text:", sample_text)
# Use the predicted entities to get character ranges in the sample text
entities_with_ranges = []
for entity in predicted_entities[0]:
    ranges = get_character_ranges(sample_text, entity)
    entities_with_ranges.append((entity, ranges))

for entity, ranges in entities_with_ranges:
    print("Entity:", entity)
    if ranges:
        for start, end, word in ranges:
            print(f"Character Range: {start}-{end} (Character: {word})")
    else:
        print("Character Range: Not Found")

print("Predicted Email Category:", predicted_category_text[0])


Sample Text: Hi Alan, May I request a report for all completed transactions in Illinois from May 1, 2022, through April 30, 2023, with the following columns: File number, property address, date completed, client name, product type, vendor first name, vendor last name, vendor license number, vendor license expiration date, date vendor first completed a report with Service 1st. Thanks Regards, Mark Cassidy, Chief Valuation Officer Service 1st, LLC                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [6]:
import win32com.client

# Account email address
account_email = "kaydenlea@gmail.com"


# Connect to Outlook and retrieve the specified folder
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
inbox = outlook.Folders(account_email).Folders("Inbox")  # Access the inbox folder

# Retrieve the most recent email from the inbox folder
recent_email = inbox.Items.GetLast()

# Print the subject and body of the most recent email
print("Subject:", recent_email.Subject)
print("Body:", recent_email.Body)

Subject: Top Shelf Grind King Maker, 1...
Body:  <https://www.amazon.com/gp/r.html?C=1DBLYEPF3BWHT&K=4W36ZBU0SCEC&M=urn:rtn:msg:20230908160651a0c656670ec74345988c36a83b50p0na&R=39NMATTX3WZ08&T=O&U=https%3A%2F%2Fimages-na.ssl-images-amazon.com%2Fimages%2FG%2F01%2Fnav%2Ftransp.gif&H=XXPW8BKEY1A8PTQL0XIHWKAN5AWA&ref_=pe_2313390_748392930_opens> 
  <https://www.amazon.com/gp/r.html?C=1DBLYEPF3BWHT&K=4W36ZBU0SCEC&M=urn:rtn:msg:20230908160651a0c656670ec74345988c36a83b50p0na&R=NHJOFQME4GFA&T=C&U=https%3A%2F%2Fwww.amazon.com%2F%3Fref_%3Dpe_2313390_748392930&H=IYYYF8LVM5LCGDZCAAKAKRX7KBYA&ref_=pe_2313390_748392930> 	
Hello Kayden Lea, 	
We found something you might like. 	
 <https://www.amazon.com/gp/r.html?C=1DBLYEPF3BWHT&K=4W36ZBU0SCEC&M=urn:rtn:msg:20230908160651a0c656670ec74345988c36a83b50p0na&R=1822F2T9Y8LM1&T=C&U=https%3A%2F%2Fwww.amazon.com%2Fgp%2Fproduct%2FB0BT4YWZM7%2Fref%3Dpe_2313390_748392930_em_1p_0_lm&H=X69PRD6NP3IWEBEZEHFEM3W1D4UA&ref_=pe_2313390_748392930_em_1p_0_lm> 	
Top Shelf 

In [8]:
import win32com.client
import re

# Account email address
account_email = "klea@corelogic.com"

# Connect to Outlook and retrieve the specified folder
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
inbox = outlook.Folders(account_email).Folders("Inbox")  # Access the inbox folder

# Retrieve the most recent email from the inbox folder
recent_email = inbox.Items.GetLast()

# Check if the email passes the filters
subject_patterns = ['Request received', 'Resolved Ticket', 'Automatic Reply', 'Your ticket has been created', 'Undeliverable',
                    'Email Delivery Failure', "Your message couldn't be delivered", 'Out of the Office']
subject_pattern_regex = '|'.join(subject_patterns)
exclude_keywords = ['support@mercuryvmp.kayako.com']
exclude_combinations = ['Ticket', 'Received']

if (
    recent_email.Subject and
    not any(keyword in recent_email.Subject for keyword in subject_patterns) and
    not all(keyword in recent_email.Subject for keyword in exclude_combinations) and
    all(keyword not in recent_email.SenderEmailAddress for keyword in exclude_keywords)
):
    # Modify the email text
    email_text = recent_email.Body  # Assuming you want to modify the email body
    email_text = re.sub('<.*?>', '', email_text)  # Remove HTML tags
    email_text = re.sub('\n', ' ', email_text)  # Replace newline characters with spaces
    email_text = re.sub(' +', ' ', email_text)  # Remove extra spaces
    
    # Save the modified email text to a new file
    with open('modified_email.txt', 'w', encoding='utf-8') as file:
        file.write(email_text)



In [9]:
import win32com.client
import re
import csv
import joblib

# Connect to Outlook and retrieve the default inbox folder
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
inbox = outlook.GetDefaultFolder(6)  # Access the default inbox folder

# Retrieve the most recent email from the inbox folder
recent_email = inbox.Items.GetLast()

# Check if the email passes the filters
subject_patterns = ['Request received', 'Resolved Ticket', 'Automatic Reply', 'Your ticket has been created', 'Undeliverable',
                    'Email Delivery Failure', "Your message couldn't be delivered", 'Out of the Office']
subject_pattern_regex = '|'.join(subject_patterns)
exclude_keywords = ['support@mercuryvmp.kayako.com']
exclude_combinations = ['Ticket', 'Received']

if (
    recent_email.Subject and
    not any(keyword in recent_email.Subject for keyword in subject_patterns) and
    not all(keyword in recent_email.Subject for keyword in exclude_combinations) and
    all(keyword not in recent_email.SenderEmailAddress for keyword in exclude_keywords)
):
    # Modify the email text (if needed)
    email_text = recent_email.Body  # Assuming you want to modify the email body
    email_text = re.sub('<.*?>', '', email_text)  # Remove HTML tags
    email_text = re.sub('\n', ' ', email_text)  # Replace newline characters with spaces
    email_text = re.sub(' +', ' ', email_text)  # Remove extra spaces
    
    # Load the entity classification model and vectorizer
    vectorizer = joblib.load("entity_vectorizer_admin.pkl")
    entity_classifier = joblib.load("entity_classification_model_admin.pkl")
    
    # Predict entities
    email_text_vectorized = vectorizer.transform([email_text])
    predicted_labels = entity_classifier.predict(email_text_vectorized)
    predicted_entities = binarizer.inverse_transform(predicted_labels)
    
    # Load the category classification model and vectorizer
    category_vectorizer = joblib.load("category_vectorizer_admin.pkl")
    category_model = joblib.load("category_classification_model_admin.pkl")
    
    # Predict the category
    email_text_category_vectorized = category_vectorizer.transform([email_text])
    predicted_category_encoded = category_model.predict(email_text_category_vectorized)
    predicted_category_text = category_encoder.inverse_transform(predicted_category_encoded)
    
    # Prepare the data for CSV
    subject = recent_email.Subject
    text = email_text
    predicted_category = predicted_category_text[0]
    
    # Save the results to a CSV file
    with open('email_results.csv', 'a', newline='', encoding='utf-8') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow([subject, text, predicted_category])
    
    # Print the results
    print("Email Results Saved to CSV:")
    print("Subject:", subject)
    print("Text:", text)
    print("Predicted Email Category:", predicted_category)
    
else:
    print("Email does not meet the filter criteria.")


Email Results Saved to CSV:
Subject: Kayden, explore these courses on skills you follow
 © 2023 LinkedIn Corporation, 1‌000 West Maude Avenue, Sunnyvale, CA 94085. LinkedIn and the LinkedIn logo are registered trademarks of LinkedIn. 	͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ 
Predicted Email Category: report_issue
