In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('C:\\Users\\naman\\Desktop\\NLP+REG\\archive\\emails.csv')

In [4]:
df.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   file     517401 non-null  object
 1   message  517401 non-null  object
dtypes: object(2)
memory usage: 7.9+ MB


In [6]:
import re

In [7]:
def extract_body(text):
    parts = text.split("\n\n", 1)
    if len(parts) > 1:
        body = parts[1]
    else:
        body = text
    body = re.split(r"OutPut:", body)[0]
    return body.strip()

In [8]:
def to_lower(text):
    return text.lower()


In [9]:

def remove_urls(text):
    return re.sub(r"http\S+", "", text)


In [10]:

def remove_emails(text):
    return re.sub(r"\S+@\S+", "", text)


In [11]:

def remove_control_chars(text):
    return re.sub(r"[\r\t]+", " ", text)


In [12]:

def normalize_spaces(text):
    return re.sub(r"\s+", " ", text).strip()


In [13]:

def cleaning(text):
    text = to_lower(text)
    text = remove_urls(text)
    text = remove_emails(text)
    text = remove_control_chars(text)
    text = normalize_spaces(text)
    return text

In [16]:
def preprocess(df):
    df['content'] = df['message'].apply(lambda x: cleaning(extract_body(x)))
    return df[['content']]


In [17]:
cleaned_df = preprocess(df)
print(cleaned_df.head(5))

                                             content
0                               here is our forecast
1  traveling to have a business meeting takes the...
2                      test successful. way to go!!!
3  randy, can you send me a schedule of the salar...
4                  let's shoot for tuesday at 11:45.


In [18]:
def estimate_urgency(text):
    text = text.lower()

    high_urgency = ["important", "asap", "urgent", "immediately", "priority", "critical", "emergency", "attention","now","selected","holidays","holiday","imp","code","otp","deadline","viva","interview","presentation","reporting","due","final","exam","test","quiz","assessment","evaluation","inspection","audit","checkup","consultation","appointment","warning","alert","escalate","escalation","issue","problem","trouble","failure","outage","downtime","crisis","risk","threat","breach","compromise","incident","accident","urgent meeting","urgent response","urgent action","urgent request","time-sensitive","high priority","dear student","dear applicant","dear candidate","visa","passport","ticket","boarding pass","itinerary","travel","flight","hotel","reservation","booking","trip","journey","vacation","tour","expedition","adventure","exploration"]
    medium_urgency = ["soon", "today", "reminder", "follow up", "approval","invitation","verify","report","urgency","blocked","action required","invite","form","schedule","examination","form","submission","meeting","submit","project","assignment","task","review","feedback","response","reply","check","confirm","confirmation","document","docs","paper","application","apply","registration","register","payment","fee","invoice","bill","subscription","renewal","renew","membership","member","account","profile","update profile","settings","preference","option","choice","selection","survey","poll","questionnaire","research","study","analysis","data collection","data entry","data update","data verification","data validation","dev","restrigate","credential","access","login","signin","sign in","password reset","password change","security","safety","protection","backup","restore","maintenance","upgrade","update available","new version","latest version","feature","functionality","improvement","enhancement"]
    low_urgency = ["subscribe","update","newsletter","weekly","monthly","information","info","summary","digest","news","bulletin","circular","notification","announcement","offer","promotion","sale","discount","deal","event","webinar","workshop","conference","training","session","course","class","lecture","seminar","party","celebration","gathering","meeting","hangout","catch up","social","fun","leisure","hobby","interest","recreation","entertainment","relaxation","vacation planning","holiday planning","trip planning","travel planning","tour planning","itinerary planning","adventure planning","love","friendship","relationship","family","friends","colleagues","team","group","community","networking","socializing","connecting","bonding","engagement","participation","involvement"]

    score = 0.0

    for word in high_urgency:
        score += 1.0 * text.count(word)
    for word in medium_urgency:
        score += 0.6 * text.count(word)
    for word in low_urgency:
        score += 0.3 * text.count(word)

    max_possible_score = 6  # normalization factor
    urgency_score = min(score / max_possible_score, 1.0)

    return urgency_score


In [19]:

# Apply scoring
cleaned_df["urgency_score"] = cleaned_df["content"].apply(estimate_urgency)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df["urgency_score"] = cleaned_df["content"].apply(estimate_urgency)


In [20]:
import torch
from sentence_transformers import SentenceTransformer





In [21]:
model = SentenceTransformer('all-MiniLM-L6-v2') 

In [22]:
texts = cleaned_df['content'].tolist()
embeddings = model.encode(texts, show_progress_bar=True, batch_size=32)

Batches:   0%|          | 0/16169 [00:00<?, ?it/s]

In [23]:
import pickle


In [24]:
np.save('sentence_embeddings.npy', embeddings)


In [25]:
from sklearn.model_selection import train_test_split

In [26]:
y = cleaned_df['urgency_score'].values

In [27]:
X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=42)

In [28]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error


In [29]:
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

In [30]:
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

In [31]:
print("R² Score:", round(r2, 3))
print("\nSample Predictions:")
for text, pred in zip(df["content"].iloc[:5], model.predict(embeddings[:5])):
    print(f"Email: {text}\nPredicted Urgency: {round(pred, 3)}\n")

R² Score: 0.645

Sample Predictions:
Email: here is our forecast
Predicted Urgency: -0.02500000037252903

Email: traveling to have a business meeting takes the fun out of the trip. especially if you have to prepare a presentation. i would suggest holding the business plan meetings here then take a trip without any formal business meetings. i would even try and get some honest opinions on whether a trip is even desired or necessary. as far as the business meetings, i think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not. too often the presenter speaks and the others are quiet just waiting for their turn. the meetings might be better if held in a round table discussion format. my suggestion for where to go is austin. play golf and rent a ski boat and jet ski's. flying somewhere takes too much time.
Predicted Urgency: 1.0329999923706055

Email: test successful. way to go!!!
Predicted Urgency: 0.421999990940094


In [32]:
import joblib
joblib.dump(model, "urgency_predictor.pkl")

['urgency_predictor.pkl']