In [69]:
import pandas as pd
from nltk.corpus import stopwords  # Uncomment if using stopwords
from nltk.stem import PorterStemmer  # Uncomment if using stemming
import re
import string
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from nltk import ngrams
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from ydata_profiling import ProfileReport
import numpy as np
import xgboost as xgb
import ast


In [70]:
# Location of the pickled campaign export (an absolute path on the author's machine).
campaign_pickle_path = "/Users/kedarkanhere/Downloads/email_campaigns.pkl"

# NOTE(review): unpickling can execute arbitrary code, so only load pickles from a trusted source.
data = pd.read_pickle(campaign_pickle_path)


In [71]:
# Inspect the container type of the loaded object (the cell echoes the last expression).
type(data)

list

In [72]:
# Peek at the first element to see how a single record is structured.
data[0]

{'example1': [{'subject': '🚀 Propel Your Marketing ROI with Advanced Analytics!',
   'body': "Hey [Recipient's Name],\n\nReady to see your marketing performance soar? With our cutting-edge Marketing Analytics services, your business can harness the power of data to drive decision-making and skyrocket ROI!\n\nQuick question - are you leveraging your data to its full potential? Let's talk strategy! 👉 [meeting link]\n\nBest,\n[Your Name]",
   'opened': False,
   'meeting link clicked': False,
   'responded': False},
  {'subject': 'Data is Your Superpower 📊 Unlock Insights with Us',
   'body': "Hi [Recipient's Name],\n\nI'm reaching out again because I believe our last message might've slipped through the cracks. With your goals in mind, our marketing analytics can translate your data into actionable strategies that can redefine your marketing's success.\n\nCan we help you harness this superpower? Book a time on my calendar: [meeting link]. Looking forward to connecting!\n\nCheers,\n[Your 

In [73]:
# `data` is a list of dicts. As data[0] shows, a record maps an example name
# (e.g. 'example1') to a list of per-email dicts holding the subject, the body
# and the outcome flags. Start from a "wide" frame: one row per record, one
# column per example name.
#
# pd.DataFrame aligns a list of dicts on the union of their keys and fills the
# gaps with NaN by itself, so the records are not padded by hand. This leaves
# `data` unmodified (the cell can be re-run safely) and keeps the column order
# independent of set iteration order.
df = pd.DataFrame(data)




def combine_row_values(row):
    """Join the non-missing cells of ``row`` into one space-separated string.

    Parameters
    ----------
    row : pandas.Series
        One row of the wide frame; missing cells (NaN/None) are skipped.

    Returns
    -------
    str
        ``str()`` of every remaining cell, joined by single spaces. The result
        is the empty string when every cell is missing.
    """
    present_cells = row.dropna().tolist()
    return ' '.join(str(cell) for cell in present_cells)

# Collapse every wide row (axis=1) into a single string made of its non-missing cells.
df['combined'] = df.apply(combine_row_values, axis=1)

# Pull those per-record strings out as a plain Python list.
# Despite its name, `list_of_dicts` holds strings at this point, not dictionaries.
list_of_dicts = df['combined'].to_list()


# Rebuild one long table with one row per email.
# Every entry of `list_of_dicts` is the string form of a record's email list,
# so it is parsed back into Python objects with ast.literal_eval and turned
# into a small frame.
# NOTE(review): the string round-trip is kept so the set of accepted records
# stays the same; flattening `data` directly would avoid it — confirm why some
# records fail to parse before switching.
email_frames = []
for i, serialized_record in enumerate(list_of_dicts):
    try:
        email_frames.append(pd.DataFrame(ast.literal_eval(serialized_record)))
    except Exception as exc:
        # Best effort: skip records that cannot be parsed, but report the
        # reason. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        print(f"Skipping record {i}: {exc!r}")

# Concatenate once at the end instead of growing the frame inside the loop,
# which copies all accumulated rows on every iteration.
df = pd.concat(email_frames) if email_frames else pd.DataFrame()



21


In [74]:
# Display the flattened frame (one row per email).
df

Unnamed: 0,subject,body,opened,meeting link clicked,responded,meeting_link_clicked
0,🚀 Propel Your Marketing ROI with Advanced Anal...,"Hey [Recipient's Name],\n\nReady to see your m...",0,False,0,
1,Data is Your Superpower 📊 Unlock Insights with Us,"Hi [Recipient's Name],\n\nI'm reaching out aga...",1,False,0,
2,Turn Marketing Data Into Decisions 🧐 Let's Exp...,"Greetings [Recipient's Name],\n\nWe haven't co...",0,False,0,
3,Marketing Success is a Click Away - Let's Chat...,"Hello [Recipient's Name],\n\nIt's clear that y...",1,True,0,
0,🚀 Boost Your Brand's Visibility with Proven Ma...,"Hi [Recipient's Name],\n\nIn the digital age, ...",0,False,0,
...,...,...,...,...,...,...
3,Streamline HR and Boost Performance 💼,"Hello [Recipient's Name],\n\nEfficiency is kin...",1,True,0,
0,Boost Your Brand Visibility 🚀,"Hello [Recipient's Name],\n\nWant to skyrocket...",1,False,0,
1,🔍 Outpace Competitors with Insightful Analytics,"Hi [Recipient's Name],\n\nI noticed you're nav...",1,False,0,
2,Tailored Analytics for You,"Hey [Recipient's Name],\n\nYour brand deserves...",0,False,0,


In [75]:
# Summarise the column dtypes and non-null counts of the flattened frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150 entries, 0 to 3
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   subject               150 non-null    object
 1   body                  150 non-null    object
 2   opened                150 non-null    int64 
 3   meeting link clicked  134 non-null    object
 4   responded             150 non-null    int64 
 5   meeting_link_clicked  16 non-null     object
dtypes: int64(2), object(4)
memory usage: 8.2+ KB


In [77]:
# The flattened data has 150 rows and 6 columns, but the click flag is split
# across two differently spelled columns. Merge them into a single
# `link_clicked` column: take 'meeting link clicked' where it is present and
# fall back to 'meeting_link_clicked' otherwise.
# np.where works positionally, which matters because the index has duplicates.
spaced_flag = df['meeting link clicked']
underscored_flag = df['meeting_link_clicked']
df['link_clicked'] = np.where(spaced_flag.isna(), underscored_flag, spaced_flag)

# Joint distribution of the three outcome flags
outcome_columns = ["opened", 'link_clicked', 'responded']
df[outcome_columns].value_counts()


opened  link_clicked  responded
1       False         0            60
        True          0            44
0       False         0            39
1       False         1             6
        True          1             1
Name: count, dtype: int64

In [47]:
#As the counts above show, the outcomes are imbalanced rather than split 50-50 (e.g. only 7 of the 150 emails received a response)

In [78]:
# Build a profiling report of the frame before starting the text analysis and data cleaning.
# The report object is only created here; it is displayed by evaluating `profile`.
profile = ProfileReport(df, title="Profiling Report")

In [79]:
# Evaluate the report object as the last expression so the notebook renders it.
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [80]:
#Now let us start cleansing the data

def clean_text(text):
    """Normalise raw email text into a space-separated string of word stems.

    Steps, in order:
      1. drop every character that is not an ASCII letter, a digit or
         whitespace (removes emoji, punctuation, brackets and apostrophes,
         so "you're" becomes "youre");
      2. lowercase;
      3. drop digits;
      4. split on whitespace, remove English stopwords and apply Porter
         stemming.

    Parameters
    ----------
    text : str
        Raw subject or body text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; "" when nothing is left.
    """
    # After this substitution only letters, digits and whitespace remain, so
    # no separate punctuation-stripping pass is needed.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    text = re.sub(r'\d+', '', text)

    # Load the stopword list once per call, as a set: the corpus does not have
    # to be re-read for every word and membership tests are O(1).
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return ' '.join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )

# Normalise both text fields with clean_text; the raw columns are overwritten in place.
for text_column in ('subject', 'body'):
    df[text_column] = df[text_column].apply(clean_text)

# Single text feature per email: cleaned subject, one space, cleaned body
df['combined_text'] = df['subject'] + " " + df['body']


In [81]:
#We will also create n grams

# Define a function to create n-grams
def create_ngrams(text, n):
    """Split ``text`` on whitespace and return all of its n-grams as a list.

    Parameters
    ----------
    text : str
        Text to tokenise.
    n : int
        Number of consecutive tokens per n-gram.

    Returns
    -------
    list
        The n-grams produced by ``ngrams`` in order of appearance; empty when
        the text has fewer than ``n`` tokens.
    """
    tokens = text.split()
    return [*ngrams(tokens, n)]

# Size of the n-grams to extract: n = 2 would give bigrams, n = 4 gives four-word sequences.
n = 4


In [82]:
# Store, for every email, the list of n-grams found in its combined text.
df['ngrams'] = df['combined_text'].map(lambda combined: create_ngrams(combined, n))


In [83]:
# Pool the n-grams of all opened / not-opened emails into two flat lists.
# A nested comprehension is used instead of Series.sum(): summing lists
# re-copies the growing result for every row, and it returns the integer 0
# (not an empty list) when no row matches the filter, which would break the
# Counter built from these lists.
opened_ngrams = [gram for grams in df.loc[df['opened'] == 1, 'ngrams'] for gram in grams]
not_opened_ngrams = [gram for grams in df.loc[df['opened'] == 0, 'ngrams'] for gram in grams]


In [84]:
# Count how often each n-gram occurs within the opened and within the not-opened emails.
opened_ngrams_freq = Counter(opened_ngrams)
not_opened_ngrams_freq = Counter(not_opened_ngrams)


In [85]:
# The ten most frequent n-grams per group, as (n-gram, count) pairs ordered from most to least common.
most_common_opened = opened_ngrams_freq.most_common(10)
most_common_not_opened = not_opened_ngrams_freq.most_common(10)


In [86]:
# Top n-grams among the opened emails.
most_common_opened

[(('meet', 'link', 'best', 'name'), 20),
 (('meet', 'link', 'cheer', 'name'), 16),
 (('meet', 'link', 'warm', 'regard'), 13),
 (('link', 'warm', 'regard', 'name'), 13),
 (('hi', 'recipi', 'name', 'notic'), 13),
 (('hello', 'recipi', 'name', 'notic'), 8),
 (('call', 'meet', 'link', 'best'), 7),
 (('let', 'talk', 'meet', 'link'), 7),
 (('recipi', 'name', 'im', 'reach'), 6),
 (('analyt', 'hello', 'recipi', 'name'), 6)]

In [87]:
# Top n-grams among the emails that were not opened.
most_common_not_opened

[(('meet', 'link', 'best', 'name'), 13),
 (('analyt', 'hey', 'recipi', 'name'), 5),
 (('hi', 'recipi', 'name', 'notic'), 5),
 (('recipi', 'name', 'notic', 'check'), 4),
 (('call', 'meet', 'link', 'best'), 3),
 (('analyt', 'hi', 'recipi', 'name'), 3),
 (('transform', 'data', 'action', 'insight'), 3),
 (('meet', 'link', 'cheer', 'name'), 3),
 (('im', 'sure', 'your', 'juggl'), 3),
 (('sure', 'your', 'juggl', 'lot'), 3)]

In [88]:
# The n-gram counts above give a rough idea of which phrases occur more often in opened emails.

# Prepare the data for the model: cast the True/False click flag to 1/0 integers.
df["link_clicked"]=df["link_clicked"].astype(int)

In [89]:
# Build the model inputs.


# Features: TF-IDF weights of the cleaned subject+body text, densified with .toarray().
# NOTE(review): the vectorizer is fitted on every row before the train/test split, so its
# vocabulary and IDF weights also reflect rows that later land in the test set —
# consider fitting it on the training rows only.
tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X = tfidf.fit_transform(df['combined_text']).toarray()
# Targets: two labels per email (opened, link clicked).
y = df[['opened', 'link_clicked']]


In [102]:

# Hold out 30% of the rows for evaluation; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [103]:

# Wrap an XGBoost binary classifier (100 trees) so that one copy is fitted per target column.
model = MultiOutputClassifier(xgb.XGBClassifier(objective='binary:logistic', n_estimators=100))



In [104]:

# Fit the multi-output model on the training split.
model.fit(X_train, y_train)


In [105]:

# Predict both labels for every held-out row.
y_pred = model.predict(X_test)


In [106]:

# Per-label precision / recall / F1 on the held-out rows. The target names
# follow the column order of y ('opened', then 'link_clicked').
# zero_division=0 reports the same scores as the default "warn" setting but
# without the undefined-metric warnings, which presumably come from rows that
# have no true or no predicted label in the "samples avg" line.
print(classification_report(
    y_test,
    y_pred,
    target_names=['email_opened', 'link_clicked'],
    zero_division=0,
))


              precision    recall  f1-score   support

email_opened       0.75      0.86      0.80        35
link_clicked       0.83      0.38      0.53        13

   micro avg       0.76      0.73      0.74        48
   macro avg       0.79      0.62      0.66        48
weighted avg       0.77      0.73      0.73        48
 samples avg       0.69      0.60      0.63        48



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
