# reinforcement learning email problem

suppose i want to send out an email campaign, but i don't know how many times to send an email to a person or how to space them out.

we can assume that there is a point where sending more emails will not increase the probability of the person clicking on the email, and it can even have a negative effect.


therefore, we want to maximize the expected reward of the person clicking on the email, while minimizing the number of emails sent.

In [6]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Generate sample data
np.random.seed(42)

# Create 100 users with different response patterns
n_users = 100

# Generate user data
users = pd.DataFrame({
    'user_id': range(n_users),
    # Some users are more likely to respond to emails than others
    'base_response_rate': np.random.beta(2, 5, n_users),  
    # How quickly users get annoyed with frequent emails (lower = gets annoyed faster)
    'patience_factor': np.random.uniform(0.5, 0.9, n_users),
    # How long until response probability starts declining
    'optimal_gap_days': np.random.randint(3, 14, n_users)
})

# Function to simulate email response probability
def get_response_probability(user_row, n_previous_emails, days_since_last):
    base_rate = user_row['base_response_rate']
    patience = user_row['patience_factor']
    optimal_gap = user_row['optimal_gap_days']
    
    # Decay based on number of previous emails
    email_fatigue = patience ** n_previous_emails
    
    # Penalty for suboptimal timing
    timing_factor = np.exp(-abs(days_since_last - optimal_gap) / optimal_gap)
    
    return base_rate * email_fatigue * timing_factor

# Generate some example probabilities
example_user = users.iloc[0]
print("\nExample response probabilities for first user:")
print(f"Base response rate: {example_user['base_response_rate']:.3f}")
print(f"After 1 email: {get_response_probability(example_user, 1, 7):.3f}")
print(f"After 5 emails: {get_response_probability(example_user, 5, 7):.3f}")
print(f"After 10 emails: {get_response_probability(example_user, 10, 7):.3f}")



Example response probabilities for first user:
Base response rate: 0.354
After 1 email: 0.077
After 5 emails: 0.036
After 10 emails: 0.014


In [9]:
# Q-learning parameters
n_episodes = 1000
learning_rate = 0.1
discount_factor = 0.95
epsilon = 0.1  # For epsilon-greedy exploration

# Initialize Q-table: state = (user_id, days_since_last), action = wait/send
max_days_window = 30  # Maximum days to track since last email
n_actions = 2  # 0: wait, 1: send email
Q = np.zeros((n_users, max_days_window, n_actions))

# Training the Q-learning algorithm
learned_optimal_gaps = []

for user_id in range(n_users):
    user = users.iloc[user_id]
    
    for episode in range(n_episodes):
        days_since_last = 0
        n_emails_sent = 0
        total_reward = 0
        done = False
        
        while not done:
            # Current state
            state = (user_id, min(days_since_last, max_days_window-1))
            
            # Epsilon-greedy action selection
            if np.random.random() < epsilon:
                action = np.random.randint(0, n_actions)
            else:
                action = np.argmax(Q[state[0], state[1]])
            
            # Take action and observe reward
            if action == 0:  # Wait
                days_since_last += 1
                reward = 0
                done = days_since_last >= max_days_window
            else:  # Send email
                response_prob = get_response_probability(user, n_emails_sent, days_since_last)
                response = np.random.random() < response_prob
                reward = 1 if response else -0.1
                n_emails_sent += 1
                days_since_last = 0
                done = n_emails_sent >= 20  # Limit emails per episode
            
            # Update Q-value
            next_state = (user_id, min(days_since_last, max_days_window-1))
            next_max_q = np.max(Q[next_state[0], next_state[1]])
            Q[state[0], state[1], action] += learning_rate * (
                reward + discount_factor * next_max_q - Q[state[0], state[1], action]
            )
            
            total_reward += reward
    
    # Find optimal gap for this user
    days = 0
    max_value = float('-inf')
    optimal_gap = 0
    
    for d in range(max_days_window):
        value = Q[user_id, d, 1]  # Value of sending email after d days
        if value > max_value:
            max_value = value
            optimal_gap = d
    
    learned_optimal_gaps.append(optimal_gap)

# Compare learned vs actual optimal gaps
results = pd.DataFrame({
    'user_id': users['user_id'],
    'learned_gap': learned_optimal_gaps,
    'actual_gap': users['optimal_gap_days']
})

print("\nComparison of learned vs actual optimal gaps:")
print(results.head())



Comparison of learned vs actual optimal gaps:
   user_id  learned_gap  actual_gap
0        0            0           3
1        1            9           7
2        2            4          11
3        3            1           3
4        4            4           5


In [10]:
# Let's try a few improvements:
# 1. Increase training episodes for better convergence
# 2. Add epsilon-greedy exploration
# 3. Decay learning rate over time
# 4. Use a larger penalty for unsuccessful emails

n_episodes = 2000  # Increased from previous value
epsilon = 0.3  # For exploration
learning_rate_start = 0.1
learning_rate_end = 0.01
min_penalty = -0.5  # Increased penalty for unsuccessful emails

learned_optimal_gaps = []

for user_id, user in users.iterrows():
    Q = np.zeros((len(users), max_days_window, 2))
    
    for episode in range(n_episodes):
        state = (user_id, 0)  # (user_id, days_since_last)
        days_since_last = 0
        n_emails_sent = 0
        total_reward = 0
        done = False
        
        # Decay learning rate
        learning_rate = learning_rate_start - (learning_rate_start - learning_rate_end) * (episode / n_episodes)
        
        while not done:
            # Epsilon-greedy action selection
            if np.random.random() < epsilon:
                action = np.random.choice([0, 1])
            else:
                action = np.argmax(Q[state[0], state[1]])
            
            # Take action and observe reward
            if action == 0:  # Wait
                days_since_last += 1
                reward = 0
                done = days_since_last >= max_days_window
            else:  # Send email
                response_prob = get_response_probability(user, n_emails_sent, days_since_last)
                response = np.random.random() < response_prob
                reward = 1 if response else min_penalty
                n_emails_sent += 1
                days_since_last = 0
                done = n_emails_sent >= 20
            
            next_state = (user_id, min(days_since_last, max_days_window-1))
            next_max_q = np.max(Q[next_state[0], next_state[1]])
            Q[state[0], state[1], action] += learning_rate * (
                reward + discount_factor * next_max_q - Q[state[0], state[1], action]
            )
            
            state = next_state
            total_reward += reward
    
    # Find optimal gap for this user
    optimal_gap = np.argmax([Q[user_id, d, 1] for d in range(max_days_window)])
    learned_optimal_gaps.append(optimal_gap)

# Compare results with improved version
results = pd.DataFrame({
    'user_id': users['user_id'],
    'learned_gap': learned_optimal_gaps,
    'actual_gap': users['optimal_gap_days']
})

print("\nComparison of learned vs actual optimal gaps (with improvements):")
print(results.head())
print("\nMean absolute error:", np.mean(np.abs(results['learned_gap'] - results['actual_gap'])))



Comparison of learned vs actual optimal gaps (with improvements):
   user_id  learned_gap  actual_gap
0        0            3           3
1        1            9           7
2        2           12          11
3        3            4           3
4        4            4           5

Mean absolute error: 4.39
