## What does a data scientist on a marketing team do? 
- Analyzing marketing campaign performance
- Attributing credit for conversions to marketing channels
- A/B testing

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns
import os

# Load the campaign data into a DataFrame, then print a summary of its
# columns, dtypes and non-null counts.
csv_path = 'marketing_new.csv'
marketing = pd.read_csv(csv_path)
marketing.info()

In [None]:
# Summary statistics of the freshly loaded data.
marketing.describe()

In [None]:
# Preview the first rows of the raw data.
marketing.head()

In [103]:
# Cast columns to dtypes that match their meaning.

# Low-cardinality text columns -> categorical.
cat_columns = ['marketing_channel', 'subscribing_channel', 'age_group', 'language_displayed', 'language_preferred', 'variant']
for col in cat_columns:
    marketing[col] = marketing[col].astype('category')

# Date columns -> datetime64; unparseable values become NaT instead of raising.
date_columns = ['date_served', 'date_subscribed', 'date_canceled']
for date_col in date_columns:
    marketing[date_col] = pd.to_datetime(marketing[date_col], errors='coerce')

# Flag columns -> nullable 'boolean'.
# Plain astype('bool') converts missing values (NaN) to True, which would
# count users with no retention/conversion information as retained/converted.
# The nullable dtype keeps them as <NA>; boolean-mask indexing treats <NA>
# as False, so the `== True` filters used later keep working.
bool_columns = ['converted', 'is_retained']
for col in bool_columns:
    marketing[col] = marketing[col].astype('boolean')

In [None]:
# Preview again to check the result of the dtype conversions.
marketing.head()


In [105]:
# Flag every row whose marketing channel is "House Ads" (True/False per row).
served_by_house_ads = marketing["marketing_channel"] == "House Ads"
marketing["is_house_ads"] = np.where(served_by_house_ads, True, False)


In [None]:
# Preview with the new is_house_ads column.
marketing.head()

In [None]:
# Encode each marketing channel as a small integer code.
channel_dict = {
    "House Ads": 1,
    "Instagram": 2,
    "Facebook": 3,
    "Email": 4,
    "Push": 5,
}
channel_codes = marketing['marketing_channel'].map(channel_dict)
marketing['channel_code'] = channel_codes

# Number of rows per channel code.
code_counts = marketing['channel_code'].value_counts()
print(code_counts)

In [None]:
# Alternative to converting after loading: let read_csv parse the date
# columns directly, then summarize the resulting dtypes.
columns_to_parse = ["date_served", "date_subscribed", "date_canceled"]
marketing_parsed_dates = pd.read_csv("marketing.csv", parse_dates=columns_to_parse)
marketing_parsed_dates.info()

In [None]:
# Preview the copy that was loaded with parse_dates.
marketing_parsed_dates.head()

In [None]:
# Day of the week each ad was served, as an integer (pandas: Monday=0 ... Sunday=6).
served_dates = marketing["date_served"]
marketing["day_served"] = served_dates.dt.dayofweek

# Spot-check ten random rows against their dates.
marketing[["day_served", "date_served"]].sample(10)

## Initial Exploratory Analysis

In [None]:
# How many unique users see marketing assets each day?
n_rows = marketing.shape[0]
n_users = marketing['user_id'].nunique()
print(f"Datapoints: {n_rows}")
print(f"Unique users: {n_users}")

# Distinct users per serving date.
users_by_date = marketing.groupby('date_served')['user_id']
daily_unique_users = users_by_date.nunique()

# Line chart of the daily counts.
daily_unique_users.plot(kind='line', title='Daily Unique Users')
plt.xlabel('Date')
plt.ylabel('Unique Users')
plt.xticks(rotation=45)
plt.show()


## Common marketing metrics and how to measure them

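Conversion rate is a common way to measure campaign success. It is calculated as the number of people who bought the product divided by the number of people reached by the marketing campaign.
Retention rate is another important metric, but it is harder to calculate: it is the share of people who subscribed and are still subscribed after a given period.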


In [None]:
# Conversion rate: distinct users who converted / distinct users reached.
subscribers = marketing[marketing["converted"] == True]['user_id'].nunique()
total_users = marketing['user_id'].nunique()

# Stored under its own name: the notebook later defines a helper function
# called conversion_rate(), and re-running this cell must not overwrite it.
overall_conversion_rate = subscribers / total_users
print(f"Conversion rate: {overall_conversion_rate:.2%}")

In [None]:
# Retention rate (1 month): retained subscribers / all subscribers.
# The denominator is the users who converted, not every user reached,
# matching the per-channel and helper-function calculations in this notebook.
subscribed_rows = marketing[marketing["converted"] == True]
n_subscribers = subscribed_rows["user_id"].nunique()

# Count retained users among subscribers only, so the rate cannot exceed 100%.
retained_rows = subscribed_rows[subscribed_rows["is_retained"] == True]
n_retained = retained_rows["user_id"].nunique()

# Stored under its own name: the notebook later defines a helper function
# called retention_rate(), and re-running this cell must not overwrite it.
overall_retention_rate = n_retained / n_subscribers

print(f"Retention rate: {overall_retention_rate:.2%}")

In [None]:
# Inspect the converted column.
marketing.converted

## Customer segmentation

Segmentation of customers can be done based on age, gender, location, channel, past interactions with business, etc. We can then calculate metrics for each segment.

In [None]:
# Retention rate for house ads
# .copy() makes house_ads an independent DataFrame, so adding columns to it
# later does not trigger SettingWithCopyWarning.
house_ads = marketing[marketing['subscribing_channel'] == 'House Ads'].copy()

# Segment-specific names keep the notebook-wide total_users value intact and
# avoid storing a retention figure under a "conversion" name.
house_ads_retained = house_ads[house_ads["is_retained"] == True]["user_id"].nunique()
house_ads_subscribers = house_ads["user_id"].nunique()

house_ads_retention_rate = house_ads_retained / house_ads_subscribers
print(f"Retention rate for house ads: {house_ads_retention_rate:.2%}")


In [None]:
# Retention rate for every channel
retained_mask = marketing["is_retained"] == True
converted_mask = marketing["converted"] == True

# Distinct retained / subscribed users per subscribing channel.
retained = marketing[retained_mask].groupby("subscribing_channel")["user_id"].nunique()
subscribed = marketing[converted_mask].groupby("subscribing_channel")["user_id"].nunique()
channel_retention_rate = retained / subscribed

# Print retention rate for every channel
print("Retention rate for every channel:")
for channel in channel_retention_rate.index:
    rate = channel_retention_rate.loc[channel]
    print(f"{channel}: {rate:.2%}")

## Plotting campaign results

In [None]:
# Bar chart of the retention rate per subscribing channel.
channel_retention_rate.plot(kind='bar', title='Retention Rate by Channel')
plt.xlabel('Channel')
plt.ylabel('Retention Rate')
# Render explicitly, like the other plotting cells; otherwise the cell echoes
# the Text object returned by plt.ylabel.
plt.show()

In [None]:
# Distinct users per subscription date.
by_subscription_date = marketing.groupby("date_subscribed")["user_id"]
subscribed = by_subscription_date.nunique()

# Distinct retained users per subscription date.
retained_rows = marketing[marketing["is_retained"] == True]
retained = retained_rows.groupby("date_subscribed")["user_id"].nunique()

# Calculate the retention rate per subscription day
daily_retention_rate = retained / subscribed
print(daily_retention_rate)


In [None]:
# Turn the series into a two-column DataFrame (date, rate).
daily_retention_rate_df = pd.DataFrame(daily_retention_rate.reset_index())
daily_retention_rate_df.columns = ["date_subscribed", "retention_rate"]

# Line chart of the retention rate over subscription dates.
plot_options = {
    "x": "date_subscribed",
    "y": "retention_rate",
    "kind": "line",
    "title": "Daily Retention Rate",
}
daily_retention_rate_df.plot(**plot_options)
plt.xlabel('Date')
plt.ylabel('Retention Rate')
# Anchor the y-axis at zero so the scale is not exaggerated.
plt.ylim(0)
plt.show()

## Grouping by multiple columns 

In [None]:
# Rows per (serving date, preferred language) pair, counted via user_id.
by_date_and_language = marketing.groupby(['date_served', 'language_preferred'])
language = by_date_and_language['user_id'].count()
language


In [None]:
# One column per language (second index level), one line per column.
language_by_day = language.unstack(level=1)
language_by_day.plot(kind='line', title='Daily Users by Language')
plt.legend(title='Language', loc='upper right')
plt.show()

In [None]:
# Rows per (preferred language, age group) pair, counted via user_id.
by_language_and_age = marketing.groupby(["language_preferred", 'age_group'])
language_age = by_language_and_age["user_id"].count()

# Grouped bar chart: languages on the x-axis, one bar per age group.
age_groups_as_columns = language_age.unstack(level=1)
age_groups_as_columns.plot(kind="bar", title="Users by Language and Age Group")
plt.xlabel("Language")
plt.ylabel("Users")
plt.show()

## Using functions to calculate retention

In [123]:
def retention_rate(df, column_names):
    """Return the retention rate for each group defined by ``column_names``.

    Args:
        df: DataFrame with ``user_id``, ``is_retained`` and ``converted`` columns.
        column_names: Column name, or list of column names, to group by.

    Returns:
        Series indexed by the grouping column(s): distinct retained users
        divided by distinct converted users. Groups present on only one
        side of the division come out as NaN.
    """
    retained_rows = df[df['is_retained'] == True]
    subscribed_rows = df[df['converted'] == True]

    retained_users = retained_rows.groupby(column_names)['user_id'].nunique()
    subscribed_users = subscribed_rows.groupby(column_names)['user_id'].nunique()

    return retained_users / subscribed_users

In [None]:
# Retention rate per preferred language.
retention_rate(marketing, "language_preferred")


In [None]:
# Retention rate per (preferred language, age group) pair.
lang_age = retention_rate(marketing, ["language_preferred", 'age_group'])


In [None]:
# Pivot the second index level (age group) into columns for a language x age-group table.
lang_age.unstack(level=1)

## Identifying inconsistencies

In [None]:
# Display the DataFrame with all columns added so far.
marketing

In [None]:
# Distinct users per day of week served.
users_dow = marketing.groupby('day_served')['user_id'].nunique()

days = users_dow.index
user_counts = users_dow.values
plt.plot(days, user_counts)
plt.title('Users per Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('N Users')
plt.show()


In [129]:
# Retention rate per day of week served (grouped on the day_served column).
retention_dow = retention_rate(marketing, ['day_served'])

In [None]:
# Display the per-day-of-week retention rates.
retention_dow

In [131]:
def plotting_conv(dataframe):
    """Draw one line chart per column of ``dataframe``.

    Each column is plotted against the DataFrame's index, titled
    "Daily <column> conversion rate", shown, and then the figure is
    cleared before the next column.

    Args:
        dataframe: DataFrame whose columns hold conversion rates and whose
            index provides the x-axis values.
    """
    x_values = dataframe.index
    for column in dataframe.columns:
        plt.plot(x_values, dataframe[column])
        plt.title(f"Daily {column!s} conversion rate\n", size=16)
        plt.ylabel("Conversion rate", size=14)
        plt.xlabel("Date", size=14)
        plt.show()
        # Clear the figure so the next column starts from an empty canvas.
        plt.clf()

def conversion_rate(dataframe, column_names):
    """Return the conversion rate for each group defined by ``column_names``.

    Args:
        dataframe: DataFrame with ``user_id`` and boolean ``converted`` columns.
        column_names: Column name, or list of column names, to group by.

    Returns:
        Series indexed by the grouping column(s): distinct converted users
        divided by distinct users, with missing results replaced by 0.
    """
    # Distinct converted users per group.
    converted_rows = dataframe[dataframe["converted"]]
    converted_users = converted_rows.groupby(column_names)["user_id"].nunique()

    # Distinct users per group.
    all_users = dataframe.groupby(column_names)["user_id"].nunique()

    # Groups without any converted user have no numerator (NaN); report 0.
    return (converted_users / all_users).fillna(0)

In [None]:
# Conversion rate per (date served, age group) pair.
age_group_keys = ["date_served", "age_group"]
age_group_conv = conversion_rate(marketing, age_group_keys)

# Age groups (second index level) become columns, dates stay as the index.
age_group_wide = age_group_conv.unstack(level=1)
age_group_df = pd.DataFrame(age_group_wide)

# One chart per age group.
plotting_conv(age_group_df)


In [None]:
# Conversion rate per (date served, marketing channel) pair.
channel_keys = ["date_served", "marketing_channel"]
daily_conv_channel = conversion_rate(marketing, channel_keys)

# Channels (second index level) become columns, dates stay as the index.
channel_wide = daily_conv_channel.unstack(level=1)
daily_conv_channel = pd.DataFrame(channel_wide)

# One chart per marketing channel.
plotting_conv(daily_conv_channel)


In [None]:
# Day of week on which each ad was served.
served_on = marketing["date_served"]
marketing["DoW_served"] = served_on.dt.dayofweek

# Conversion rate per (day of week, marketing channel) pair.
DoW_conversion = conversion_rate(marketing, ["DoW_served", "marketing_channel"])

# Channels (second index level) become columns.
DoW_wide = DoW_conversion.unstack(level=1)
DoW_df = pd.DataFrame(DoW_wide)

# One line per channel across the week, with the y-axis anchored at zero.
DoW_df.plot(kind="line")
plt.title("Conversion rate by day of week\n")
plt.ylim(0)
plt.show()


In [None]:
# Check whether House Ads were shown in each user's preferred language.
# Select the rows *served* through House Ads (marketing_channel), not only
# the users who subscribed through them, and take an explicit copy so the new
# column is written to an independent DataFrame (no SettingWithCopyWarning).
house_ads = marketing[marketing["marketing_channel"] == "House Ads"].copy()

# Compare as plain objects: two categorical columns can only be compared
# directly when they share the same categories; otherwise pandas raises.
displayed = house_ads["language_displayed"].astype(object)
preferred = house_ads["language_preferred"].astype(object)

# Add the new column is_correct_lang
house_ads["is_correct_lang"] = np.where(displayed == preferred, "Yes", "No")

# Count rows per (date served, is_correct_lang) pair
language_check = house_ads.groupby(["date_served", "is_correct_lang"])[
    "is_correct_lang"
].count()

# Yes/No become columns; dates missing one outcome get 0 instead of NaN
language_check_df = pd.DataFrame(language_check.unstack(level=1)).fillna(0)

# Print results
print(language_check_df)


In [None]:
# Share of ads shown in the correct language per day.
# Sum only the count columns: on a re-run the frame already holds "pct", and
# including it in the row sum would skew the denominator.
count_columns = [col for col in language_check_df.columns if col != "pct"]
daily_totals = language_check_df[count_columns].sum(axis=1)
language_check_df["pct"] = language_check_df["Yes"] / daily_totals

# Plot and show the results
plt.plot(language_check_df.index, language_check_df["pct"])
plt.title("Share of ads served in the preferred language")
plt.xlabel("Date")
plt.ylabel("Share correct")
plt.xticks(rotation=45)
plt.show()


## Resolving Inconsistencies

## AB Testing for marketing

A/B testing refers to a randomized experiment which evaluates which variant performs better. 
In order for our tests to have meaning, we must have a clear control. 
The control should be something that currently exists and is running in production. 
Each variant in the test should have only one major change from the control; otherwise, it will be impossible to parse what led to the change in your key metrics. 
Prior to beginning a test, you must develop a hypothesis and determine which metric you are trying to impact. 
Always set key metrics ahead of running the test. 
It's easy to redefine success in retrospect, especially if you are under pressure to find a positive result. 
If you document success metrics ahead of time, you can maintain clarity around the success of the test.


In [None]:
# Rows for the Email channel only.
is_email = marketing['marketing_channel'] == 'Email'
email = marketing[is_email]

# Distinct users allocated to each variant.
allocation = email.groupby(['variant'])['user_id'].nunique()

# Bar chart of the allocation.
allocation.plot(kind='bar', title='Email Variant Allocation')
plt.xlabel('Variant')
plt.ylabel('Users')
plt.show()


In [None]:
# Per (user, variant): the maximum of `converted`, i.e. True if any of the
# user's rows in that variant converted.
converted_by_user_variant = email.groupby(['user_id', 'variant'])['converted']
subscribers = converted_by_user_variant.max()

subscribers

In [None]:
# Pivot the second index level (variant) into columns: one row per user.
subscribers_wide = subscribers.unstack(level=1)
subscribers = pd.DataFrame(subscribers_wide)


In [158]:
# Per-variant outcomes; dropna() removes users with no value in that variant.
control_column = subscribers["control"]
personalization_column = subscribers["personalization"]
control = control_column.dropna()
personalization = personalization_column.dropna()


In [None]:
# Mean of the outcomes = conversion rate of each variant.
control_rate = np.mean(control)
personalization_rate = np.mean(personalization)
print("Conversion rate CONTROL:", control_rate)
print("Conversion rate PERSO:", personalization_rate)


## Calculating Lift and significance level

$$
\text{Lift} = \frac{\text{Treatment conversion rate} - \text{Control conversion rate}}{\text{Control conversion rate}}
$$

In [None]:
# Lift = (treatment rate - control rate) / control rate
treatment_mean = np.mean(personalization)
control_mean = np.mean(control)
lift = (treatment_mean - control_mean) / control_mean

print(f"Lift: {lift:.2%}")


In [None]:
from scipy.stats import ttest_ind

# ttest_ind works on numeric samples, so cast the boolean outcomes to 0/1.
control_int = control.astype(int)
personalization_int = personalization.astype(int)

# Two-sample t-test with SciPy's default settings.
samples = (control_int, personalization_int)
t = ttest_ind(*samples)

print(t)