In [1]:
import pandas as pd

## Data Columns

- country : user country based on the IP address
- age : user age. Self-reported at sign-in step
- new_user : whether the user created the account during this session or already had an account and simply came back to the site
- source : marketing channel source
    - Ads: came to the site by clicking on an advertisement
    - Seo: came to the site by clicking on search results
    - Direct: came to the site by typing the URL directly into the browser
- total_pages_visited: number of total pages visited during the session. This is a proxy for time spent on site and engagement during the session.
- converted: this is our label. 1 means the user converted within the session; 0 means they left without buying anything.


In [2]:
# Read the session-level conversion dataset and preview its first rows.
data_path = 'conversion_data.csv'
df = pd.read_csv(data_path)
df.head(n=5)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,UK,25,1,Ads,1,0
1,US,23,1,Seo,5,0
2,US,28,1,Seo,4,0
3,China,39,1,Seo,5,0
4,US,30,1,Seo,6,0


## Conversion Rate
**The company's goal is to increase the conversion rate: (number of conversions) / (total number of sessions).**

In [7]:
# Overall conversion rate: `converted` is a 0/1 label, so its mean is the share
# of sessions that converted. The vectorized mean avoids iterating the Series
# element by element with the builtin sum().
cr = df['converted'].mean()
cr_percentage = f"Conversion Rate: {cr:.2%}"
cr_percentage

'Conversion Rate: 3.23%'

In [35]:
def get_cr(df, by_col=None):
    """Summarise session count and conversion rate, optionally per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``converted`` column; its mean is reported as the rate.
    by_col : str or list of str, optional
        Column(s) to group by. When omitted, a single-row summary of the whole
        frame is returned (passing ``None`` to ``groupby`` raises a TypeError).

    Returns
    -------
    pandas.DataFrame
        Columns ``N`` (count of non-null ``converted`` values) and ``CR``
        (mean of ``converted``), indexed by the group key(s), or by the single
        label ``'All'`` when ``by_col`` is None.
    """
    if by_col is None:
        converted = df['converted']
        return pd.DataFrame(
            {'N': [converted.count()], 'CR': [converted.mean()]},
            index=['All'],
        )

    # observed=False keeps every categorical bin in the result (even empty
    # ones) and makes that choice explicit rather than relying on the pandas
    # default, which differs between versions.
    res = df.groupby(by_col, observed=False)['converted'].agg(['count', 'mean'])
    res.columns = ['N', 'CR']
    return res

### CR by source

In [36]:
# Sessions and conversion rate per marketing channel.
get_cr(df, by_col='source')

Unnamed: 0_level_0,N,CR
source,Unnamed: 1_level_1,Unnamed: 2_level_1
Ads,88740,0.034483
Direct,72420,0.028169
Seo,155040,0.032895


### CR by country

In [37]:
# Sessions and conversion rate per user country.
get_cr(df, by_col='country')

Unnamed: 0_level_0,N,CR
country,Unnamed: 1_level_1,Unnamed: 2_level_1
China,76602,0.001332
Germany,13056,0.0625
UK,48450,0.052632
US,178092,0.037801


### CR by new-user

In [38]:
# Sessions and conversion rate split by the new_user flag.
get_cr(df, by_col='new_user')

Unnamed: 0_level_0,N,CR
new_user,Unnamed: 1_level_1,Unnamed: 2_level_1
0,99456,0.072002
1,216744,0.014021


### CR by age_grouped

In [39]:
# Left-closed age bands (right=False), so 'Above 55' includes age 55 itself.
# The top edge is open-ended: with a fixed cap, any age at or beyond the cap
# would become NaN and be silently dropped by the groupby in get_cr.
bins = [0, 25, 35, 45, 55, float('inf')]
age_labels = ['Below 25', '[25,35)', '[35,45)', '[45,55)', 'Above 55']
df['age_grouped'] = pd.cut(df['age'], bins=bins, labels=age_labels, right=False)
get_cr(df, 'age_grouped')

Unnamed: 0_level_0,N,CR
age_grouped,Unnamed: 1_level_1,Unnamed: 2_level_1
Below 25,83531,0.055393
"[25,35)",136258,0.030692
"[35,45)",77448,0.015843
"[45,55)",17413,0.008844
Above 55,1550,0.006452


### CR by pv_grouped

In [43]:
# Left-closed page-view bands (right=False), so 'Above 20' includes exactly 20 pages.
# The top edge is open-ended: with a fixed cap, sessions at or beyond the cap
# would become NaN and be silently dropped by the groupby in get_cr.
bins = [0, 8, 12, 16, 20, float('inf')]
pv_labels = ['Below 8', '[8,12)', '[12,16)', '[16,20)', 'Above 20']
df['pv_grouped'] = pd.cut(df['total_pages_visited'], bins=bins, labels=pv_labels, right=False)
get_cr(df, 'pv_grouped')

Unnamed: 0_level_0,N,CR
pv_grouped,Unnamed: 1_level_1,Unnamed: 2_level_1
Below 8,259539,0.001437
"[8,12)",43147,0.04239
"[12,16)",9176,0.425131
"[16,20)",3232,0.925743
Above 20,1106,0.999096
