# AGENDA
- Import Modules
- Import Data

## IMPORT PYTHON MODULES

In [2]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import matplotlib.pyplot as plt

## IMPORT AND VALIDATE DATA 

In [3]:
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides warnings raised by pandas itself — consider narrowing it.
warnings.filterwarnings("ignore")

# Import data: the CSV path is relative to the notebook's working directory
data = r'conversion_paths.csv'
cp = pd.read_csv(data)

# Preview the first rows of the raw frame
cp.head()

Unnamed: 0,user_pseudo_id,medium_path,conversion_flag,first_touchpoint,conversion_timestamp
0,flN8vT0gmTR1v8Ixommd1+u8s4ZvO8No7p7wl0X1v6Y+4z...,,0,2022-12-28 00:00:01.914383 UTC,
1,rgJvzjwn131LAGM5p0MZLCniNp8xX9VnGk861Dgu1wXNAA...,,0,2022-12-28 00:00:04.792183 UTC,
2,+JIUpNSpBdH11qifFKE/DwVZ5jozE6X2jJULmP3SZ0APUl...,,0,2022-12-28 00:00:33.063 UTC,
3,xO4dP3GBXDpOxHNwFIDCBnMKI1cBhsdZH+Eru7eyEoc0o8...,,1,2022-12-28 00:00:52.439504 UTC,2023-01-23 14:16:55.060907 UTC
4,nP78ikNKi2iu1IJUO+LWIPcoByh3ZWFvlIzIl174i1jCuK...,paidsocial,0,2022-12-28 00:00:52.722434 UTC,


In [4]:
# Column dtypes and non-null counts of the raw frame
cp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173982 entries, 0 to 173981
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   user_pseudo_id        173982 non-null  object
 1   medium_path           71009 non-null   object
 2   conversion_flag       173982 non-null  int64 
 3   first_touchpoint      173982 non-null  object
 4   conversion_timestamp  5475 non-null    object
dtypes: int64(1), object(4)
memory usage: 6.6+ MB


# CLEANING DATA

In [5]:
# Remove users without medium path info.
# .copy() turns the selection into an independent frame, so the datetime
# conversions below do not write into a slice of `cp`.
data_filtered = cp[cp['medium_path'].notnull()].copy()
data_filtered['first_touchpoint'] = pd.to_datetime(data_filtered['first_touchpoint'])
data_filtered['conversion_timestamp'] = pd.to_datetime(data_filtered['conversion_timestamp'])

In [6]:
# Preview the cleaned frame (rows with a medium path only)
data_filtered.head()

Unnamed: 0,user_pseudo_id,medium_path,conversion_flag,first_touchpoint,conversion_timestamp
4,nP78ikNKi2iu1IJUO+LWIPcoByh3ZWFvlIzIl174i1jCuK...,paidsocial,0,2022-12-28 00:00:52.722434+00:00,NaT
5,Kkx/yH2k8X5rWdx+wG7DPuxpEyCsvSNwDqr4BzWGpxQw38...,cpc,0,2022-12-28 00:01:01.875587+00:00,NaT
6,brAeKj/gaKj8RkzQBIJfe/5XDKShTLwOcTh6yXQink4Kuo...,organic,0,2022-12-28 00:01:15.005975+00:00,NaT
9,4i0jYrhlk2CNR94hRIat8WPFLkU8dRdcPgKA1oDlwsWbUe...,cpc,0,2022-12-28 00:02:08.431349+00:00,NaT
10,JDClr9MBvuQkNi/PeCvCFScJ0NL/3qIlq0ZWVYzoGSbsNC...,paidsocial,0,2022-12-28 00:02:24.791342+00:00,NaT


# DATA PREPROCESSING

In [7]:
# Split the filtered users by conversion outcome ...
converted_users = data_filtered.loc[data_filtered['conversion_flag'].eq(1)]
not_converted_users = data_filtered.loc[data_filtered['conversion_flag'].eq(0)]

# ... and spread each comma-separated medium path into one column per touchpoint.
expanded_paths_conv_user = converted_users['medium_path'].str.split(',', expand=True)
expanded_paths_non_conv_user = not_converted_users['medium_path'].str.split(',', expand=True)



# DATA ANALYSIS 

## Path Length Analysis for converted users:

In [8]:
# Path length = number of non-null touchpoints in each converted user's path
path_lengths = expanded_paths_conv_user.count(axis=1)

total_num_of_path_length = path_lengths.count()

# Users per path length (index = path length, value = number of users)
path_length_counts = path_lengths.value_counts()
percent = path_length_counts / total_num_of_path_length * 100

# Create a DataFrame for path length analysis.
# NOTE: the 'Path Length' column holds the number of users with that path
# length; the path length itself is the index of the frame.
path_length_analysis = pd.DataFrame({
    'Path Length': path_length_counts,
    'total of %': percent
})

# Average path length across users. Averaging the 'Path Length' column
# would give the mean of the per-length user counts, not of the lengths.
average_path_length = path_lengths.mean()

# Print the distribution of path lengths
print("Distribution of Path Lengths:")
print(path_length_analysis.head(30))


Distribution of Path Lengths:
    Path Length  total of %
1          2245   64.345085
2           581   16.652336
3           269    7.709946
4           131    3.754657
5            76    2.178275
6            59    1.691029
7            30    0.859845
8            21    0.601892
10           15    0.429923
9            14    0.401261
13            8    0.229292
11            6    0.171969
15            6    0.171969
12            5    0.143308
19            4    0.114646
21            2    0.057323
17            2    0.057323
42            2    0.057323
16            1    0.028662
22            1    0.028662
23            1    0.028662
33            1    0.028662
27            1    0.028662
28            1    0.028662
74            1    0.028662
34            1    0.028662
25            1    0.028662
18            1    0.028662
29            1    0.028662
20            1    0.028662


In [14]:
# Calculate the cumulative sum of the "total of %" column (accumulated in the frame's current row order)
path_length_analysis['cumulative %'] = path_length_analysis['total of %'].cumsum()

# Filter the DataFrame to include rows where the cumulative sum is less than or equal to 99%
filtered_df = path_length_analysis[path_length_analysis['cumulative %'] <= 99]

filtered_df

Unnamed: 0,Path Length,total of %,cumulative %
1,2245,64.345085,64.345085
2,581,16.652336,80.99742
3,269,7.709946,88.707366
4,131,3.754657,92.462024
5,76,2.178275,94.640298
6,59,1.691029,96.331327
7,30,0.859845,97.191172
8,21,0.601892,97.793064
10,15,0.429923,98.222987
9,14,0.401261,98.624248


### not converted users

In [48]:
# Path length = number of non-null touchpoints in each non-converted user's path
path_lengths = expanded_paths_non_conv_user.count(axis=1)

total_num_of_path_length = path_lengths.count()

# Users per path length (index = path length, value = number of users)
path_length_counts = path_lengths.value_counts()
percent = path_length_counts / total_num_of_path_length * 100

# Create a DataFrame for path length analysis.
# NOTE: the 'Path Length' column holds the number of users with that path
# length; the path length itself is the index of the frame.
path_length_analysis = pd.DataFrame({
    'Path Length': path_length_counts,
    'total of %': percent
})

# Average path length across users. Averaging the 'Path Length' column
# would give the mean of the per-length user counts, not of the lengths.
average_path_length = path_lengths.mean()

# Print the distribution of path lengths
print("Distribution of Path Lengths:")
print(path_length_analysis.head(30))


Distribution of Path Lengths:
    Path Length  total of %
1         55119   81.633590
2          6636    9.828199
3          2401    3.555983
4          1173    1.737263
5           666    0.986374
6           348    0.515403
7           259    0.383590
8           183    0.271031
9           121    0.179206
10           97    0.143661
11           71    0.105154
12           57    0.084419
13           43    0.063685
14           35    0.051836
15           33    0.048874
20           27    0.039988
19           26    0.038507
17           25    0.037026
18           22    0.032583
16           21    0.031102
21           15    0.022216
24           15    0.022216
26           14    0.020735
25           12    0.017773
23           12    0.017773
22           12    0.017773
29            9    0.013329
27            7    0.010367
28            5    0.007405
35            4    0.005924


In [7]:
# Helper that turns one medium path string into its individual steps
def expand_medium_path(path):
    """Return the steps of a comma-separated medium path as a Series.

    A null path produces a one-element Series holding None.
    """
    if pd.isnull(path):
        return pd.Series([None])
    return pd.Series(path.split(','))

# Apply the function to the medium_path column: one column per step
expanded_paths = data_filtered['medium_path'].apply(expand_medium_path)

# The longest path in the data defines the number of step columns. Deriving
# it here (instead of hard-coding it) keeps the cell working when the data
# changes or when data_filtered carries extra columns.
max_steps = expanded_paths.shape[1]
expanded_paths.columns = [f'step{i}' for i in range(1, max_steps + 1)]

# Original user columns followed by step1 ... step<max_steps>
expanded_data = pd.concat([data_filtered, expanded_paths], axis=1)

# Final column names, kept for reference
column_names = list(expanded_data.columns)

In [8]:
# Preview the user columns together with the step1..stepN columns
expanded_data.head(15)

Unnamed: 0,user_pseudo_id,medium_path,conversion_flag,first_touchpoint,conversion_timestamp,step1,step2,step3,step4,step5,...,step102,step103,step104,step105,step106,step107,step108,step109,step110,step111
4,nP78ikNKi2iu1IJUO+LWIPcoByh3ZWFvlIzIl174i1jCuK...,paidsocial,0,2022-12-28 00:00:52.722434+00:00,NaT,paidsocial,,,,,...,,,,,,,,,,
5,Kkx/yH2k8X5rWdx+wG7DPuxpEyCsvSNwDqr4BzWGpxQw38...,cpc,0,2022-12-28 00:01:01.875587+00:00,NaT,cpc,,,,,...,,,,,,,,,,
6,brAeKj/gaKj8RkzQBIJfe/5XDKShTLwOcTh6yXQink4Kuo...,organic,0,2022-12-28 00:01:15.005975+00:00,NaT,organic,,,,,...,,,,,,,,,,
9,4i0jYrhlk2CNR94hRIat8WPFLkU8dRdcPgKA1oDlwsWbUe...,cpc,0,2022-12-28 00:02:08.431349+00:00,NaT,cpc,,,,,...,,,,,,,,,,
10,JDClr9MBvuQkNi/PeCvCFScJ0NL/3qIlq0ZWVYzoGSbsNC...,paidsocial,0,2022-12-28 00:02:24.791342+00:00,NaT,paidsocial,,,,,...,,,,,,,,,,
11,+kb+nwGF9iTcHXXgvHszlWeQLO1dxXlncw4YLR3JEBVJyj...,paidsocial,0,2022-12-28 00:03:45.109434+00:00,NaT,paidsocial,,,,,...,,,,,,,,,,
13,jNPY7CFHQQ357ZqBTiMlYNfX2FFoGs9AIBTEG4SKP54Dlg...,gotoweb,0,2022-12-28 00:04:45.740631+00:00,NaT,gotoweb,,,,,...,,,,,,,,,,
14,RB8Yadge42beQcpxhZQlgHTPCnFDydSgrZapYMdWtNkHLg...,paidsocial,0,2022-12-28 00:05:38.993307+00:00,NaT,paidsocial,,,,,...,,,,,,,,,,
15,xSj9j9Y/Van7PMtBpdN0zTCx0/gqzV5hFA5wy9kswCB1CZ...,"cpc,email,cpc",0,2022-12-28 00:06:03.526154+00:00,NaT,cpc,email,cpc,,,...,,,,,,,,,,
17,Rk/Lfpr8aLlShdqmGheoaqU7aydKIskNFf91ovfFMniU5t...,"organic,organic,organic,organic,organic,organi...",0,2022-12-28 00:06:29.739224+00:00,NaT,organic,organic,organic,organic,organic,...,,,,,,,,,,


In [9]:
# Size, dtypes and memory footprint of the expanded frame
expanded_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71009 entries, 4 to 173981
Columns: 116 entries, user_pseudo_id to step111
dtypes: datetime64[ns, UTC](2), int64(1), object(113)
memory usage: 63.4+ MB


## Which traffic source works best for conversions after one step

In [None]:
# TODO: add CD as % (CD presumably stands for conversion distribution - confirm)

In [None]:
# Converted users only
converted_users = expanded_data.loc[expanded_data['conversion_flag'] == 1]

# Paths with no second step, counted per channel of their only touchpoint
one_step_counts = converted_users.loc[converted_users['step2'].isnull(), 'step1'].value_counts()
print(one_step_counts)

# Total number of one-step conversions and each channel's share of them
total_conversions = one_step_counts.sum()
percentages = one_step_counts.div(total_conversions).mul(100)

# Bar chart of absolute counts per channel
bar_ax = one_step_counts.plot(kind='bar', stacked=True)
bar_ax.set_xlabel("Channel")
bar_ax.set_ylabel("Counts")
bar_ax.set_title("One-Step Conversions by Channel")
plt.show()

# Palette used for the pie slices
colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99', '#c2c2f0', '#ffb3e6', '#c4e17f', '#aaffc3']

# Pie chart of the percentage split
pie_fig, pie_ax = plt.subplots(figsize=(6, 6))
pie_ax.pie(percentages, labels=percentages.index, autopct='%1.1f%%', startangle=140, colors=colors)
pie_ax.set_title('Conversion Distribution')
pie_ax.axis('equal')  # keep the pie circular

plt.show()

For conversions with a one-step path, organic, cpc, and email work the best;
maybe improve 25%.

But paidsocial has fewer conversions?? Maybe it works better in a sequence.

Let's analyse deeper.

## Let's check conversion rates per traffic source

### First click attribution

In [None]:
# Filter out rows with null medium_path. .copy() gives an independent frame,
# so adding the medium_steps column does not write into a slice of `cp`.
data_filtered = cp[cp['medium_path'].notnull()].copy()
data_filtered['medium_steps'] = data_filtered['medium_path'].str.split(',')

# Number of paths that start with each traffic source
total_impressions = expanded_data['step1'].value_counts()

# One row per touchpoint
df = data_filtered.explode('medium_steps')

# Filter converted users
converted_users = expanded_data[expanded_data['conversion_flag'] == 1]

# Count conversions per first step of the medium path
first_step_counts = converted_users['step1'].value_counts()

# Calculate total conversions
total_conversions = len(converted_users)


In [None]:

# One row per touchpoint, built directly from the per-user frame.
# Writing the split lists back into the already exploded `df` and exploding
# again would repeat every touchpoint once per step of its path and inflate
# the counts below.
exploded_df = (
    data_filtered
    .assign(medium_steps=data_filtered['medium_path'].str.split(','))
    .explode('medium_steps')
)

# Count the occurrences of each traffic source across all steps
traffic_source_counts = exploded_df['medium_steps'].value_counts()

# Share of all conversions attributed to each first-touch traffic source
conversion_rates = first_step_counts / total_conversions * 100

# Create a DataFrame for first-click attribution analysis
first_click_attribution = pd.DataFrame({
    'Traffic Source': first_step_counts.index,
    'Conversions': first_step_counts,
    'Conversion Distribution Rate (%)': conversion_rates
})

# Distribution of conversions by traffic source (first-click attribution)
print(first_click_attribution.sort_values(by='Conversion Distribution Rate (%)', ascending=False))



In [None]:

# Share of conversions per first-touch traffic source.
# Passing `ax` keeps the bars on the sized figure; without it DataFrame.plot
# opens a figure of its own and the one created here stays empty.
fig, ax = plt.subplots(figsize=(10, 6))
first_click_attribution.plot(kind='bar', x='Traffic Source', y='Conversion Distribution Rate (%)', ax=ax)
plt.title('Conversion Rates by Traffic Source')
plt.xlabel('Traffic Source')
plt.ylabel('Conversion Distribution Rate (%)')
plt.xticks(rotation=45)
plt.show()

# Conversion rate per first-touch traffic source:
# converted paths starting with the source / all paths starting with it
conversion_rates = (first_step_counts / total_impressions) * 100

# Create a DataFrame for conversion rate by traffic source analysis.
# The rate column is labelled 'Conversion Rate (%)' because it is
# conversions per click, not a share of all conversions.
conversion_rate_by_source = pd.DataFrame({
    'Conversions': first_step_counts,
    'Clicks': total_impressions,
    'Conversion Rate (%)': conversion_rates
})

# Print the conversion rates by traffic source
print(conversion_rate_by_source.sort_values(by='Conversion Rate (%)', ascending=False))

Let's also check the last-click attribution model

## Last click attribution

In [None]:
# Filter out rows with null medium_path. .copy() gives an independent frame,
# so the columns added below do not write into a slice of `cp`.
data_filtered = cp[cp['medium_path'].notnull()].copy()
data_filtered['medium_steps'] = data_filtered['medium_path'].str.split(',')

# Ensure all steps are exploded into separate rows
df = data_filtered.explode('medium_steps')

# Last touchpoint of every path
data_filtered['last_step'] = data_filtered['medium_steps'].apply(lambda x: x[-1] if isinstance(x, list) else None)

# Number of paths that end with each traffic source
total_impressions_ls = data_filtered['last_step'].value_counts()

# Filter converted users
converted_users_ls = data_filtered[data_filtered['conversion_flag'] == 1]

# Count conversions per last step of the medium path
last_step_counts = converted_users_ls['last_step'].value_counts()

# Conversion rate per last-touch traffic source. The denominator must be the
# last-step totals; the first-step totals (total_impressions) belong to the
# first-click model.
conversion_rates = (last_step_counts / total_impressions_ls) * 100

# Create a DataFrame for conversion rate by traffic source analysis
conversion_rate_by_source = pd.DataFrame({
    'Conversions': last_step_counts,
    'Clicks': total_impressions_ls,
    'Conversion Rate (%)': conversion_rates
})

# Print the conversion rates by traffic source
print(conversion_rate_by_source.sort_values(by='Conversion Rate (%)', ascending=False))

paidsocial performs badly under both attribution models; let's check how often users come from this traffic source

### Clicks overall

In [None]:
# Count the occurrences of each traffic source across all steps.
# The touchpoints are taken straight from the per-user paths, so every
# touchpoint is counted exactly once.
traffic_source_counts = (
    data_filtered['medium_path']
    .str.split(',')
    .explode()
    .rename('medium_steps')
    .value_counts()
)

print(traffic_source_counts)

# Calculate the total number of steps
total_steps = traffic_source_counts.sum()

# Calculate the percentage of each traffic source
traffic_source_percentages = (traffic_source_counts / total_steps) * 100

# Display the traffic sources and their percentages
print(traffic_source_percentages)

### paidsocial analysis

In [None]:
# Data cleaning: parse both timestamp columns of the raw frame
cp['first_touchpoint'] = pd.to_datetime(cp['first_touchpoint'])
cp['conversion_timestamp'] = pd.to_datetime(cp['conversion_timestamp'])

# Filter out rows with null medium_path. .copy() gives an independent frame,
# so adding the medium_steps column does not write into a slice of `cp`.
data_filtered = cp[cp['medium_path'].notnull()].copy()
data_filtered['medium_steps'] = data_filtered['medium_path'].str.split(',')

# Ensure all steps are exploded into separate rows
df = data_filtered.explode('medium_steps')

# 1-based position of every touchpoint inside its own path. It has to be
# numbered before filtering: numbering only the 'paidsocial' rows would give
# the occurrence number of 'paidsocial', not its position in the path.
# explode() repeats the original row label, so level 0 identifies the path.
df['step_position'] = df.groupby(level=0).cumcount() + 1

# Analyze "paidsocial" channel
paidsocial_data = df[df['medium_steps'] == 'paidsocial']

# Frequency of "paidsocial" in different steps
step_counts = paidsocial_data['step_position']
step_frequency = step_counts.value_counts().sort_index()
print("Frequency of 'paidsocial' in different steps:\n", step_frequency)

# Conversion rate when "paidsocial" is involved
paidsocial_users = data_filtered[data_filtered['medium_steps'].apply(lambda steps: 'paidsocial' in steps if isinstance(steps, list) else False)]
paidsocial_conversion_rate = paidsocial_users['conversion_flag'].mean() * 100
print(f"Conversion rate for paths involving 'paidsocial': {paidsocial_conversion_rate:.2f}%")

# Common paths involving "paidsocial"
common_paths_with_paidsocial = paidsocial_users['medium_path'].value_counts().head(10)
print("Common paths involving 'paidsocial':\n", common_paths_with_paidsocial)

# Time to conversion with "paidsocial" involvement.
# .copy() so the new column is added to an independent frame.
paidsocial_conversions = paidsocial_users[paidsocial_users['conversion_flag'] == 1].copy()
paidsocial_conversions['time_to_conversion'] = paidsocial_conversions['conversion_timestamp'] - paidsocial_conversions['first_touchpoint']
time_to_conversion_stats = paidsocial_conversions['time_to_conversion'].describe()
print("Time to Conversion Stats for 'paidsocial' involvement:\n", time_to_conversion_stats)

# Plot the frequency of 'paidsocial' in different steps
plt.figure(figsize=(10, 6))
step_frequency.plot(kind='bar')
plt.title("Frequency of 'paidsocial' in Different Steps")
plt.xlabel("Step Position")
plt.ylabel("Frequency")
plt.show()

# Plot common paths involving 'paidsocial'
plt.figure(figsize=(10, 6))
common_paths_with_paidsocial.plot(kind='bar')
plt.title("Common Paths Involving 'paidsocial'")
plt.xlabel("Paths")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Filter out rows with null medium_path. .copy() gives an independent frame,
# so adding the medium_steps column does not write into a slice of `cp`.
data_filtered = cp[cp['medium_path'].notnull()].copy()
data_filtered['medium_steps'] = data_filtered['medium_path'].str.split(',')

# Ensure all steps are exploded into separate rows
df = data_filtered.explode('medium_steps')

# 1-based position of every touchpoint inside its own path. It has to be
# numbered before filtering: numbering only the 'paidsocial' rows would give
# the occurrence number of 'paidsocial', not its position in the path.
# explode() repeats the original row label, so level 0 identifies the path.
df['step_position'] = df.groupby(level=0).cumcount() + 1

# Analyze "paidsocial" channel
paidsocial_data = df[df['medium_steps'] == 'paidsocial']

# Frequency of "paidsocial" in different steps
step_counts = paidsocial_data['step_position']
step_frequency = step_counts.value_counts().sort_index()
print("Frequency of 'paidsocial' in different steps:\n", step_frequency)

# Conversion rate when "paidsocial" is involved
paidsocial_users = data_filtered[data_filtered['medium_steps'].apply(lambda steps: 'paidsocial' in steps if isinstance(steps, list) else False)]
paidsocial_conversion_rate = paidsocial_users['conversion_flag'].mean() * 100
print(f"Conversion rate for paths involving 'paidsocial': {paidsocial_conversion_rate:.2f}%")

# Common paths involving "paidsocial"
common_paths_with_paidsocial = paidsocial_users['medium_path'].value_counts().head(10)
print("Common paths involving 'paidsocial':\n", common_paths_with_paidsocial)

# Conversions for common paths involving 'paidsocial', in the same order as the top-10 list
conversion_counts = paidsocial_users.groupby('medium_path')['conversion_flag'].sum().loc[common_paths_with_paidsocial.index]
conversion_rates = (conversion_counts / common_paths_with_paidsocial) * 100

# Create a DataFrame for common paths with conversions and conversion rates
common_paths_analysis = pd.DataFrame({
    'Path': common_paths_with_paidsocial.index,
    'Total Occurrences': common_paths_with_paidsocial.values,
    'Conversions': conversion_counts.values,
    'Conversion Rate (%)': conversion_rates.values
})

print("Common paths involving 'paidsocial' with conversions and conversion rates:\n", common_paths_analysis)

# Plot the frequency of 'paidsocial' in different steps
plt.figure(figsize=(10, 6))
step_frequency.plot(kind='bar')
plt.title("Frequency of 'paidsocial' in Different Steps")
plt.xlabel("Step Position")
plt.ylabel("Frequency")
plt.show()

# Plot common paths involving 'paidsocial'
plt.figure(figsize=(10, 6))
common_paths_analysis.set_index('Path')['Total Occurrences'].plot(kind='bar')
plt.title("Common Paths Involving 'paidsocial'")
plt.xlabel("Paths")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.show()

# Plot conversion rates for common paths involving 'paidsocial'
plt.figure(figsize=(10, 6))
common_paths_analysis.set_index('Path')['Conversion Rate (%)'].plot(kind='bar')
plt.title("Conversion Rates for Common Paths Involving 'paidsocial'")
plt.xlabel("Paths")
plt.ylabel("Conversion Rate (%)")
plt.xticks(rotation=45)
plt.show()


Key Insights:
Low Overall Conversion Rate:

The conversion rate for the primary "paidsocial" path is very low (0.38%). This suggests that the majority of traffic from "paidsocial" is not leading to conversions.
Repetition Without Results:

Paths with multiple "paidsocial" interactions (e.g., "paidsocial, paidsocial", "paidsocial, paidsocial, paidsocial") also show low conversion rates. This indicates that repeated exposure through "paidsocial" does not significantly increase the likelihood of conversion.
Higher Conversion Rates in Specific Combinations:

Certain combinations, such as "paidsocial, referral" and "paidsocial, email", have higher conversion rates (2.47% and 10%, respectively). This suggests that "paidsocial" might be more effective when followed by other channels.
Recommendations:
Optimize Targeting and Ad Content:

Review the targeting criteria for "paidsocial" campaigns. Ensure that the ads are reaching the right audience. Adjust the ad content to be more compelling and relevant to the target audience.
Reduce Over-Reliance on "paidsocial" Alone:

Since repeated "paidsocial" interactions alone do not lead to higher conversions, consider diversifying the marketing strategy. Integrate "paidsocial" with other channels like "referral" and "email" which have shown better conversion rates in combination.
Leverage Retargeting:

Implement retargeting strategies where users who interact with "paidsocial" ads but do not convert are targeted with follow-up ads or emails. This can help to reinforce the message and potentially increase conversion rates.
Analyze User Journey and Content Engagement:

Perform a deeper analysis of user behavior on the landing pages. Check the bounce rates, time spent on the page, and user interaction. Identify and fix potential issues on the landing pages that could be causing drop-offs.
A/B Testing:

Conduct A/B tests to try different ad creatives, landing pages, and call-to-action (CTA) messages. Identify which variations perform better and iterate based on the results.
Follow-Up Campaigns:

For paths that show higher conversion rates (e.g., "paidsocial, email"), develop follow-up campaigns specifically targeting users who initially came through "paidsocial". This could involve sending personalized emails or offers to re-engage these users.
Monitor and Adjust Budget Allocation:

Given the low conversion rate of "paidsocial" when used alone, consider reallocating some of the budget to other channels or combinations that have proven to be more effective. Continuously monitor performance and adjust the budget allocation accordingly.
Improving Attribution and Tracking:

Ensure that tracking mechanisms are accurately capturing the entire user journey. Sometimes, attribution errors can lead to misinterpretation of which channels are contributing to conversions. Implementing advanced attribution models can help in understanding the true impact of each channel.
Conclusion:
The current performance of "paidsocial" campaigns indicates that while they are generating clicks, they are not effectively converting users. By optimizing targeting, content, and integrating "paidsocial" with other channels, the marketing team can potentially improve the conversion rates. Continuous testing, analysis, and optimization will be crucial in enhancing the effectiveness of "paidsocial" campaigns.

## common conversion path

In [None]:
import pandas as pd

# Widen pandas' display limits so long value_counts tables and wide frames
# are shown in full; set_option takes the options as name/value pairs.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)


In [None]:
# Identify the most common conversion paths.
# Each converted user's path is counted once from the per-user frame; `df`
# holds one row per touchpoint, so counting there would weight every path
# by its number of steps.
common_conversion_paths = (
    data_filtered.loc[data_filtered['conversion_flag'] == 1, 'medium_path']
    .value_counts()
    .head(20)
)
common_conversion_paths.head(14)

time to conversion

In [None]:
# Parse the timestamps on a new frame (assign) instead of writing into a
# frame that was sliced from `cp`.
data_filtered = data_filtered.assign(
    first_touchpoint=pd.to_datetime(data_filtered['first_touchpoint']),
    conversion_timestamp=pd.to_datetime(data_filtered['conversion_timestamp']),
)

# Calculate the time to conversion, one value per user
time_to_conversion = data_filtered['conversion_timestamp'] - data_filtered['first_touchpoint']

# Kept for the touchpoint-level frame as well (values are aligned on the row label)
df['time_to_conversion'] = time_to_conversion

# Descriptive stats over converted users. They are taken from the per-user
# series: `df` has one row per touchpoint, so describing it would count
# every user once per step of their path.
time_to_conversion_stats = time_to_conversion[data_filtered['conversion_flag'] == 1].describe()
print("Time to Conversion Stats:\n", time_to_conversion_stats)


push users with email 

think pater name contrast effect

## Summary

1) Most conversions happen after interaction with only one channel;
organic, cpc, and email work the best and bring most of the conversions.
2) The different types of attribution also confirmed that organic, cpc, and email have the highest conversion rates. And the most converted users are from 'gotoweb' (users that come from the application). Given this, we can use application users as the main audience and also as a lookalike audience for other paid campaigns (cpc, email, ...).
3) The channel that performs the lowest is 'paidsocial': it has the highest click rate but the lowest conversion. This channel needs the most attention and optimisation. The current performance of "paidsocial" campaigns indicates that while they are generating clicks, they are not effectively converting users. By optimizing targeting, content, and integrating "paidsocial" with other channels, the marketing team can potentially improve the conversion rates. Continuous testing, analysis, and optimization will be crucial in enhancing the effectiveness of "paidsocial" campaigns.
4) On average it takes a user 4 days to convert, so we can use this time for our email campaigns, sending new offers with discounts or special proposals to convince the user to convert, plus retargeting.