In [1]:
import pandas as pd
from datetime import datetime

# Names of the five source CSV exports
file1, file2, file3 = (
    'GA_RevTranSessUser.csv',
    'GA_ChannelRevUsersTrans.csv',
    'GA_DeviceRevUsersTrans.csv',
)
file4, file5 = 'DailyPlan.csv', 'Media Spend.csv'

# Read every export into its own DataFrame
ga1_df, ga2_df, ga3_df, dailyplan_df, mediaspend_df = (
    pd.read_csv(csv_name) for csv_name in (file1, file2, file3, file4, file5)
)

# Parse the Date column of each frame into datetime64.
# The three GA exports store dates as YYYYMMDD; the plan and spend files use MM/DD/YYYY.
for ga_frame in (ga1_df, ga2_df, ga3_df):
    ga_frame['Date'] = pd.to_datetime(ga_frame['Date'], format='%Y%m%d')
for plan_frame in (dailyplan_df, mediaspend_df):
    plan_frame['Date'] = pd.to_datetime(plan_frame['Date'], format='%m/%d/%Y')

# Show the schema of each frame to confirm the Date columns were converted
for loaded_frame in (ga1_df, ga2_df, ga3_df, dailyplan_df, mediaspend_df):
    print(loaded_frame.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359 entries, 0 to 358
Data columns (total 10 columns):
Date                     359 non-null datetime64[ns]
Revenue                  359 non-null float64
Users                    359 non-null int64
Sessions                 359 non-null int64
Transactions             359 non-null int64
Quantity                 359 non-null int64
Bounces                  359 non-null int64
Pages / Session          359 non-null float64
Avg. Session Duration    359 non-null object
Session Duration         359 non-null object
dtypes: datetime64[ns](1), float64(2), int64(5), object(2)
memory usage: 28.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4062 entries, 0 to 4061
Data columns (total 5 columns):
Date                        4062 non-null datetime64[ns]
Default Channel Grouping    4062 non-null object
Users                       4062 non-null float64
Revenue                     4062 non-null float64
Transactions                4062 non-null

In [2]:
# NOTE: this cell rebinds ga2_df, ga3_df and mediaspend_df to their pivoted forms, so it
# cannot be re-run on its own output; re-run the loading cell first.

# ga2 included rows for both "Affiliates" and "Affiliate". Update occurrences of
# "Affiliates" to "Affiliate" (replace() matches whole cell values across the frame).
ga2_df = ga2_df.replace({'Affiliates': 'Affiliate'})

# ga2 now has multiple rows for the same date for Affiliate. Sum Users, Revenue &
# Transactions whenever a (Date, channel) pair occurs more than once.
# The metric columns are selected with a *list*: selecting with a bare tuple of keys,
# i.e. groupby(...)['Users', 'Revenue', 'Transactions'], is rejected by current pandas.
ga2_df = (
    ga2_df
    .groupby(['Date', 'Default Channel Grouping'])[['Users', 'Revenue', 'Transactions']]
    .sum()
    .reset_index()
)

# Pivot the ga2, ga3 & media spend files so that there is a single row for each date.
# Each pivot yields two-level column labels: (metric, channel / device category).
ga2_df = ga2_df.pivot(index='Date', columns='Default Channel Grouping')
ga3_df = ga3_df.pivot(index='Date', columns='Device Category')
mediaspend_df = mediaspend_df.pivot(index='Date', columns='Channel')
print(ga2_df.head())
print(ga3_df.head())
print(mediaspend_df.head())


                           Users                                               \
Default Channel Grouping (Other) Affiliate     CSE   Direct  Display    Email   
Date                                                                            
2016-12-04                  71.0   11819.0  1277.0  33488.0   6123.0  91991.0   
2016-12-05                 704.0   10391.0  1376.0  31868.0   4457.0  36808.0   
2016-12-06                 444.0   10372.0   891.0  30780.0   4461.0  98907.0   
2016-12-07                 144.0   23522.0  1132.0  30330.0   5299.0  55409.0   
2016-12-08                  77.0   18435.0  1635.0  31412.0  26715.0  99078.0   

                                                                    \
Default Channel Grouping Organic Search Organic Social Paid Search   
Date                                                                 
2016-12-04                      78178.0         2557.0     62804.0   
2016-12-05                      71858.0         3150.0     59878.0   
2

In [3]:
# Preview the first five rows of the daily plan
print(dailyplan_df.head(n=5))

        Date                                           TY Event  \
0 2016-12-04               sm natalya DOD/emp app/sos nyc hw ca   
1 2016-12-05        quiet dod.employee appreciation 12/1 - 12/5   
2 2016-12-06                                                NaN   
3 2016-12-07  sos leak Fragrance GWP: Free tote bag with FP ...   
4 2016-12-08  Fragrance GWP: Free tote bag with FP fragrance...   

                                            TY Email Promo or Non Promo Day  \
0                           9 AM: DOTD small natalya                    NaN   
1                                                NaN                    NaN   
2  9 AM: new arrivals + healy lane lilith footer ...                    NaN   
3           9 AM: #missadventure + miss piggy footer                    NaN   
4    9 AM: gifts $100 + under + GWP fragrance footer                    NaN   

  LY Event LY Email  Daily Revenue Demand Plan  Daily Traffic Plan  \
0      NaN      NaN                  1830000.0      

In [4]:
# Add an 'Email Day' indicator: 1 when 'TY Email' holds a value for that day, 0 when it is missing
has_email = dailyplan_df['TY Email'].notna()
dailyplan_df['Email Day'] = has_email * 1  # boolean mask -> 1/0
print(dailyplan_df.head())

        Date                                           TY Event  \
0 2016-12-04               sm natalya DOD/emp app/sos nyc hw ca   
1 2016-12-05        quiet dod.employee appreciation 12/1 - 12/5   
2 2016-12-06                                                NaN   
3 2016-12-07  sos leak Fragrance GWP: Free tote bag with FP ...   
4 2016-12-08  Fragrance GWP: Free tote bag with FP fragrance...   

                                            TY Email Promo or Non Promo Day  \
0                           9 AM: DOTD small natalya                    NaN   
1                                                NaN                    NaN   
2  9 AM: new arrivals + healy lane lilith footer ...                    NaN   
3           9 AM: #missadventure + miss piggy footer                    NaN   
4    9 AM: gifts $100 + under + GWP fragrance footer                    NaN   

  LY Event LY Email  Daily Revenue Demand Plan  Daily Traffic Plan  \
0      NaN      NaN                  1830000.0      

In [55]:
# Promo flag columns and the keywords in 'TY Event' that identify each one.
# Keywords are matched case-insensitively and as whole words, so "ss" no longer
# matches inside ordinary words and "dod" / "DOD" / "Dod" are all recognised.
# The keywords contain no regex metacharacters, so they are joined without escaping.
promo_keywords = [
    ('SS', ['surprise sale', 'ss']),
    ('MD', ['markdowns']),
    ('GWP', ['gwp']),
    ('FF', ['friends & family']),
    ('DOTD', ['dotd', 'dod']),
    ('NA', ['new arrivals']),
    ('SOS', ['sale on sale', 'sos']),
    ('SWP', ['swp']),
    ('EA', ['employee appreciation']),
]

ty_event = dailyplan_df['TY Event']

# 'Promo Type' keeps the FIRST promo keyword found in the event text (NaN when none).
# expand=False guarantees a Series regardless of the pandas default.
every_keyword = [keyword for _, keywords in promo_keywords for keyword in keywords]
promo_type_pattern = r'(?i)\b(' + '|'.join(every_keyword) + r')\b'
dailyplan_df['Promo Type'] = ty_event.str.extract(promo_type_pattern, expand=False)

# One column per promo type: 1 if it occurred that day, 0 if not.
# Each flag is tested against the full event text rather than against 'Promo Type',
# because a day can run several promos at once (e.g. "DOD/emp app/sos") and the single
# extracted value would otherwise hide every promo after the first.
for flag_column, keywords in promo_keywords:
    flag_pattern = r'(?i)\b(?:' + '|'.join(keywords) + r')\b'
    occurred = ty_event.str.contains(flag_pattern, regex=True, na=False)  # NaN event -> no promo
    dailyplan_df[flag_column] = occurred.astype('int64')

print(dailyplan_df.head())

        Date                                           TY Event  \
0 2016-12-04               sm natalya DOD/emp app/sos nyc hw ca   
1 2016-12-05        quiet dod.employee appreciation 12/1 - 12/5   
2 2016-12-06                                                NaN   
3 2016-12-07  sos leak Fragrance GWP: Free tote bag with FP ...   
4 2016-12-08  Fragrance GWP: Free tote bag with FP fragrance...   

                                            TY Email Promo or Non Promo Day  \
0                           9 AM: DOTD small natalya                    NaN   
1                                                NaN                    NaN   
2  9 AM: new arrivals + healy lane lilith footer ...                    NaN   
3           9 AM: #missadventure + miss piggy footer                    NaN   
4    9 AM: gifts $100 + under + GWP fragrance footer                    NaN   

  LY Event LY Email  Daily Revenue Demand Plan  Daily Traffic Plan  \
0      NaN      NaN                  1830000.0      

  


In [71]:
#Create 1 DataFrame for all 5 files

def _with_flat_columns(frame):
    """Return a copy of ``frame`` whose column labels form a single-level Index.

    The pivoted frames carry two-level column labels, while the frames they are joined
    to have one level. Current pandas refuses to join frames with a different number of
    column levels, so each two-level label is kept as a plain tuple such as
    ('Spend', 'Referral') in a flat Index. Frames that already have single-level
    columns come back with the same labels.
    """
    flat = frame.copy()
    # tupleize_cols=False stops pandas from rebuilding a MultiIndex out of the tuples
    flat.columns = pd.Index(list(frame.columns), tupleize_cols=False)
    return flat


# Each join matches the left frame's 'Date' column against the pivoted frame's Date index
ga1_ga2_joined_df = ga1_df.join(_with_flat_columns(ga2_df), how='inner', on='Date')
ga_all_joined_df = ga1_ga2_joined_df.join(_with_flat_columns(ga3_df), how='inner', on='Date')
plan_spend_df = dailyplan_df.join(_with_flat_columns(mediaspend_df), how='inner', on='Date')

# Both sides now have a 'Date' column, so combine them with a column-on-column merge
all_df = ga_all_joined_df.merge(plan_spend_df, how='inner', on='Date')
print(all_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 359 entries, 0 to 358
Columns: 103 entries, Date to (Spend, Referral)
dtypes: datetime64[ns](1), float64(70), int32(1), int64(20), object(11)
memory usage: 290.3+ KB
None


In [73]:
# Write the combined frame to disk; the row index is written as the first column
all_df.to_csv('all_df.csv', index=True)