In [1]:
import pandas as pd
from datetime import datetime

#Assign each file a variable name
file1 = 'GA_RevTranSessUser.csv'
file2 = 'GA_ChannelRevUsersTrans.csv'
file3 = 'GA_DeviceRevUsersTrans.csv'
file4 = 'DailyPlan.csv'
file5 = 'Media Spend.csv'

#The three GA files and the two planning files write their dates in different layouts
GA_DATE_FORMAT = '%Y%m%d'
PLAN_DATE_FORMAT = '%m/%d/%Y'


def load_dated_csv(path, date_format):
    """Read a CSV file and convert its 'Date' column to datetime.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    date_format : str
        strftime-style layout of the values in the file's 'Date' column.

    Returns
    -------
    pandas.DataFrame
        The file contents with 'Date' parsed using ``date_format``.
    """
    frame = pd.read_csv(path)
    frame['Date'] = pd.to_datetime(frame['Date'], format=date_format)
    return frame


#Import each file and format its Date column into the correct Date format
ga1_df = load_dated_csv(file1, GA_DATE_FORMAT)
ga2_df = load_dated_csv(file2, GA_DATE_FORMAT)
ga3_df = load_dated_csv(file3, GA_DATE_FORMAT)
dailyplan_df = load_dated_csv(file4, PLAN_DATE_FORMAT)
mediaspend_df = load_dated_csv(file5, PLAN_DATE_FORMAT)

#Check the date field format for each
#info() writes its report itself and returns None, so wrapping it in print() only adds a stray "None" line
for loaded_df in (ga1_df, ga2_df, ga3_df, dailyplan_df, mediaspend_df):
    loaded_df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359 entries, 0 to 358
Data columns (total 10 columns):
Date                     359 non-null datetime64[ns]
Revenue                  359 non-null float64
Users                    359 non-null int64
Sessions                 359 non-null int64
Transactions             359 non-null int64
Quantity                 359 non-null int64
Bounces                  359 non-null int64
Pages / Session          359 non-null float64
Avg. Session Duration    359 non-null object
Session Duration         359 non-null object
dtypes: datetime64[ns](1), float64(2), int64(5), object(2)
memory usage: 28.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4062 entries, 0 to 4061
Data columns (total 5 columns):
Date                        4062 non-null datetime64[ns]
Default Channel Grouping    4062 non-null object
Users                       4062 non-null float64
Revenue                     4062 non-null float64
Transactions                4062 non-null

In [2]:
#ga2 included rows for both "Affiliates" and "Affiliate". Update occurrences of "Affiliates" to "Affiliate"
#The nested mapping limits the replacement to the channel column instead of scanning every column
ga2_df = ga2_df.replace({'Default Channel Grouping': {'Affiliates': 'Affiliate'}})

#ga2 now has multiple rows for the same date for Affiliate. Sum Users, Revenue & Transactions when two rows occur
#The metric columns are selected with a list: selecting several columns with a bare tuple of names
#is deprecated and raises an error in current pandas releases
ga2_df = (
    ga2_df
    .groupby(['Date', 'Default Channel Grouping'])[['Users', 'Revenue', 'Transactions']]
    .sum()
    .reset_index()
)

#Pivot the ga2, ga3 & media spend files so that there is a single row for each date
#After the pivot each frame is indexed by Date and has one column per (metric, category) pair
ga2_df = ga2_df.pivot(index='Date', columns='Default Channel Grouping')
ga3_df = ga3_df.pivot(index='Date', columns='Device Category')
mediaspend_df = mediaspend_df.pivot(index='Date', columns='Channel')

#info() writes its report itself and returns None, so it is not wrapped in print()
ga2_df.info()
ga3_df.info()
mediaspend_df.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 359 entries, 2016-12-04 to 2017-11-27
Data columns (total 39 columns):
(Users, (Other))                  359 non-null float64
(Users, Affiliate)                359 non-null float64
(Users, CSE)                      340 non-null float64
(Users, Direct)                   359 non-null float64
(Users, Display)                  359 non-null float64
(Users, Email)                    359 non-null float64
(Users, Organic Search)           359 non-null float64
(Users, Organic Social)           359 non-null float64
(Users, Paid Search)              359 non-null float64
(Users, Paid Social)              359 non-null float64
(Users, Partnerships)             108 non-null float64
(Users, Referral)                 359 non-null float64
(Users, Social)                   1 non-null float64
(Revenue, (Other))                359 non-null float64
(Revenue, Affiliate)              359 non-null float64
(Revenue, CSE)                    340 non-null float6

In [3]:
#Inspect the daily plan columns and their non-null counts
#info() writes its report itself and returns None, so wrapping it in print() only adds a stray "None" line
dailyplan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 21 columns):
Date                                               392 non-null datetime64[ns]
TY Event                                           267 non-null object
TY Email                                           257 non-null object
Promo or Non Promo Day                             35 non-null object
LY Event                                           217 non-null object
LY Email                                           256 non-null object
Daily Revenue Demand Plan                          392 non-null float64
Daily Traffic Plan                                 392 non-null float64
Daily Net Shipped Revenue Plan                     392 non-null float64
Annual Demand Forecast (Yearly Rolling)            29 non-null float64
Annual Net Shipped Revenue Plan                    28 non-null float64
Orders Plan                                        392 non-null object
Units Plan                       

In [4]:
#Create a new column with either 1 or 0, 1 if there was an email that day, 0 if no email
#notnull() yields True/False per row; the explicit int64 cast turns that into 1/0 with a stated dtype
dailyplan_df['Email Day'] = dailyplan_df['TY Email'].notnull().astype('int64')

#info() writes its report itself and returns None, so it is not wrapped in print()
dailyplan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 22 columns):
Date                                               392 non-null datetime64[ns]
TY Event                                           267 non-null object
TY Email                                           257 non-null object
Promo or Non Promo Day                             35 non-null object
LY Event                                           217 non-null object
LY Email                                           256 non-null object
Daily Revenue Demand Plan                          392 non-null float64
Daily Traffic Plan                                 392 non-null float64
Daily Net Shipped Revenue Plan                     392 non-null float64
Annual Demand Forecast (Yearly Rolling)            29 non-null float64
Annual Net Shipped Revenue Plan                    28 non-null float64
Orders Plan                                        392 non-null object
Units Plan                       

In [5]:
#Create a new column for promo type and extract the type of sale from TY Event
#str.extract keeps only the first match in each TY Event value; rows without a match get NaN
#expand=False asks for a Series explicitly rather than relying on the version-dependent default
#NOTE(review): the pattern has no word boundaries, so short codes such as 'ss' or 'SS' could also
#match inside a longer word - confirm against the actual TY Event values
promo_pattern = '(sale on sale|Sale on Sale|SOS|sos|markdowns|Markdowns|SS|Surprise Sale|surprise sale|ss|GWP|DOTD|DOD|New Arrivals|new arrivals|Friends & Family|friends & family|SWP|Employee Appreciation|employee appreciation)'
dailyplan_df['Promo Type'] = dailyplan_df['TY Event'].str.extract(promo_pattern, expand=False)

#Create a new column for each promo type, put 1 if occurred 0 if not
#Each entry pairs a flag column with the extracted labels that count as that promo type;
#a list of pairs (not a dict) keeps the new columns in this exact order
promo_flag_labels = [
    ('SS', ['surprise sale', 'ss', 'Surprise Sale', 'SS']),
    ('MD', ['Markdowns', 'markdowns']),
    ('GWP', ['GWP']),
    ('FF', ['Friends & Family', 'friends & family']),
    ('DOTD', ['DOD', 'DOTD']),
    ('NA', ['New Arrivals', 'new arrivals']),
    ('SOS', ['Sale on Sale', 'sale on sale', 'SOS', 'sos']),
    ('SWP', ['SWP']),
    ('EA', ['Employee Appreciation', 'employee appreciation']),
]
for flag_column, accepted_labels in promo_flag_labels:
    #isin() is False for NaN, so days without a recognised promo get 0
    dailyplan_df[flag_column] = dailyplan_df['Promo Type'].isin(accepted_labels).astype('int64')

#info() writes its report itself and returns None, so it is not wrapped in print()
dailyplan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 32 columns):
Date                                               392 non-null datetime64[ns]
TY Event                                           267 non-null object
TY Email                                           257 non-null object
Promo or Non Promo Day                             35 non-null object
LY Event                                           217 non-null object
LY Email                                           256 non-null object
Daily Revenue Demand Plan                          392 non-null float64
Daily Traffic Plan                                 392 non-null float64
Daily Net Shipped Revenue Plan                     392 non-null float64
Annual Demand Forecast (Yearly Rolling)            29 non-null float64
Annual Net Shipped Revenue Plan                    28 non-null float64
Orders Plan                                        392 non-null object
Units Plan                       

  


In [6]:
#Create 1 DataFrame for all 5 files
#join(on='Date') matches the left frame's 'Date' column against the right frame's index;
#how='inner' keeps only the dates present on both sides
#NOTE(review): the recorded output lists columns such as "(Spend, Referral)", i.e. two-part labels
#joined onto plain ones - newer pandas versions may refuse to join frames whose column labels have
#different numbers of levels; confirm on the pandas version in use
ga1_ga2_joined_df = ga1_df.join(ga2_df, how='inner', on='Date')
ga_all_joined_df = ga1_ga2_joined_df.join(ga3_df, how='inner', on='Date')
plan_spend_df = dailyplan_df.join(mediaspend_df, how='inner', on='Date')

#Both sides carry 'Date' as an ordinary column here, so merge on it directly
all_df = ga_all_joined_df.merge(plan_spend_df, how='inner', on='Date')

#info() writes its report itself and returns None, so wrapping it in print() only adds a stray "None" line
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 359 entries, 0 to 358
Columns: 102 entries, Date to (Spend, Referral)
dtypes: datetime64[ns](1), float64(70), int32(1), int64(20), object(10)
memory usage: 287.5+ KB
None




In [7]:
#Save the combined DataFrame to a CSV file in the working directory (the row index is written too)
combined_csv_path = 'all_df.csv'
all_df.to_csv(combined_csv_path)