In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import logging

# Route log records to 'log.log' (opened in append mode), prefixing each message
# with its timestamp, logger name and level.
logging.basicConfig(
    filename='log.log',
    filemode='a',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)


In [None]:
def popZeroID(row):
    """Return the part of ``row['TitleID']`` that precedes the first '.'.

    ``row`` is any mapping-like object (e.g. a DataFrame row) whose 'TitleID'
    entry is a string; '12345.0' becomes '12345', and a value without a '.'
    is returned unchanged.
    """
    whole_part, _, _ = row['TitleID'].partition(".")
    return whole_part

def popZeroDate(row):
    """Return the part of ``row['YearMonth']`` that precedes the first '.'.

    ``row`` is any mapping-like object (e.g. a DataFrame row) whose 'YearMonth'
    entry is a string; '201904.0' becomes '201904', and a value without a '.'
    is returned unchanged.
    """
    whole_part, _, _ = row['YearMonth'].partition(".")
    return whole_part

def splitYear(row):
    """Turn a 'YYYYMM...' string in ``row['YearMonth']`` into 'MM/YYYY'.

    Only the first six characters are used: characters 0-3 are taken as the
    year and characters 4-5 as the month, so '201904' (or '201904.0')
    yields '04/2019'.
    """
    stamp = row['YearMonth']
    return f"{stamp[4:6]}/{stamp[:4]}"


In [None]:

# Read in new Print and ER sales.
# The path is handed straight to pandas (instead of an open() handle that was never
# closed), so pandas opens and closes the workbook itself.
PRH_new = pd.read_excel(
    'data_update/{}.xlsx'.format(new_pe_data),
    sheet_name='PRH_Jan-Feb',
)

# Read in new EL sales.
# NOTE: unlike new_pe_data above, no '.xlsx' is appended here, so new_el_file
# must already include its file extension.
thirdPartyTransacations_new = pd.read_excel(
    'data_update/{}'.format(new_el_file),
    sheet_name='3rdPartyEbookUpdates',
)


In [None]:
# Store the identifier columns of the existing data as strings.
for id_column in ('TitleID', 'ISBN'):
    existing_data[id_column] = existing_data[id_column].astype(str)


In [None]:
# Combine all new sales into a single DF
new_data = pd.concat([PRH_new, thirdPartyTransacations_new])

# We need to remove Gratis copies from the sales data.
# new_data_clean is the boolean row mask (True = keep the row).
new_data_clean = (new_data['revenuetype'] != 'Gratis copies')

# Every step below returns a fresh DataFrame, so nothing is written into the
# filtered slice (the original in-place edits triggered SettingWithCopyWarning).
# Steps, in order:
#   1. keep only the non-gratis rows
#   2. rename the raw export columns to the names used by the existing sales data
#   3. normalise 'Date' to an 'mm/YYYY' string (e.g. '04/2019')
#   4. drop rows that are empty in every column
#   5. store 'TitleID' and 'ISBN' as strings
new_data = (
    new_data[new_data_clean]
    .rename(columns={
        'date': 'Date',
        'unitssold': 'Units Sold',
        'saleprice': 'Dom. Sale Price',
        'actualamount': 'Amount Sold',
        'isbn': 'ISBN',
    })
    .assign(Date=lambda sales: pd.to_datetime(sales['Date']).dt.strftime('%m/%Y'))
    .dropna(how="all")
    # NOTE: astype(str) turns a missing identifier into the literal string 'nan'.
    .assign(
        TitleID=lambda sales: sales['TitleID'].astype(str),
        ISBN=lambda sales: sales['ISBN'].astype(str),
    )
)



In [None]:
# Divide up print and e sales, totalling each TitleID / ISBN / month combination.
# numeric_only=True restricts the totals to numeric columns; without it,
# pandas >= 2.0 also tries to "sum" text columns such as 'revenuetype'.

# Print = every row whose revenue type is not 'Electronic'.
print_sales = (
    new_data[new_data['revenuetype'] != 'Electronic']
    .groupby(['TitleID', 'ISBN', 'Date'])
    .sum(numeric_only=True)
    .reset_index()
    .assign(Format='Print')
)

er_sales = (
    new_data[new_data['revenuetype'] == 'Electronic']
    .groupby(['TitleID', 'ISBN', 'Date'])
    .sum(numeric_only=True)
    .reset_index()
)
# Keep only the part of TitleID before the first '.' ('12345.0' -> '12345').
# Vectorised equivalent of applying popZeroID row by row.
er_sales['TitleID'] = er_sales['TitleID'].str.split('.').str[0]
er_sales['Format'] = 'Electronic'

In [None]:
# Merge print and electronic sales
# Stack the existing data with the new electronic and print totals, keeping
# only the shared reporting columns (in this order).
merged_sales = pd.concat([existing_data, er_sales, print_sales])[
    ['TitleID', 'ISBN', 'Date', 'Units Sold', 'Amount Sold', 'Format']
]

In [None]:
# Write the merged sales without the DataFrame index. `index` is a boolean
# option, so pass False explicitly rather than relying on None being falsy.
merged_sales.to_csv('data_update/merged_sales_update.csv', index=False)

In [None]:
# Read book-level usage.
# The file you should download from PowerBI is 'MITPB_All_BOOKS_DATA_REPORT'.
# You have to download two versions: one with chapter downloads and one with monthly
# downloads. Trying to export all at once is too large.
#
# Steps:
#   1. read the 'Export' sheet (pandas opens and closes the file itself; the
#      original open() handle was never closed)
#   2. drop rows missing 'Container_OnlineIdentifier'
#   3. rename 'Container_ExternalIdentifier' -> 'TitleID' and
#      'Container_OnlineIdentifier' -> 'isbn'
#   4. store 'isbn', 'TitleID' and 'YearMonth' as strings, keeping only the part of
#      'isbn' / 'TitleID' before the first '.' ('12345.0' -> '12345')
# NOTE(review): popZeroISBN is not defined anywhere in the visible notebook, so the
# suffix is stripped inline with the same rule popZeroID uses - confirm that this
# matches the intended ISBN clean-up.
book_downloads = (
    pd.read_excel('data_update/all_direct_book_downloads.xlsx', sheet_name='Export')
    .dropna(subset=['Container_OnlineIdentifier'])
    .rename(columns={'Container_ExternalIdentifier': 'TitleID', 'Container_OnlineIdentifier': 'isbn'})
    .assign(
        isbn=lambda usage: usage['isbn'].astype(str).str.split('.').str[0],
        TitleID=lambda usage: usage['TitleID'].astype(str).str.split('.').str[0],
        YearMonth=lambda usage: usage['YearMonth'].astype(str),
    )
)

# Build 'date' as mm/YYYY (e.g. 04/2019) from the first six characters of
# 'YearMonth' (YYYYMM; any trailing '.0' is ignored). The explicit format makes
# a malformed value fail loudly instead of being guessed at.
book_downloads['date'] = (
    pd.to_datetime(book_downloads['YearMonth'].str[:6], format='%Y%m')
    .dt.strftime('%m/%Y')
)

# Add a 'Format' column and populate it with 'Downloads'.
book_downloads['Format'] = 'Downloads'
book_downloads.to_csv('data_output/book_downloads.csv', index=False)

# Preview only the first rows rather than rendering the whole frame.
book_downloads.head()


In [None]:

# Read chapter-level usage.
#
# Steps:
#   1. read the 'Export' sheet (pandas opens and closes the file itself; the
#      original open() handle was never closed)
#   2. drop rows missing 'Container_OnlineIdentifier'
#   3. rename 'Container_ExternalIdentifier' -> 'TitleID' and
#      'Container_OnlineIdentifier' -> 'isbn'
#   4. store 'isbn' and 'TitleID' as strings, keeping only the part before the
#      first '.' ('12345.0' -> '12345')
#   5. add a 'Format' column populated with 'Downloads'
# NOTE(review): popZeroISBN is not defined anywhere in the visible notebook, so the
# suffix is stripped inline with the same rule popZeroID uses - confirm that this
# matches the intended ISBN clean-up.
chapter_downloads = (
    pd.read_excel('data_update/all_direct_chapter_downloads.xlsx', sheet_name='Export')
    .dropna(subset=['Container_OnlineIdentifier'])
    .rename(columns={'Container_ExternalIdentifier': 'TitleID', 'Container_OnlineIdentifier': 'isbn'})
    .assign(
        isbn=lambda usage: usage['isbn'].astype(str).str.split('.').str[0],
        TitleID=lambda usage: usage['TitleID'].astype(str).str.split('.').str[0],
        Format='Downloads',
    )
)

# Write the file once (the original cell wrote the same CSV twice).
chapter_downloads.to_csv('data_output/chapter_downloads.csv', index=False)

# Preview only the first rows rather than rendering the whole frame.
chapter_downloads.head()
