# Extract Daily Oracle Emails and Copy to Spreadsheet

Daily Oracle emails contain 2 types of tables for panels 2, 5, 7:
    1. The first is a current summary of the panels. 
    2. The second is a history table for each panel. Only the latest history table is needed.
    
The emails are extracted, parsed and saved into a consolidated spreadsheet

### Global Parameters

In [34]:
# --- Spreadsheet location ---
spreadsheet_directory = 'C:/Users/thompja/OneDrive - Sky/106 Panel Expansion'
spreadsheet_to_open = 'Oracle Panel Data From Email.xlsx'
sheet_to_open = 'Oracle Report Data'

# --- Spreadsheet layout used to store the email tables ---
# The two-element lists hold one entry per table (first entry: snapshot table,
# second entry: history table).
table_output_col_by_table = [1, 10]  # first spreadsheet column of each table
# Position of the date field within each parsed table row.
# NOTE(review): entry 2 for the history table lines up with 'on_panel' in
# schema_history rather than 'last_day' - confirm this is intended.
table_date_col = [1, 2]
spreadsheet_first_data_row = 5  # first spreadsheet row holding data; same for each table
table_2_num_cols = 8  # column count of the history table, used when clearing it

# --- Email folder and subfolder that contain the emails to process ---
inbox_index = 6
subfldr_index = 8

# --- Message settings ---
num_tables_to_process = 2  # only the first N tables in each message are processed

# While scanning a message, the row directly above the first data row of a table
# and the row directly below its last data row are recognised by a marker
# character at a fixed position in the line.
start_table_pos, start_table_char = 25, '-'
end_table_pos, end_table_char = 25, '-'




#### Table Schemas

A schema is needed to translate each table in the email into spreadsheet columns.
The tables in the email are fixed-width, so each column is defined by a name plus a start and end character position:

In [35]:
# Each schema entry is [column name, start position, end position]; a column is
# read from a message line as line[start:end].

# Table 1: Snapshot - amend as required
schema_snapshot = [
    ['dated', 0, 9],
    ['panel', 10, 20],
    ['panel_subs', 21, 31],
    ['unique_stb', 32, 42],
    ['pct_success', 43, 54],
]

# Table 2: History - amend as required
schema_history = [
    ['panel', 0, 10],
    ['on_panel', 11, 21],
    ['new_boxes', 22, 32],
    ['connected', 33, 43],
    ['not_connected', 44, 57],
    ['last_day', 58, 68],
]

# All table schemas in the order the tables appear in the email, so they can
# be looped through by table number
table_schemas = [schema_snapshot, schema_history]



### Import Packages

In [36]:
import win32com.client
import openpyxl
import numpy as np
import datetime
import time
import calendar
import os

### Define Functions

In [37]:

def most_recent_date_in_spreadsheet(ws, search_row, search_col):
    """Scan a date column downwards and report where it ends and its latest date.

    Starting at ``search_row``, cells in ``search_col`` of worksheet ``ws`` are
    read one row at a time until the first empty (``None``) cell is reached.

    Parameters:
        ws: worksheet object exposing ``cell(row=, column=)`` with a ``value``.
        search_row: first row to inspect.
        search_col: column holding the dates, either as ``dd/mm/yy`` strings
            or as date/datetime values.

    Returns:
        ``(row, recent_date)`` where ``row`` is the first empty row found and
        ``recent_date`` is the latest date seen, as seconds since the epoch.
        ``recent_date`` is never earlier than the 10/11/17 floor below.

    Raises:
        ValueError: if a string cell does not match ``dd/mm/yy``.
    """
    # Floor for the comparison (10 Nov 2017): returned when the column is
    # empty or only holds earlier dates.
    recent_date = calendar.timegm(time.strptime('10/11/17', "%d/%m/%y"))

    while True:
        cell_value = ws.cell(row=search_row, column=search_col).value
        if cell_value is None:
            # first empty cell marks the end of the data
            break

        if isinstance(cell_value, datetime.date):
            # date-formatted cells can come back as date/datetime objects
            # (datetime.datetime is a subclass of datetime.date)
            excel_date = calendar.timegm(cell_value.timetuple())
        else:
            excel_date = calendar.timegm(time.strptime(cell_value, "%d/%m/%y"))

        recent_date = max(recent_date, excel_date)
        search_row += 1  # move on to the next row in the spreadsheet

    return (search_row, recent_date)

    
def open_email(inbox_index, subfldr_index):
    """Return all messages held in an Outlook subfolder.

    Connects to Outlook through its MAPI namespace, takes the default folder
    identified by ``inbox_index`` and returns the ``Items`` collection of the
    subfolder at ``subfldr_index`` inside it.
    """
    mapi_namespace = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
    parent_folder = mapi_namespace.GetDefaultFolder(inbox_index)
    return parent_folder.Folders[subfldr_index].Items



def format_one_line_of_message(row_string, table_number, schema):
    """Split one fixed-width message line into its column strings.

    Parameters:
        row_string: single line of the message to process.
        table_number: number of the table the line belongs to; stored as the
            first element of the result.
        schema: list of ``[name, start, end]`` entries, one per output column;
            each column is taken as ``row_string[start:end]``.

    Returns:
        ``[table_number, col_1, col_2, ...]`` with one string per schema entry.

    Raises:
        ValueError: if ``row_string`` is shorter than the end position of the
            last schema column, i.e. it cannot hold every column.
    """
    # The end position of the last schema entry is the minimum usable line length
    chars_in_schema = schema[-1][2] if schema else 0

    if len(row_string) < chars_in_schema:
        raise ValueError(
            'message line has %d characters but the schema needs at least %d: %r'
            % (len(row_string), chars_in_schema, row_string)
        )

    return [table_number] + [row_string[entry[1]:entry[2]] for entry in schema]



def delete_data_from_cells(ws, start_row, start_col, num_cols):
    """Blank out a table on worksheet ``ws`` and return its start row.

    Working downwards from ``start_row``, each row whose ``start_col`` cell
    holds a value has the cells from ``start_col`` to ``start_col + num_cols``
    (inclusive) set to ``''``. The empty string is used because assigning
    ``None`` does not clear a cell. Clearing stops at the first row whose
    ``start_col`` cell is ``None`` or ``''``.
    """
    current_row = start_row

    while ws.cell(row=current_row, column=start_col).value not in (None, ''):
        for offset in range(num_cols + 1):
            ws.cell(row=current_row, column=start_col + offset, value='')
        current_row += 1

    # the table is refilled from its first row, so hand that row back
    return start_row
    

   

### Function to Loop Through a Message

In [38]:
def _has_char_at(text, position, char):
    # True when text is long enough to have `char` at `position`
    try:
        return text[position] == char
    except IndexError:
        return False


def loop_through_message(message_split, num_tables_to_process, table_schemas, start_table_pos, start_table_char, end_table_pos, end_table_char):
    """Extract the rows of the first tables found in a message.

    The message is read line by line. A line with ``start_table_char`` at
    ``start_table_pos`` opens the next table; the following lines are parsed
    with that table's schema until a line with ``end_table_char`` at
    ``end_table_pos`` closes it.

    Parameters:
        message_split: the message body as a list of lines.
        num_tables_to_process: scanning stops once this many tables have been
            opened and the current one has been closed.
        table_schemas: one schema per table, in the order the tables appear;
            each schema is a list of ``[name, start, end]`` entries.
        start_table_pos, start_table_char: marker for the line above a table.
        end_table_pos, end_table_char: marker for the line below a table.

    Returns:
        List of parsed rows as produced by ``format_one_line_of_message``
        (table number first, then one string per schema column).

    Lines too short to contain a marker are never treated as markers, and
    table lines too short to hold every schema column (e.g. blank lines) are
    skipped. Reaching the end of the message also ends the scan.
    """
    in_table = False
    end_of_table = False
    table_count = 0
    schema = None
    chars_in_schema = 0
    processed_message = []

    for row_string in message_split:
        # stop once the required number of tables has been opened and closed
        if end_of_table and table_count >= num_tables_to_process:
            break

        if in_table:
            if _has_char_at(row_string, end_table_pos, end_table_char):
                # marker line below the data: this table is finished
                end_of_table = True
                in_table = False  # need to find the next table
            elif len(row_string) >= chars_in_schema:
                # still within the table so format this row of the message
                processed_message.append(
                    format_one_line_of_message(row_string, table_count, schema)
                )
            # otherwise the line cannot hold all schema columns: skip it

        elif _has_char_at(row_string, start_table_pos, start_table_char):
            # marker line above the data: the next table starts here
            in_table = True
            end_of_table = False
            table_count += 1  # keep track of how many tables have been found
            schema = table_schemas[table_count - 1]
            chars_in_schema = schema[-1][2] if schema else 0

    return processed_message


### Copy data to spreadsheet

In [39]:
# Copy to spreadsheet

def copy_tables_to_spreadsheet(ws, spreadsheet_output_row, processed_message, table_output_col_by_table, message_sent_date, table_date_col):
    """Write the parsed message rows into the worksheet.

    Parameters:
        ws: worksheet object exposing ``cell(row=, column=, value=)``.
        spreadsheet_output_row: next free spreadsheet row for each table,
            indexed by table number - 1. Updated in place.
        processed_message: parsed rows; element 0 of each row is the table
            number, the remaining elements are the column strings.
        table_output_col_by_table: first spreadsheet column of each table.
        message_sent_date: sent date of the email as a string.
        table_date_col: for each table, the position within a parsed row of
            the field that holds a date.

    Returns:
        ``spreadsheet_output_row`` with one row added per row written.

    Layout of one written row, relative to the table's first column:
        +0      message sent date (string)
        +1      ``=datevalue(...)`` formula for the sent date
        +2...   the parsed fields; the date field is copied as text, every
                other field is written as a float, or 0 when not numeric
        last    ``=datevalue(...)`` formula for the date field; omitted when
                the row has no date field
    """
    for data_row in processed_message:
        cols = len(data_row)
        table_index = data_row[0] - 1
        date_col = table_date_col[table_index]
        spreadsheet_col = table_output_col_by_table[table_index]
        spreadsheet_row = spreadsheet_output_row[table_index]

        ws.cell(row=spreadsheet_row, column=spreadsheet_col, value=message_sent_date)
        message_sent_date_excel = '=datevalue("' + message_sent_date + '")'
        ws.cell(row=spreadsheet_row, column=spreadsheet_col + 1, value=message_sent_date_excel)

        # Reset for every row so a row without a date field neither fails nor
        # reuses the formula of the previous row
        date_string_excel = None

        for col in range(1, cols):
            value_for_spreadsheet = data_row[col]
            target_col = col + spreadsheet_col + 1
            if col == date_col:
                # col is a date so just copy it
                ws.cell(row=spreadsheet_row, column=target_col, value=value_for_spreadsheet)
                # but a datevalue form is also written after the last field
                date_string_excel = '=datevalue("' + value_for_spreadsheet + '")'
            else:
                try:
                    numeric_value = float(value_for_spreadsheet)
                except (ValueError, TypeError):
                    # non-numeric text is stored as 0
                    numeric_value = 0
                ws.cell(row=spreadsheet_row, column=target_col, value=numeric_value)

        if date_string_excel is not None:
            # the column after the last field holds the excel datevalue of the date field
            ws.cell(row=spreadsheet_row, column=spreadsheet_col + cols + 1, value=date_string_excel)

        spreadsheet_output_row[table_index] += 1

    return spreadsheet_output_row
    

## Main Code

### Open the spreadsheet with the consolidated history
Work out:
    1. the date of the most recent email already processed
    2. the first empty row in the spreadsheet, where further data can be added
   

In [40]:
# Open the spreadsheet which will be populated with the email data

# Change working directory to the spreadsheet folder: the workbook is opened
# here, and saved again later, using its bare file name
os.chdir(spreadsheet_directory)

wb = openpyxl.load_workbook(spreadsheet_to_open)
ws = wb[sheet_to_open]


# Find the end of the snapshot/history data in the spreadsheet and the most recent email date.
# Each call returns (first empty row, most recent date as seconds since the epoch).
# The start row and snapshot column come from the global parameters.

table_snapshot = most_recent_date_in_spreadsheet(ws, spreadsheet_first_data_row, table_output_col_by_table[0])
table_snapshot_recent_row = table_snapshot[0]
table_snapshot_recent_date = table_snapshot[1]

# NOTE(review): the history table is configured to start at
# table_output_col_by_table[1], yet column 9 is scanned here - confirm intent.
table_history = most_recent_date_in_spreadsheet(ws, spreadsheet_first_data_row, 9)
table_history_recent_row = table_history[0]
table_history_recent_date = table_history[1]

# Next row to write for each table: [snapshot, history]
spreadsheet_output_row = [table_snapshot_recent_row, table_history_recent_row]
       

### Scan through the emails and copy to spreadsheet
Go through each email in the Oracle email folder and:
    1. if more recent than what is currently in the spreadsheet then process it
    2. parse the data for the snapshot table and panel 5 history table
    3. write the snapshot data to spreadsheet
    4. record the most recent history for P5, deleting the existing history
    5. save the updated spreadsheet

In [41]:
##################################### Get Outlook Oracle email data and copy to spreadsheet ################################

###### Open outlook and get emails from the folder set in the global parameters

messages = open_email(inbox_index, subfldr_index)

# NOTE(review): every processed email replaces the history table, so the history
# left in the spreadsheet is that of the last email iterated - confirm the
# messages are returned oldest first.
for message in messages:
    # Sent date as a dd/mm/yy string, plus the same date in seconds since the
    # epoch so it can be compared with the dates read from the spreadsheet
    message_sent_date = message.SentOn.strftime("%d/%m/%y")
    message_sent_date_python = calendar.timegm(time.strptime(message_sent_date, "%d/%m/%y"))

    # Only emails sent after the most recent snapshot date in the spreadsheet are processed
    if message_sent_date_python > table_snapshot_recent_date:
        message_split = message.body.splitlines()
        processed_message = loop_through_message(message_split, num_tables_to_process, table_schemas, start_table_pos, start_table_char, end_table_pos, end_table_char)

        # Clear the existing history table; new history rows restart at its first data row
        spreadsheet_output_row[1] = delete_data_from_cells(ws, spreadsheet_first_data_row, table_output_col_by_table[1], table_2_num_cols)

        spreadsheet_output_row = copy_tables_to_spreadsheet(ws, spreadsheet_output_row, processed_message, table_output_col_by_table, message_sent_date, table_date_col)


# Save the combined spreadsheet
wb.save(spreadsheet_to_open)