In [1]:
#! Python 3
# This is written in python version 3, Anaconda distribution

# This is important: to process XLS we use XLWINGS library, but if you have issues with it
# you might as well use XLRD library which looks like more low level, but probably more stable

# These imports various libraries that are needed to complete the routine, they all should be already in your
# distribution, otherwise, you need to install them using the "pip" tool


# This library is to process an excel file
import xlwings as xls

# This library is to do file/os operations
import os

# these two are for data analytics
import numpy as np
import pandas as pd

#import imp
from time import gmtime, strftime




Here we demonstrate how to load tables from several excel sheets (from a single workbook) into a single dataframe
Bear in mind that I found xlwings had issues with reading very large xls files, give it a try but if it fails try another reader.


In [2]:
# we define a list of strings that will be the path to the sources
# note the forward slash for the path

inputlist=[]
inputlist.append("testdata/example_expenses01.xlsx")
inputlist.append("testdata/example_expenses02.xlsx")


In [4]:

# This is the workbook object
# it is a list of data frames, we will load there the subframes and then we will "concatenate" them together
# data frames is kind of a "spreadsheet" or "table" for Pandas, the library we will use for processing data
input_frames=[]

#another extra nicety we will ad some custom fields in the process, timestamp and origin sheet
ts= strftime("%Y-%m-%d %H:%M:%S")

for xlsfile in inputlist:
    #try is to do error catching, if it fails we handle that gracefully
    try:
        wb = xls.Book(xlsfile)
        #this is the pandas data frame object, we will load the workbook into the dataframe for superfast processing
        df =pd.DataFrame([])
        #as an extra nicety we will load all sheets that have the title "Name" in the cell A1 into a single dataframe
        #assuming they have the same structure, i.e. we assume a huge table split into several sheets, as sometimes we
        #get from large exports
        n= len(xls.sheets)
        print ("Total Sheets in worbook:"+ wb.name +" "+ str(n))    
        for x in range(0, n):
            sh = wb.sheets[x]
            cellA1= sh.range("A1").value        
            if cellA1=="Name" or sh.name =="Data":
                print ("Processing sheet " + str(x) + ":" + sh.name   ) 
                #This is equivalent to "select region" from the first cell, not sure if it works
                data = sh.range("A1").expand().value
                #data= xls.Range(sh,"A1:E100000").value
                #We add here the data from the second row,and we add as field names the first row
                # in Python index starts a 0
                frame=pd.DataFrame(data[1:],columns=data[0])
                #We add here a column for the source sheet and a timestamp of the processing
                frame["SourceFile"]= xlsfile
                frame["SourceSheet"]=sh.name
                frame["Timestamp"]= ts
                input_frames.append (frame)
        wb.close()
    except:
        print("Unable to open file:" + xlsfile )  
if len(input_frames)>0: 
    df = pd.concat(input_frames, ignore_index=True)
    print ("Top 10 records")
    print (df[0:10])
else:
    print("Error! No records loaded")

Unable to open file:testdata/example_expenses01.xlsx
Unable to open file:testdata/example_expenses02.xlsx
Error! No records loaded


# Overall totals and reconciliation checks (Group By)

In [13]:
print ("Summary of records with non null" )
print (df.count())
print()
print ("Min value:"+ str(df["Expenses"].min()))
print ("Max value:" + str(df["Expenses"].max()))
print ("Totals per person:")
print (df.groupby('Name')['Expenses'].sum())

Summary of records with non null
Name           451
Title          451
Expenses       450
Date           451
Date Claim     451
CostCentre     451
ExpenseType    451
SourceFile     451
SourceSheet    451
Timestamp      451
dtype: int64

Min value:-1.0
Max value:4436.0
Totals per person:
Name
Anshu        1147.9
Arne         9270.5
Arpad        4007.8
Frank        3687.9
Hiten        5666.7
John        15993.0
Juan         2433.3
Laura        1886.7
Mary         1423.6
Matthew      6835.3
Melany       2253.4
Michaela     3529.4
Najwan       2401.3
Peter       23807.0
Roshan       1709.4
Vishal       3894.6
Name: Expenses, dtype: float64


# "Group by" queries

Here we demonstrate how aggregation works.
This is a fairly complex case where we define a structure of dictionaries with the original fields and then the computation in pairs of "name:function" eg. "min_expenses:min"
Simpler aggregation is also demonstrated


In [19]:
df.groupby(['Name']).agg({'Expenses': 'min', 'Expenses':'max'})



Unnamed: 0_level_0,Expenses
Name,Unnamed: 1_level_1
Anshu,1000.0
Arne,2000.0
Arpad,1500.0
Frank,1100.4
Hiten,2000.0
John,1000.0
Juan,282.8
Laura,270.2
Mary,288.4
Matthew,2000.0


#  "Where" queries (looking for specific exception records)

In [26]:
# First test zero or negative expenses, note the way we do "AND"

dfex01= df[(df.Title == "Manager") & (df.SourceSheet == "Events") & (df.Expenses<=0)]
dfex01

Unnamed: 0,Name,Title,Expenses,Date,Date Claim,CostCentre,ExpenseType,SourceFile,SourceSheet,Timestamp
146,Melany,Manager,-1.0,2014-07-01,2014-08-10,1002.0,9.0,testdata/example_expenses02.xlsx,Events,2019-01-04 07:27:55


In [30]:
# Second test is to spot transactions beyond initial cutoff date
dfex02= df.query('Expenses>0 and Date < "2015-01-01"')
dfex02

Unnamed: 0,Name,Title,Expenses,Date,Date Claim,CostCentre,ExpenseType,SourceFile,SourceSheet,Timestamp
0,John,Senior,1000.0,2014-08-15,2014-09-24,1001.0,5.0,testdata/example_expenses01.xlsx,AuditTrips,2019-01-04 07:27:55
1,Peter,Manager,1500.0,2014-07-01,2014-08-30,1001.0,6.0,testdata/example_expenses01.xlsx,AuditTrips,2019-01-04 07:27:55
6,John,Senior,14.0,2014-08-15,2014-09-24,1001.0,4.0,testdata/example_expenses01.xlsx,AuditTrips,2019-01-04 07:27:55
7,Peter,Manager,4436.0,2014-07-01,2014-08-30,1001.0,7.0,testdata/example_expenses01.xlsx,AuditTrips,2019-01-04 07:27:55
8,Arne,Senior,299.0,2014-07-01,2014-08-10,1001.0,4.0,testdata/example_expenses01.xlsx,AuditTrips,2019-01-04 07:27:55
144,Anshu,Manager,1000.0,2014-08-15,2014-09-24,1002.0,2.0,testdata/example_expenses02.xlsx,Events,2019-01-04 07:27:55
145,Arpad,Manager,1500.0,2014-07-01,2014-08-10,1002.0,11.0,testdata/example_expenses02.xlsx,Events,2019-01-04 07:27:55
148,John,Senior,999.0,2014-08-15,2014-09-24,1002.0,3.0,testdata/example_expenses02.xlsx,Events,2019-01-04 07:27:55


In [28]:
# Test for same date and same amount, 
# to spot people double posting (e.g. in a group trip) 
df[df.duplicated(["Date","Expenses"], keep=False)]

Unnamed: 0,Name,Title,Expenses,Date,Date Claim,CostCentre,ExpenseType,SourceFile,SourceSheet,Timestamp
0,John,Senior,1000.0,2014-08-15,2014-09-24,1001.0,5.0,testdata/example_expenses01.xlsx,AuditTrips,2019-01-04 07:27:55
1,Peter,Manager,1500.0,2014-07-01,2014-08-30,1001.0,6.0,testdata/example_expenses01.xlsx,AuditTrips,2019-01-04 07:27:55
2,Arne,Senior,-1.0,2014-07-01,2014-08-10,1001.0,7.0,testdata/example_expenses01.xlsx,AuditTrips,2019-01-04 07:27:55
144,Anshu,Manager,1000.0,2014-08-15,2014-09-24,1002.0,2.0,testdata/example_expenses02.xlsx,Events,2019-01-04 07:27:55
145,Arpad,Manager,1500.0,2014-07-01,2014-08-10,1002.0,11.0,testdata/example_expenses02.xlsx,Events,2019-01-04 07:27:55
146,Melany,Manager,-1.0,2014-07-01,2014-08-10,1002.0,9.0,testdata/example_expenses02.xlsx,Events,2019-01-04 07:27:55
149,Peter,Manager,2000.0,2015-08-31,2015-11-29,1002.0,7.0,testdata/example_expenses02.xlsx,Events,2019-01-04 07:27:55
150,Arne,Senior,2000.0,2015-08-31,2015-10-10,1002.0,4.0,testdata/example_expenses02.xlsx,Events,2019-01-04 07:27:55
151,Hiten,Senior,2000.0,2015-08-31,2015-10-10,1002.0,12.0,testdata/example_expenses02.xlsx,Events,2019-01-04 07:27:55
152,Michaela,Senior,300.0,2015-04-20,2015-04-30,1002.0,11.0,testdata/example_expenses02.xlsx,Events,2019-01-04 07:27:55


In [15]:
#Test for duplicate expenses , we now go to detail
dfex03=df[df.duplicated(["Name","Title","Date","Expenses"], keep=False)]
dfex03

Unnamed: 0,Name,Title,Expenses,Date,Date Claim,CostCentre,ExpenseType,SourceFile,SourceSheet,Timestamp
149,Peter,Manager,2000.0,2015-08-31,2015-11-29,1002.0,4.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
152,Michaela,Senior,300.0,2015-04-20,2015-04-30,1002.0,2.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
271,Michaela,Senior,300.0,2015-04-20,2015-05-30,1003.0,11.0,testdata\example_expenses02.xlsx,Other,2017-05-08 20:09:14
275,Peter,Manager,2000.0,2015-08-31,2015-10-10,1003.0,13.0,testdata\example_expenses02.xlsx,Other,2017-05-08 20:09:14


In [16]:
#We create a dataframe where we accumulate exceptions

dfex =pd.DataFrame([])
dfex= dfex.append(dfex01)
dfex= dfex.append(dfex02)
dfex= dfex.append(dfex03)
dfex

Unnamed: 0,Name,Title,Expenses,Date,Date Claim,CostCentre,ExpenseType,SourceFile,SourceSheet,Timestamp
144,Anshu,Manager,1000.0,2014-08-15,2014-09-24,1002.0,12.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
145,Arpad,Manager,1500.0,2014-07-01,2014-08-10,1002.0,6.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
146,Melany,Manager,-1.0,2014-07-01,2014-08-10,1002.0,9.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
147,Frank,Manager,1100.4,2015-01-01,2015-02-10,1002.0,3.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
149,Peter,Manager,2000.0,2015-08-31,2015-11-29,1002.0,4.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
154,Peter,Manager,99.0,2015-07-07,2015-10-24,1002.0,9.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
155,Anshu,Manager,167.2,2015-09-30,2016-01-28,1002.0,2.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
158,Peter,Manager,133.0,2015-07-07,2015-09-27,1002.0,3.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
159,Arpad,Manager,235.5,2015-03-15,2015-04-03,1002.0,11.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
160,Melany,Manager,220.4,2015-03-29,2015-05-09,1002.0,1.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14


In [17]:
#We write the new excel , dumping the dataframe
try:
    os.remove('exceptions.xlsx')
except:
    pass
try:
    writer = pd.ExcelWriter('exceptions.xlsx', engine='xlsxwriter', options={'remove_timezone':True})
    dfex.to_excel(writer, sheet_name='Sheet1')
    print ("File created")
except:
    print ("error creating/saving excel file")

File created
