In [6]:

# This is important: to process XLS we use XLWINGS library, but if you have issues with it
# you might as well use XLRD library which looks like more low level, but probably more stable

import xlwings as xls

import os
import numpy as np
import pandas as pd
import imp
from time import gmtime, strftime




In [7]:
inputlist=[]
inputlist.append("testdata\example_expenses01.xlsx")
inputlist.append("testdata\example_expenses02.xlsx")


Here we demonstrate how to load tables from several excel sheets (from a single workbook) into a single dataframe
Bear in mind that I found xlwings had issues with reading very large xls files, give it a try but if it fails try another reader.


In [8]:

#this is the workbook object
# this is a list of frames, we will load there the subframes and then we will "concatenate" them together
input_frames=[]
#another extra nicety we will ad some custom fields in the process, timestamp and origin sheet
ts= strftime("%Y-%m-%d %H:%M:%S")

for xlsfile in inputlist:
    wb = xls.Book(xlsfile)
    #this is the pandas data frame object, we will load the workbook into the dataframe for superfast processing
    df =pd.DataFrame([])

    #as an extra nicety we will load all sheets that have the title "Name" in the cell A1 into a single dataframe
    #assuming they have the same structure, i.e. we assume a huge table split into several sheets, as sometimes we
    #get from large exports
    
    n= len(xls.sheets)

    print ("Total Sheets in worbook:"+ wb.name +" "+ str(n))

    for x in range(0, n):
        sh = wb.sheets[x]
        cellA1= sh.range("A1").value
        
        if cellA1=="Name" or sh.name =="Data":
            print ("Processing sheet " + str(x) + ":" + sh.name   ) 

            #This is equivalent to "select region" from the first cell, not sure if it works
            data = sh.range("A1").expand().value

            #data= xls.Range(sh,"A1:E100000").value
            #We add here the data from the second row,and we add as field names the first row
            # in python index starts a 0
            frame=pd.DataFrame(data[1:],columns=data[0])
            #We add here a column for the source sheet and a timestamp
            frame["SourceFile"]= xlsfile
            frame["SourceSheet"]=sh.name
            frame["Timestamp"]= ts

            input_frames.append (frame)
      
        
    wb.close()
    
df = pd.concat(input_frames, ignore_index=True)
print ("Top 10 records")
print (df[0:10])

Total Sheets in worbook:example_expenses01.xlsx 2
Processing sheet 0:AuditTrips
Total Sheets in worbook:example_expenses02.xlsx 3
Processing sheet 0:Events
Processing sheet 1:Other
Top 10 records
     Name    Title  Expenses       Date Date Claim  CostCentre  ExpenseType  \
0    John   Senior    1000.0 2014-08-15 2014-09-24      1001.0          8.0   
1   Peter  Manager    1500.0 2014-07-01 2014-08-30      1001.0         13.0   
2    Arne   Senior      -1.0 2014-07-01 2014-08-10      1001.0         13.0   
3   Hiten   Senior       NaN 2015-01-01 2015-02-10      1001.0          8.0   
4  Najwan   Senior     830.0 2015-10-01 2015-11-10      1001.0          8.0   
5   Peter  Manager    3000.0 2015-11-30 2016-01-09      1001.0          3.0   
6    John   Senior      14.0 2014-08-15 2014-09-24      1001.0          7.0   
7   Peter  Manager    4436.0 2014-07-01 2014-08-30      1001.0          1.0   
8    Arne   Senior     299.0 2014-07-01 2014-08-10      1001.0          5.0   
9   Hiten   Se

# Overall totals and reconciliation checks (Group By)

In [9]:
print ("Record count (non null):")
print (df.count())
print ("Min value:"+ str(df["Expenses"].min()))
print ("Max value:" + str(df["Expenses"].max()))
print ("Records per person:" )
df['Name'].value_counts()
print ("Totals per person:")
df.groupby('Name')['Expenses'].sum()

Record count (non null):
Name           451
Title          451
Expenses       450
Date           451
Date Claim     451
CostCentre     451
ExpenseType    451
SourceFile     451
SourceSheet    451
Timestamp      451
dtype: int64
Min value:-1.0
Max value:4436.0
Records per person:
Totals per person:


Name
Anshu        2458.7
Arne         7583.9
Arpad        4711.5
Frank        1310.3
Hiten        4322.5
John        12553.9
Juan          260.0
Laura        5092.5
Mary         1460.0
Matthew      5517.9
Melany        469.5
Michaela    19467.3
Najwan       1598.1
Peter       21168.2
Roshan        981.7
Vishal       4427.0
Name: Expenses, dtype: float64

Here we demonstrate how aggregation works.
This is a fairly complex case where we define a structure of dictionaries with the original fields and then the computation in pairs of "name:function" eg. "min_expenses:min"
Simpler aggregation is also demonstrated


In [18]:
df.groupby(['Name']).agg({'Expenses':{'min_expenses':'min', 'max_expenses':'max', 'total_expenses':'sum'},'Name': 'count', 'Date': {'First_date': 'min'}})  

Unnamed: 0_level_0,Expenses,Expenses,Expenses,Name,Date
Unnamed: 0_level_1,min_expenses,max_expenses,total_expenses,count,First_date
Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Anshu,45.1,1000.0,2458.7,10,2014-08-15
Arne,-1.0,2000.0,7583.9,32,2014-07-01
Arpad,11.8,1500.0,4711.5,23,2014-07-01
Frank,209.9,1100.4,1310.3,2,2015-01-01
Hiten,34.0,2000.0,4322.5,16,2015-01-01
John,14.0,1000.0,12553.9,71,2014-08-15
Juan,23.0,237.0,260.0,2,2015-10-01
Laura,11.1,291.7,5092.5,37,2015-01-03
Mary,79.0,277.7,1460.0,10,2015-01-13
Matthew,16.3,2000.0,5517.9,28,2015-01-29


#  Queries for exceptions (Where)

In [12]:
# First test negative expenses, note the way we do "AND"

dfex01= df[(df.Title == "Manager") & (df.SourceSheet == "Events")]
dfex01

Unnamed: 0,Name,Title,Expenses,Date,Date Claim,CostCentre,ExpenseType,SourceFile,SourceSheet,Timestamp
144,Anshu,Manager,1000.0,2014-08-15,2014-09-24,1002.0,12.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
145,Arpad,Manager,1500.0,2014-07-01,2014-08-10,1002.0,6.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
146,Melany,Manager,-1.0,2014-07-01,2014-08-10,1002.0,9.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
147,Frank,Manager,1100.4,2015-01-01,2015-02-10,1002.0,3.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
149,Peter,Manager,2000.0,2015-08-31,2015-11-29,1002.0,4.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
154,Peter,Manager,99.0,2015-07-07,2015-10-24,1002.0,9.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
155,Anshu,Manager,167.2,2015-09-30,2016-01-28,1002.0,2.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
158,Peter,Manager,133.0,2015-07-07,2015-09-27,1002.0,3.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
159,Arpad,Manager,235.5,2015-03-15,2015-04-03,1002.0,11.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
160,Melany,Manager,220.4,2015-03-29,2015-05-09,1002.0,1.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14


In [13]:
# Second test is some date and amount cutoff extraction
dfex02= df.query('Expenses>0 and Date > "2015-01-01"')
dfex02

Unnamed: 0,Name,Title,Expenses,Date,Date Claim,CostCentre,ExpenseType,SourceFile,SourceSheet,Timestamp
4,Najwan,Senior,830.0,2015-10-01,2015-11-10,1001.0,8.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
5,Peter,Manager,3000.0,2015-11-30,2016-01-09,1001.0,3.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
10,Juan,Senior,23.0,2015-10-01,2015-11-10,1001.0,12.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
11,Peter,Manager,69.0,2015-11-30,2016-01-09,1001.0,6.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
12,Mary,Manager,79.0,2015-05-12,2015-06-28,1001.0,9.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
13,Laura,Director,67.2,2015-07-28,2015-11-09,1001.0,11.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
14,John,Senior,44.6,2015-07-11,2015-10-09,1001.0,8.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
15,Laura,Director,152.4,2015-01-04,2015-01-14,1001.0,7.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
16,Peter,Manager,106.5,2015-10-08,2015-12-15,1001.0,10.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
17,Arne,Senior,158.0,2015-04-30,2015-06-27,1001.0,7.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14


In [14]:

# Test for same date and amount, maybe should add category or vendor if available, to spot people double posting (e.g. in a group trip) 
df[df.duplicated(["Date","Expenses"], keep=False)]

Unnamed: 0,Name,Title,Expenses,Date,Date Claim,CostCentre,ExpenseType,SourceFile,SourceSheet,Timestamp
0,John,Senior,1000.0,2014-08-15,2014-09-24,1001.0,8.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
1,Peter,Manager,1500.0,2014-07-01,2014-08-30,1001.0,13.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
2,Arne,Senior,-1.0,2014-07-01,2014-08-10,1001.0,13.0,testdata\example_expenses01.xlsx,AuditTrips,2017-05-08 20:09:14
144,Anshu,Manager,1000.0,2014-08-15,2014-09-24,1002.0,12.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
145,Arpad,Manager,1500.0,2014-07-01,2014-08-10,1002.0,6.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
146,Melany,Manager,-1.0,2014-07-01,2014-08-10,1002.0,9.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
149,Peter,Manager,2000.0,2015-08-31,2015-11-29,1002.0,4.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
150,Arne,Senior,2000.0,2015-08-31,2015-10-10,1002.0,10.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
151,Hiten,Senior,2000.0,2015-08-31,2015-10-10,1002.0,11.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
152,Michaela,Senior,300.0,2015-04-20,2015-04-30,1002.0,2.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14


In [15]:
#Test for duplicate expenses , we now go to detail
dfex03=df[df.duplicated(["Name","Title","Date","Expenses"], keep=False)]
dfex03

Unnamed: 0,Name,Title,Expenses,Date,Date Claim,CostCentre,ExpenseType,SourceFile,SourceSheet,Timestamp
149,Peter,Manager,2000.0,2015-08-31,2015-11-29,1002.0,4.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
152,Michaela,Senior,300.0,2015-04-20,2015-04-30,1002.0,2.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
271,Michaela,Senior,300.0,2015-04-20,2015-05-30,1003.0,11.0,testdata\example_expenses02.xlsx,Other,2017-05-08 20:09:14
275,Peter,Manager,2000.0,2015-08-31,2015-10-10,1003.0,13.0,testdata\example_expenses02.xlsx,Other,2017-05-08 20:09:14


In [16]:
#We create a dataframe where we accumulate exceptions

dfex =pd.DataFrame([])
dfex= dfex.append(dfex01)
dfex= dfex.append(dfex02)
dfex= dfex.append(dfex03)
dfex

Unnamed: 0,Name,Title,Expenses,Date,Date Claim,CostCentre,ExpenseType,SourceFile,SourceSheet,Timestamp
144,Anshu,Manager,1000.0,2014-08-15,2014-09-24,1002.0,12.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
145,Arpad,Manager,1500.0,2014-07-01,2014-08-10,1002.0,6.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
146,Melany,Manager,-1.0,2014-07-01,2014-08-10,1002.0,9.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
147,Frank,Manager,1100.4,2015-01-01,2015-02-10,1002.0,3.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
149,Peter,Manager,2000.0,2015-08-31,2015-11-29,1002.0,4.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
154,Peter,Manager,99.0,2015-07-07,2015-10-24,1002.0,9.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
155,Anshu,Manager,167.2,2015-09-30,2016-01-28,1002.0,2.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
158,Peter,Manager,133.0,2015-07-07,2015-09-27,1002.0,3.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
159,Arpad,Manager,235.5,2015-03-15,2015-04-03,1002.0,11.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14
160,Melany,Manager,220.4,2015-03-29,2015-05-09,1002.0,1.0,testdata\example_expenses02.xlsx,Events,2017-05-08 20:09:14


In [17]:
#We write the new excel , dumping the dataframe
try:
    os.remove('exceptions.xlsx')
except:
    pass
try:
    writer = pd.ExcelWriter('exceptions.xlsx', engine='xlsxwriter', options={'remove_timezone':True})
    dfex.to_excel(writer, sheet_name='Sheet1')
    print ("File created")
except:
    print ("error creating/saving excel file")

File created
