In [2]:
import pandas as pd
import glob # used to read file directories, there are alternatives: pathlib.Path.rglob or os.walk

Often you need to consolidate several spreadsheets/tabs, and you need to make sure that they all have exactly the same structure. The routine below helps with that: it lets you check the fields and columns and gives an initial tally of how many records it finds in each sheet before you attempt to append them.

Note that if the structure is complex (for example, the table starts at an arbitrary row or column, or there are artefacts such as totals or values outside the main range), it will likely fail or bring in unwanted fields/values.
There are ways of reading specific portions of a sheet, but your best bet is to get the format right before ingesting the data into a Python routine.

In [3]:
def check_xls_tables_from_dir(path):
    """Summarise the structure of every sheet in the workbooks matching ``path``.

    Use this before appending sheets together to confirm that they share the
    same columns and to see how many records each one contributes.

    Parameters
    ----------
    path : str
        Glob pattern selecting the workbooks (wildcards allowed; ``**``
        matches nested folders because the search is recursive).

    Returns
    -------
    pandas.DataFrame
        One row per sheet with ``ori_file``, ``ori_sheet``, ``ori_records``
        (row count), ``ori_columns`` (column count) and, for every lower-cased
        column name found, the number of non-null values in that column.
        Empty when no file matches the pattern.
    """
    summary_rows = []

    # glob returns matches in arbitrary order; sort for a reproducible summary.
    for f in sorted(glob.glob(path, recursive=True)):
        print(f)
        # The context manager closes the workbook handle once its sheets are read.
        with pd.ExcelFile(f) as xl:
            for s in xl.sheet_names:
                data = xl.parse(s)
                # str() guards against non-text headers (numbers, dates),
                # which have no .lower() method.
                data.columns = [str(x).lower() for x in data.columns.to_list()]
                r, c = data.shape
                ditem = {
                    'ori_file': f,
                    'ori_sheet': s,
                    'ori_records': r,
                    'ori_columns': c,
                }
                # DataFrame.count() yields one scalar per column label, so the
                # tally stays scalar even if lower-casing produced duplicate names.
                for col, non_null in data.count().items():
                    ditem[col] = non_null
                summary_rows.append(ditem)
    return pd.DataFrame(summary_rows)
   
# Tally every sheet of every workbook in the batch folder before appending them.
dfstructure= check_xls_tables_from_dir("./test_data/batch/*.xlsx")
dfstructure.head() 
    
    
    

./test_data/batch\Q1.xlsx
./test_data/batch\Q2.xlsx


Unnamed: 0,ori_file,ori_sheet,ori_records,ori_columns,customer_id,date,transaction_id,amount,result
0,./test_data/batch\Q1.xlsx,Jan,1951,5,1951,1951,1951,1950,1951
1,./test_data/batch\Q1.xlsx,Feb,1881,5,1881,1881,1881,1881,1881
2,./test_data/batch\Q1.xlsx,Mar,1933,5,1933,1933,1933,1933,1933
3,./test_data/batch\Q2.xlsx,April,1967,5,1967,1967,1967,1967,1967
4,./test_data/batch\Q2.xlsx,May,2278,5,2278,2278,2278,2277,2278


This is the key function: it imports all the spreadsheets matching the given path (wildcards allowed), goes through every sheet in each of them, loads it and appends it to a single dataframe.

There are various possible tweaks, such as importing just a specific sheet or, potentially, a specific range.
You also need to look for inconsistencies in column naming: Python/pandas are case sensitive, and the routine takes this into account by converting all column names to lower case.
However, things like spaces, accented characters or duplicate column names might create unwanted artefacts. 
You need to have done some checks in advance to ensure the import is successful.

In [4]:
def load_xls_from_dir(path):
    """Load every sheet of every workbook matching ``path`` into one DataFrame.

    Each sheet's column names are lower-cased (they are case sensitive) and two
    identifier columns are added: ``ori_file`` (the workbook path) and
    ``ori_sheet`` (the sheet name). To read only the first sheet of each
    workbook instead, ``pd.read_excel(f)`` is the simpler alternative.

    Parameters
    ----------
    path : str
        Glob pattern selecting the workbooks (wildcards allowed; ``**``
        matches nested folders because the search is recursive).

    Returns
    -------
    pandas.DataFrame
        All sheets stacked on top of each other. Each sheet keeps its own row
        index, so index labels repeat across sheets. Empty when no file
        matches the pattern.
    """
    frames = []

    # glob returns matches in arbitrary order; sort for a reproducible row order.
    for f in sorted(glob.glob(path, recursive=True)):
        # The context manager closes the workbook handle once its sheets are read.
        with pd.ExcelFile(f) as xl:
            for s in xl.sheet_names:
                data = xl.parse(s)
                # str() guards against non-text headers (numbers, dates),
                # which have no .lower() method.
                data.columns = [str(x).lower() for x in data.columns.to_list()]
                data["ori_file"] = f
                data["ori_sheet"] = s
                frames.append(data)

    # pd.concat raises on an empty list, so return an empty frame explicitly.
    if not frames:
        return pd.DataFrame()

    # One concat at the end replaces the per-sheet DataFrame.append, which was
    # removed in pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames, sort=False)
   

In [5]:
# Append all sheets of all workbooks in the batch folder into a single frame.
df= load_xls_from_dir("./test_data/batch/*.xlsx")
df.head()

Unnamed: 0,customer_id,date,transaction_id,amount,result,ori_file,ori_sheet
0,150496,2020-01-01,1,682.0,Authorised,./test_data/batch\Q1.xlsx,Jan
1,464857,2020-01-01,2,5386.0,Authorised,./test_data/batch\Q1.xlsx,Jan
2,68468,2020-01-01,3,26014.0,Authorised,./test_data/batch\Q1.xlsx,Jan
3,243698,2020-01-01,4,6400.0,Authorised,./test_data/batch\Q1.xlsx,Jan
4,45755,2020-01-01,5,29706.0,Authorised,./test_data/batch\Q1.xlsx,Jan


In [6]:
# Normalise the column names: lower-case, with spaces replaced by underscores.
df.columns = [name.lower().replace(' ', '_') for name in df.columns.to_list()]
df.head()

Unnamed: 0,customer_id,date,transaction_id,amount,result,ori_file,ori_sheet
0,150496,2020-01-01,1,682.0,Authorised,./test_data/batch\Q1.xlsx,Jan
1,464857,2020-01-01,2,5386.0,Authorised,./test_data/batch\Q1.xlsx,Jan
2,68468,2020-01-01,3,26014.0,Authorised,./test_data/batch\Q1.xlsx,Jan
3,243698,2020-01-01,4,6400.0,Authorised,./test_data/batch\Q1.xlsx,Jan
4,45755,2020-01-01,5,29706.0,Authorised,./test_data/batch\Q1.xlsx,Jan


In [7]:
# Order transactions by amount, smallest first, to surface outliers at either end.
df.sort_values(by='amount')

Unnamed: 0,customer_id,date,transaction_id,amount,result,ori_file,ori_sheet
31,94819,2020-02-01,1983,-5000.0,Authorised,./test_data/batch\Q1.xlsx,Feb
910,49009,2020-02-29,4743,1.0,Authorised,./test_data/batch\Q1.xlsx,Mar
1437,441895,2020-02-15,3389,1.0,Authorised,./test_data/batch\Q1.xlsx,Feb
1496,76082,2020-05-31,11505,11.0,Authorised,./test_data/batch\Q2.xlsx,June
1579,496898,2020-05-31,11588,12.0,Authorised,./test_data/batch\Q2.xlsx,June
...,...,...,...,...,...,...,...
26,55659,2020-04-02,5792,39990.0,Authorised,./test_data/batch\Q2.xlsx,April
456,230909,2020-02-05,2408,39992.0,Authorised,./test_data/batch\Q1.xlsx,Feb
1851,304071,2020-02-19,3803,39999.0,Authorised,./test_data/batch\Q1.xlsx,Feb
1950,183416,2020-01-31,1951,,Rejected,./test_data/batch\Q1.xlsx,Jan


In [8]:
# Most recent transactions first; the sorted frame stays bound to the same name.
df = df.sort_values(by='date', ascending=False)
df.head(10)

Unnamed: 0,customer_id,date,transaction_id,amount,result,ori_file,ori_sheet
2148,396760,2020-05-31,12157,8748.0,Authorised,./test_data/batch\Q2.xlsx,June
946,493002,2020-05-31,10955,610.0,Authorised,./test_data/batch\Q2.xlsx,June
858,30394,2020-05-31,10867,6823.0,Authorised,./test_data/batch\Q2.xlsx,June
859,422636,2020-05-31,10868,23120.0,Authorised,./test_data/batch\Q2.xlsx,June
860,429733,2020-05-31,10869,17023.0,Authorised,./test_data/batch\Q2.xlsx,June
861,195961,2020-05-31,10870,12496.0,Authorised,./test_data/batch\Q2.xlsx,June
862,348365,2020-05-31,10871,34061.0,Authorised,./test_data/batch\Q2.xlsx,June
863,95928,2020-05-31,10872,22681.0,Authorised,./test_data/batch\Q2.xlsx,June
864,164313,2020-05-31,10873,4609.0,Authorised,./test_data/batch\Q2.xlsx,June
865,185793,2020-05-31,10874,11432.0,Authorised,./test_data/batch\Q2.xlsx,June


Let's look for some duplicates.

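The `duplicated` method accepts the parameter `keep`, which controls which occurrences are flagged as duplicates:  
- first : Flag every occurrence except the first (default).  
- last : Flag every occurrence except the last.  
- False : Flag all occurrences, so every duplicated record is returned.  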


In [12]:
df[df.duplicated(keep=False)] # rows identical in every column; keep=False shows all copies

Unnamed: 0,customer_id,date,transaction_id,amount,result,ori_file,ori_sheet
20,209024,2020-05-01,7752,15583.0,Authorised,./test_data/batch\Q2.xlsx,May
21,209024,2020-05-01,7752,15583.0,Authorised,./test_data/batch\Q2.xlsx,May


In [13]:
df[df.transaction_id.duplicated(keep=False)] # rows that share a transaction_id with another row (all copies shown)

Unnamed: 0,customer_id,date,transaction_id,amount,result,ori_file,ori_sheet
19,339698,2020-05-01,7752,10957.0,Authorised,./test_data/batch\Q2.xlsx,May
20,209024,2020-05-01,7752,15583.0,Authorised,./test_data/batch\Q2.xlsx,May
21,209024,2020-05-01,7752,15583.0,Authorised,./test_data/batch\Q2.xlsx,May


In [14]:
# Rows that share both amount and customer_id with another row (all copies kept).
df2 = df[df.duplicated(subset=["amount", "customer_id"], keep=False)]
df2

Unnamed: 0,customer_id,date,transaction_id,amount,result,ori_file,ori_sheet
20,209024,2020-05-01,7752,15583.0,Authorised,./test_data/batch\Q2.xlsx,May
21,209024,2020-05-01,7752,15583.0,Authorised,./test_data/batch\Q2.xlsx,May


In [15]:
# Save the suspected duplicates to the 'Duplicates' sheet of an exceptions workbook for review.
# NOTE(review): the ./output folder presumably has to exist already - confirm before running.
df2.to_excel("./output/101_Output_Exceptions.xlsx", sheet_name='Duplicates')