Load a simple MS Excel spreadsheet and list it

In [1]:
import pandas as pd 

In [2]:
df= pd.read_excel("./test_data/101_Demo_Data.xlsx", sheet_name="Data")

In [3]:
df.shape #shows the number of columns and rows

(1708, 8)

In [4]:
df.columns.to_list()

['Auth_ID',
 'Date Authorisation',
 'Cust ID',
 'Amount',
 'Date Procedure',
 'Region',
 'Vulnerable',
 'Notes']

We turn headers to lower case as python is case sensitive and it is better for consistency

In [5]:
df.columns=[x.lower() for x in df.columns.to_list()]
df[0:10]

Unnamed: 0,auth_id,date authorisation,cust id,amount,date procedure,region,vulnerable,notes
0,9000001,2020-05-19 10:24:41.740000,3710,9.11,2020-06-12 10:24:41.740000,London,False,"Test of delimiter, this should be kept all tog..."
1,9000002,2020-05-19 11:49:24.092941,3378,11.65,2020-07-21 11:49:24.092941,West,False,
2,9000003,2020-05-19 13:09:24.092941,1963,13.52,2020-06-21 13:09:24.092941,West,False,
3,9000004,2020-05-19 13:40:02.390814,3565,77.57,2020-05-23 13:40:02.390814,East,False,
4,9000005,2020-05-19 15:00:02.390814,2249,21.37,2020-07-26 15:00:02.390814,East,False,Customer may be vulnerable
5,9000006,2020-05-19 15:45:02.390814,1669,12.6,2020-07-26 15:45:02.390814,South,False,
6,9000007,2020-05-19 16:47:38.912553,1589,26.89,2020-06-11 16:47:38.912553,East,False,
7,9000008,2020-05-19 18:38:25.066399,1596,43.98,2020-06-16 18:38:25.066399,West,False,
8,9000009,2020-05-19 20:29:11.220245,3507,6.09,2020-08-06 20:29:11.220245,North,False,
9,9000010,2020-05-19 21:59:11.220245,2543,11.36,2020-06-27 21:59:11.220245,South,False,


we remove spurious characters from the title

In [17]:
df.columns=df.columns.str.strip()
df.columns=df.columns.str.replace(' ','_',regex=False)
df.columns=df.columns.str.replace('.','_', regex=False)
df.columns=df.columns.str.replace('__','_')
df.columns=df.columns.str.replace('__','_')
df.iloc[0] #this is how we list one particular record based on the index

  df.columns=df.columns.str.replace('.','_')


auth_id                               9001705
date_auth          2020-08-14 13:03:18.536012
customer_id                              3980
amount                                  10.56
date_procedure     2020-08-30 13:03:18.536012
region                                  South
flag_vulnerable                         False
notes                                     NaN
Name: 1707, dtype: object

This is the simpler way, we just set a new row of titles

In [7]:
new_cols = ['auth_id', 
            'date', 
            'customer_id',
           'amount',
           'date_procedure',
           'region',
           'flag_vulnerable',
           'notes']
df.columns = new_cols

df.head() #this is a convenient way of showing the first 5 records

Unnamed: 0,auth_id,date,customer_id,amount,date_procedure,region,flag_vulnerable,notes
0,9000001,2020-05-19 10:24:41.740000,3710,9.11,2020-06-12 10:24:41.740000,London,False,"Test of delimiter, this should be kept all tog..."
1,9000002,2020-05-19 11:49:24.092941,3378,11.65,2020-07-21 11:49:24.092941,West,False,
2,9000003,2020-05-19 13:09:24.092941,1963,13.52,2020-06-21 13:09:24.092941,West,False,
3,9000004,2020-05-19 13:40:02.390814,3565,77.57,2020-05-23 13:40:02.390814,East,False,
4,9000005,2020-05-19 15:00:02.390814,2249,21.37,2020-07-26 15:00:02.390814,East,False,Customer may be vulnerable


Yet another way of renaming dataframe columns, the inplace=True is used in Pandas to avoid having to assign the result to a dataframe, e.g. df= df.rename(.....)

In [8]:
df.rename(columns = {'date':'date_auth'},inplace = True)

df.tail() #shows the last 5 records

Unnamed: 0,auth_id,date_auth,customer_id,amount,date_procedure,region,flag_vulnerable,notes
1703,9001701,2020-08-14 08:20:44.651636,2212,37.38,NaT,East,False,
1704,9001702,2020-08-14 09:56:44.651636,1187,-7.79,NaT,London,False,
1705,9001703,2020-08-14 10:59:21.173375,2076,-13.82,NaT,East,False,
1706,9001704,2020-08-14 12:07:55.459089,2380,39.62,NaT,West,False,
1707,9001705,2020-08-14 13:03:18.536012,3980,10.56,2020-08-30 13:03:18.536012,South,False,


In [9]:
df[df["auth_id"]==9001703]

Unnamed: 0,auth_id,date_auth,customer_id,amount,date_procedure,region,flag_vulnerable,notes
1705,9001703,2020-08-14 10:59:21.173375,2076,-13.82,NaT,East,False,


In [10]:
df[['date_auth','amount']].groupby([df['date_auth'].dt.strftime('%Y%m')]).agg({'date_auth':['min','max']},{'amount':['sum']})

Unnamed: 0_level_0,date_auth,date_auth
Unnamed: 0_level_1,min,max
date_auth,Unnamed: 1_level_2,Unnamed: 2_level_2
202005,2020-05-19 10:24:41.740000,2020-05-31 23:54:04.030669
202006,2020-06-01 00:59:31.303396,2020-06-30 23:10:51.948949
202007,2020-07-01 00:02:17.663235,2020-07-31 23:12:04.138490
202008,2020-08-01 00:03:29.852776,2020-08-14 13:03:18.536012


In [11]:

df.sort_values('date_auth', axis=0, ascending=False, inplace=True)
df[0:10]

Unnamed: 0,auth_id,date_auth,customer_id,amount,date_procedure,region,flag_vulnerable,notes
1707,9001705,2020-08-14 13:03:18.536012,3980,10.56,2020-08-30 13:03:18.536012,South,False,
1706,9001704,2020-08-14 12:07:55.459089,2380,39.62,NaT,West,False,
1705,9001703,2020-08-14 10:59:21.173375,2076,-13.82,NaT,East,False,
1704,9001702,2020-08-14 09:56:44.651636,1187,-7.79,NaT,London,False,
1703,9001701,2020-08-14 08:20:44.651636,2212,37.38,NaT,East,False,
1702,9001700,2020-08-14 07:20:44.651636,3758,45.89,NaT,East,False,
1701,9001699,2020-08-14 06:27:24.651636,1072,20.04,NaT,South,False,
1700,9001698,2020-08-14 05:45:03.475166,2284,5.37,2020-10-20 05:45:03.475166,North,False,
1699,9001697,2020-08-14 05:13:45.214296,3441,12.01,NaT,West,False,
1698,9001696,2020-08-14 04:28:45.214296,2508,-28.21,2020-08-19 04:28:45.214296,West,False,


In [12]:
df[(df.amount<0) & (df.date_auth.dt.month==3)]

Unnamed: 0,auth_id,date_auth,customer_id,amount,date_procedure,region,flag_vulnerable,notes


This is how you test for duplicates

In [13]:
df1=df[df.auth_id.duplicated(keep=False)]
df1

Unnamed: 0,auth_id,date_auth,customer_id,amount,date_procedure,region,flag_vulnerable,notes
571,9000569,2020-06-17 12:55:45.509850,1121,-30.62,2020-09-11 12:55:45.509850,East,False,
570,9000569,2020-06-17 12:55:45.509850,3869,30.62,2020-07-13 12:55:45.509850,East,False,
403,9000403,2020-06-09 01:35:28.874651,1420,10.7,2020-07-13 01:35:28.874651,South,False,
404,9000403,2020-06-09 01:35:28.874651,2436,10.7,2020-08-03 01:35:28.874651,South,False,
333,9000333,2020-06-05 11:26:37.658059,3050,24.31,2020-08-14 11:26:37.658059,East,False,
332,9000333,2020-06-05 11:26:37.658059,3050,24.31,2020-08-14 11:26:37.658059,East,False,


This is not the best way to test for duplicates, but can do the trick

In [14]:
df2= df.groupby(["auth_id"], as_index = False ).count()
# If you dont include the as_index, the DF will have nested indexes, so to get the data use:
# list= df2.index.get_level_values(1).tolist() 

df2[df2.date_auth>1]


Unnamed: 0,auth_id,date_auth,customer_id,amount,date_procedure,region,flag_vulnerable,notes
332,9000333,2,2,2,2,2,2,0
402,9000403,2,2,2,2,2,2,0
568,9000569,2,2,2,2,2,2,0


In [21]:
df2.sort_values('date_auth', axis=0, ascending=False, inplace=True)
df2[0:10]

Unnamed: 0,auth_id,date_auth,customer_id,amount,date_procedure,region,flag_vulnerable,notes
402,9000403,2,2,2,2,2,2,0
568,9000569,2,2,2,2,2,2,0
332,9000333,2,2,2,2,2,2,0
0,9000001,1,1,1,1,1,1,1
1136,9001137,1,1,1,1,1,1,0
1145,9001146,1,1,1,1,1,1,0
1144,9001145,1,1,1,1,1,1,0
1143,9001144,1,1,1,1,1,1,0
1142,9001143,1,1,1,1,1,1,0
1141,9001142,1,1,1,1,1,1,0


In [16]:
df1.to_excel("./output/Output_duplicates.xlsx", sheet_name='Duplicates')