Load a simple MS Excel spreadsheet and list its contents

In [1]:
import pandas as pd 

In [2]:
# Read the "Data" worksheet of the demo workbook into a DataFrame.
df = pd.read_excel(
    "test_data/101_Demo_Data.xlsx",
    sheet_name="Data",
)

In [13]:
df.shape  # (rows, columns): the row count comes first, then the column count

(1708, 5)

In [14]:
# Column labels as a plain Python list.
list(df.columns)

['auth_id', 'date', 'customer_id', 'amount', 'exp_date']

We turn the headers to lower case because Python is case sensitive, and consistent names are easier to work with.

In [15]:
# Lower-case every column header, then display the first ten rows.
df.columns = [header.lower() for header in df.columns]
df.head(10)

Unnamed: 0,auth_id,date,customer_id,amount,exp_date
0,9000001,2019-12-23 19:05:00.340,27995,26.99,2020-01-23 19:05:00.340
1,9000002,2019-12-23 20:00:23.417,9970,69.13,2020-01-23 20:00:23.417
2,9000003,2019-12-23 20:55:46.494,10857,-9.63,2020-01-23 20:55:46.494
3,9000004,2019-12-23 22:11:33.862,22030,70.93,2020-01-23 22:11:33.862
4,9000005,2019-12-23 23:20:08.148,27662,54.89,2020-01-23 23:20:08.148
5,9000006,2019-12-24 00:28:42.434,16721,22.44,2020-01-24 00:28:42.434
6,9000007,2019-12-24 01:48:42.434,3393,22.17,2020-01-24 01:48:42.434
7,9000008,2019-12-24 03:08:42.434,7367,37.64,2020-01-24 03:08:42.434
8,9000009,2019-12-24 03:42:59.577,5257,19.31,2020-01-24 03:42:59.577
9,9000010,2019-12-24 04:42:59.577,21070,-16.44,2020-01-24 04:42:59.577


We remove spurious characters (spaces and dots) from the column titles.

In [16]:
# Replace spaces and dots in the column headers with underscores.
# regex=False makes both patterns literal text; interpreted as a regular
# expression, '.' would match every character instead of just the dot.
df.columns = (
    df.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '_', regex=False)
)

df.iloc[0]  # iloc selects one record by its integer position (0 = first row)

auth_id                           9000001
date           2019-12-23 19:05:00.340000
customer_id                         27995
amount                              26.99
exp_date       2020-01-23 19:05:00.340000
Name: 0, dtype: object

This is the simpler way: we just assign a new list of column titles.

In [9]:
# Overwrite all headers at once; the list supplies one name per column,
# in column order.
new_cols = ['auth_id', 'date', 'customer_id', 'amount', 'exp_date']
df.columns = new_cols

df.head()  # head() with no argument shows the first 5 records

Unnamed: 0,auth_id,date,customer_id,amount,exp_date
0,9000001,2019-12-23 19:05:00.340,27995,26.99,2020-01-23 19:05:00.340
1,9000002,2019-12-23 20:00:23.417,9970,69.13,2020-01-23 20:00:23.417
2,9000003,2019-12-23 20:55:46.494,10857,-9.63,2020-01-23 20:55:46.494
3,9000004,2019-12-23 22:11:33.862,22030,70.93,2020-01-23 22:11:33.862
4,9000005,2019-12-23 23:20:08.148,27662,54.89,2020-01-23 23:20:08.148


Yet another way of renaming DataFrame columns. `inplace=True` is used in pandas to modify the DataFrame directly, which avoids having to assign the result back, e.g. `df = df.rename(...)`.

In [24]:
# Rename the 'date' column to 'auth_date', modifying df itself (inplace=True)
# rather than returning a renamed copy.
df.rename(
    columns={'date': 'auth_date'},
    inplace=True,
)

df.tail()  # tail() with no argument shows the last 5 records

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
1703,9001702,2020-03-20 01:31:38.052,21221,37.17,2020-04-20 01:31:38.052
1704,9001703,2020-03-20 02:56:20.405,13400,36.09,2020-04-20 02:56:20.405
1705,9001704,2020-03-20 04:26:20.405,6771,33.41,2020-04-20 04:26:20.405
1706,9001705,2020-03-20 05:31:47.678,28636,54.64,2020-04-20 05:31:47.678
1707,9001706,2020-03-20 06:31:47.678,7795,4.87,2020-04-20 06:31:47.678


In [11]:
# Select the row(s) whose auth_id equals 9001703 using a boolean mask.
df.loc[df["auth_id"].eq(9001703)]

Unnamed: 0,auth_id,date,customer_id,amount,exp_date
1704,9001703,2020-03-20 02:56:20.405,13400,36.09,2020-04-20 02:56:20.405


In [25]:
# Sort df itself by auth_date, newest first, then display the first ten rows.
df.sort_values(by='auth_date', ascending=False, inplace=True)
df.head(10)

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
1707,9001706,2020-03-20 06:31:47.678,7795,4.87,2020-04-20 06:31:47.678
1706,9001705,2020-03-20 05:31:47.678,28636,54.64,2020-04-20 05:31:47.678
1705,9001704,2020-03-20 04:26:20.405,6771,33.41,2020-04-20 04:26:20.405
1704,9001703,2020-03-20 02:56:20.405,13400,36.09,2020-04-20 02:56:20.405
1703,9001702,2020-03-20 01:31:38.052,21221,37.17,2020-04-20 01:31:38.052
1702,9001701,2020-03-20 00:06:55.699,9648,68.8,2020-04-20 00:06:55.699
1701,9001700,2020-03-19 23:01:28.427,5227,56.18,2020-04-19 23:01:28.427
1700,9001699,2020-03-19 22:27:11.284,7101,21.27,2020-04-19 22:27:11.284
1699,9001698,2020-03-19 21:35:45.570,12309,50.19,2020-04-19 21:35:45.570
1698,9001697,2020-03-19 20:30:18.297,4223,89.76,2020-04-19 20:30:18.297


In [31]:
# Rows with an amount above 100 whose auth_date falls in month 3 (March).
df.loc[df["amount"].gt(100) & df["auth_date"].dt.month.eq(3)]

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
1668,9001667,2020-03-18 07:33:31.841,5934,103.23,2020-04-18 07:33:31.841
1625,9001624,2020-03-16 01:04:49.736,14138,113.77,2020-04-16 01:04:49.736
1590,9001589,2020-03-14 04:41:17.598,19736,103.96,2020-04-14 04:41:17.598
1588,9001587,2020-03-14 02:22:56.421,3523,126.21,2020-04-14 02:22:56.421
1348,9001347,2020-03-01 17:55:14.988,16862,112.67,2020-04-01 17:55:14.988


This is how you test for duplicates

In [37]:
# keep=False flags every row of a repeated auth_id, not only the later
# occurrences, so each duplicate group is shown in full.
df[df.duplicated(subset="auth_id", keep=False)]

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
571,9000570,2020-01-21 23:10:07.159,3136,-3.66,2020-02-21 23:10:07.159
570,9000570,2020-01-21 23:10:07.159,3136,3.66,2020-02-21 23:10:07.159
403,9000404,2020-01-13 11:14:25.365,8725,35.76,2020-02-13 11:14:25.365
404,9000404,2020-01-13 11:14:25.365,8725,35.76,2020-02-13 11:14:25.365


This is not the best way to test for duplicates, but it can do the trick.

In [32]:
# Count the non-null values of every other column per auth_id.
# as_index=False keeps auth_id as a regular column of the result; with the
# default (as_index=True) it would become the result's index instead, and the
# ids would then be read back with:
# auth_ids = df2.index.tolist()
df2 = df.groupby("auth_id", as_index=False).count()

# Any auth_id whose auth_date count exceeds 1 occurs more than once.
df2.loc[df2["auth_date"] > 1]


Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
403,9000404,2,2,2,2
569,9000570,2,2,2,2


In [33]:
# Sort the per-id counts in place, largest first, so repeated ids come first.
df2.sort_values(by='auth_date', ascending=False, inplace=True)
df2.head(10)

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
403,9000404,2,2,2,2
569,9000570,2,2,2,2
0,9000001,1,1,1,1
1135,9001136,1,1,1,1
1144,9001145,1,1,1,1
1143,9001144,1,1,1,1
1142,9001143,1,1,1,1
1141,9001142,1,1,1,1
1140,9001141,1,1,1,1
1139,9001140,1,1,1,1


In [48]:
# Write the grouped counts to the 'Grouped' sheet of a new workbook.
# With to_excel's defaults the DataFrame index is written as the first column.
# NOTE(review): assumes the output/ directory already exists - confirm.
df2.to_excel(
    "output/101_Output_Grouped.xlsx",
    sheet_name='Grouped',
)