Load a simple MS Excel spreadsheet and list it

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Read the "Data" worksheet of the demo workbook into a DataFrame.
df = pd.read_excel(
    "./test_data/101_Demo_Data.xlsx",
    sheet_name="Data",
)

In [3]:
df.shape #shows the size of the dataframe as a (number of rows, number of columns) tuple

(1708, 5)

In [4]:
# List the column headers exactly as they were read from the spreadsheet.
list(df.columns)

['Auth_ID', 'Date', 'Cust ID', 'Amount', 'Date.1']

We convert the headers to lower case: Python is case sensitive, so consistent lower-case names are easier to work with.

In [5]:
# Normalise every header to lower case, then preview the first ten rows.
df.columns = df.columns.str.lower()
df.head(10)

Unnamed: 0,auth_id,date,cust id,amount,date.1
0,9000001,2020-03-22 13:20:10.350,26009,35.31,2020-04-22 13:20:10.350
1,9000002,2020-03-22 14:32:10.350,29752,19.71,2020-04-22 14:32:10.350
2,9000003,2020-03-22 15:44:10.350,6435,54.87,2020-04-22 15:44:10.350
3,9000004,2020-03-22 18:08:10.350,3233,58.51,2020-04-22 18:08:10.350
4,9000005,2020-03-22 19:01:30.350,25731,101.21,2020-04-22 19:01:30.350
5,9000006,2020-03-22 20:17:17.718,5728,32.95,2020-04-22 20:17:17.718
6,9000007,2020-03-22 21:29:17.718,26333,55.08,2020-04-22 21:29:17.718
7,9000008,2020-03-22 22:31:54.240,24002,41.08,2020-04-22 22:31:54.240
8,9000009,2020-03-22 23:23:19.954,20867,98.29,2020-04-22 23:23:19.954
9,9000010,2020-03-23 00:09:47.051,9497,0.37,2020-04-23 00:09:47.051


We remove spurious characters (spaces and dots) from the column titles

In [6]:
# Replace spaces and dots in the column names with underscores.
# regex=False makes both patterns literal: read as a regular expression, '.'
# matches any character, and the default value of `regex` differs between
# pandas versions, so we state it explicitly.
df.columns = df.columns.str.replace(' ', '_', regex=False)
df.columns = df.columns.str.replace('.', '_', regex=False)

df.iloc[0]  # show one particular record, selected by its integer position

auth_id                       9000001
date       2020-03-22 13:20:10.350000
cust_id                         26009
amount                          35.31
date_1     2020-04-22 13:20:10.350000
Name: 0, dtype: object

This is the simpler way: we just assign a complete new list of column titles

In [7]:
# Replace all headers in one go; the list names every column, in order.
new_cols = ['auth_id', 'date', 'customer_id', 'amount', 'exp_date']
df.columns = new_cols

df.head()  # a convenient way to preview the first 5 records

Unnamed: 0,auth_id,date,customer_id,amount,exp_date
0,9000001,2020-03-22 13:20:10.350,26009,35.31,2020-04-22 13:20:10.350
1,9000002,2020-03-22 14:32:10.350,29752,19.71,2020-04-22 14:32:10.350
2,9000003,2020-03-22 15:44:10.350,6435,54.87,2020-04-22 15:44:10.350
3,9000004,2020-03-22 18:08:10.350,3233,58.51,2020-04-22 18:08:10.350
4,9000005,2020-03-22 19:01:30.350,25731,101.21,2020-04-22 19:01:30.350


Yet another way of renaming dataframe columns. Passing inplace=True makes pandas modify the dataframe itself, so the result does not have to be assigned back (e.g. df = df.rename(...)).

In [8]:
# Rename a single column through an {old_name: new_name} mapping, modifying df itself.
column_renames = {'date': 'auth_date'}
df.rename(columns=column_renames, inplace=True)

df.tail()  # preview the last 5 records

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
1703,9001702,2020-06-19 07:35:56.585,1996,47.38,2020-07-20 07:35:56.585
1704,9001703,2020-06-19 08:47:56.585,2149,29.25,2020-07-20 08:47:56.585
1705,9001704,2020-06-19 09:20:40.222,3541,73.36,2020-07-20 09:20:40.222
1706,9001705,2020-06-19 10:20:40.222,19535,15.78,2020-07-20 10:20:40.222
1707,9001706,2020-06-19 11:29:14.508,24129,-3.82,2020-07-20 11:29:14.508


In [9]:
# Look up the row(s) whose auth_id equals a given value.
df.loc[df["auth_id"].eq(9001703)]

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
1704,9001703,2020-06-19 08:47:56.585,2149,29.25,2020-07-20 08:47:56.585


In [10]:
# Reorder df itself from the latest auth_date to the earliest, then show the top ten rows.
df.sort_values(by='auth_date', ascending=False, inplace=True)
df.head(10)

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
1707,9001706,2020-06-19 11:29:14.508,24129,-3.82,2020-07-20 11:29:14.508
1706,9001705,2020-06-19 10:20:40.222,19535,15.78,2020-07-20 10:20:40.222
1705,9001704,2020-06-19 09:20:40.222,3541,73.36,2020-07-20 09:20:40.222
1704,9001703,2020-06-19 08:47:56.585,2149,29.25,2020-07-20 08:47:56.585
1703,9001702,2020-06-19 07:35:56.585,1996,47.38,2020-07-20 07:35:56.585
1702,9001701,2020-06-19 06:23:56.585,25135,9.72,2020-07-20 06:23:56.585
1701,9001700,2020-06-19 04:23:56.585,16366,-11.2,2020-07-20 04:23:56.585
1700,9001699,2020-06-19 03:28:33.509,15365,-37.08,2020-07-20 03:28:33.509
1699,9001698,2020-06-19 02:08:33.509,27186,9.22,2020-07-20 02:08:33.509
1698,9001697,2020-06-19 01:03:06.236,18082,39.44,2020-07-20 01:03:06.236


In [11]:
# Combine two row filters: amount above 100 and auth_date falling in month 3 (March).
amount_over_100 = df["amount"] > 100
in_march = df["auth_date"].dt.month == 3
df[amount_over_100 & in_march]

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
168,9000169,2020-03-31 10:06:48.136,13322,114.62,2020-05-01 10:06:48.136
4,9000005,2020-03-22 19:01:30.350,25731,101.21,2020-04-22 19:01:30.350


This is how you test for duplicates

In [12]:
# keep=False flags every row whose auth_id occurs more than once, not only the repeats.
df[df.duplicated(subset=["auth_id"], keep=False)]

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
571,9000570,2020-04-21 08:18:13.943,15776,-43.65,2020-05-22 08:18:13.943
570,9000570,2020-04-21 08:18:13.943,15776,43.65,2020-05-22 08:18:13.943
403,9000404,2020-04-12 10:14:44.431,7433,20.56,2020-05-13 10:14:44.431
404,9000404,2020-04-12 10:14:44.431,7433,20.56,2020-05-13 10:14:44.431


Grouping and counting is not the best way to test for duplicates, but it can do the trick

In [13]:
# Count the non-null values of every column per auth_id.
# as_index=False keeps auth_id as an ordinary column; without it the auth_id
# values would become the index of df2 (read them back with df2.index.tolist()).
df2 = df.groupby(["auth_id"], as_index=False).count()

# Any auth_id with a count above 1 occurs more than once in df.
repeated = df2["auth_date"] > 1
df2[repeated]


Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
403,9000404,2,2,2,2
569,9000570,2,2,2,2


In [14]:
# Reorder df2 itself so the highest counts come first, then show the top ten rows.
df2.sort_values(by='auth_date', ascending=False, inplace=True)
df2.head(10)

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
403,9000404,2,2,2,2
569,9000570,2,2,2,2
0,9000001,1,1,1,1
1135,9001136,1,1,1,1
1144,9001145,1,1,1,1
1143,9001144,1,1,1,1
1142,9001143,1,1,1,1
1141,9001142,1,1,1,1
1140,9001141,1,1,1,1
1139,9001140,1,1,1,1


In [16]:
# Save the per-auth_id counts to an Excel workbook (sheet "Grouped").
# The output folder is created first so the export also works on a fresh
# checkout where ./output does not exist yet; to_excel does not create it.
output_path = Path("./output/101_Output_Grouped.xlsx")
output_path.parent.mkdir(parents=True, exist_ok=True)
df2.to_excel(output_path, sheet_name='Grouped')