Load a simple file in CSV and process it with Pandas library

In [15]:
import pandas as pd 
from datetime import date,timedelta, datetime

In [16]:
ts = datetime.now()
print("process started:", ts)

process started: 2020-11-28 19:54:30.867551


In [2]:
df= pd.read_csv("./test_data/101_Demo_Data.csv")

In [3]:
df.shape #shows the number of columns and rows

(1708, 8)

In [4]:
df.columns.to_list()

['Auth_ID',
 'Date Authorisation',
 'Cust ID',
 'Amount',
 'Date Procedure',
 'Region',
 'Vulnerable',
 'Notes']

We turn headers to lower case as python is case sensitive and it is better for consistency

In [5]:
df.columns=[x.lower() for x in df.columns.to_list()]
df[0:10]

Unnamed: 0,auth_id,date authorisation,cust id,amount,date procedure,region,vulnerable,notes
0,9000001,19/05/2020,3710,9.11,12/06/2020,London,False,"Test of delimiter, this should be kept all tog..."
1,9000002,19/05/2020,3378,11.65,21/07/2020,West,False,
2,9000003,19/05/2020,1963,13.52,21/06/2020,West,False,
3,9000004,19/05/2020,3565,77.57,23/05/2020,East,False,
4,9000005,19/05/2020,2249,21.37,26/07/2020,East,False,Customer may be vulnerable
5,9000006,19/05/2020,1669,12.6,26/07/2020,South,False,
6,9000007,19/05/2020,1589,26.89,11/06/2020,East,False,
7,9000008,19/05/2020,1596,43.98,16/06/2020,West,False,
8,9000009,19/05/2020,3507,6.09,06/08/2020,North,False,
9,9000010,19/05/2020,2543,11.36,27/06/2020,South,False,


we remove spurious characters from the title

In [6]:
df.columns=df.columns.str.strip()
df.columns=df.columns.str.replace(' ','_')
df.columns=df.columns.str.replace('.','_')
df.columns=df.columns.str.replace('__','_')
df.columns=df.columns.str.replace('__','_')
df.iloc[0] #this is how we list one particular record based on the index

auth_id                                                         9000001
date_authorisation                                           19/05/2020
cust_id                                                            3710
amount                                                             9.11
date_procedure                                               12/06/2020
region                                                           London
vulnerable                                                        False
notes                 Test of delimiter, this should be kept all tog...
Name: 0, dtype: object

This is the simpler way, we just set a new row of titles

In [7]:
new_cols = ['auth_id', 
            'date', 
            'customer_id',
           'amount',
           'date_claim',
           'region',
           'vulnerable_customer',
           'notes']
df.columns = new_cols

df.head() #this is a convenient way of showing the first 5 records

Unnamed: 0,auth_id,date,customer_id,amount,date_claim,region,vulnerable_customer,notes
0,9000001,19/05/2020,3710,9.11,12/06/2020,London,False,"Test of delimiter, this should be kept all tog..."
1,9000002,19/05/2020,3378,11.65,21/07/2020,West,False,
2,9000003,19/05/2020,1963,13.52,21/06/2020,West,False,
3,9000004,19/05/2020,3565,77.57,23/05/2020,East,False,
4,9000005,19/05/2020,2249,21.37,26/07/2020,East,False,Customer may be vulnerable


Yet another way of renaming dataframe columns, the inplace=True is used in Pandas to avoid having to assign the result to a dataframe, e.g. df= df.rename(.....)

In [8]:
df.rename(columns = {'date':'auth_date'},inplace = True)

df.tail() #shows the last 5 records

Unnamed: 0,auth_id,auth_date,customer_id,amount,date_claim,region,vulnerable_customer,notes
1703,9001701,14/08/2020,2212,37.38,,East,False,
1704,9001702,14/08/2020,1187,-7.79,,London,False,
1705,9001703,14/08/2020,2076,-13.82,,East,False,
1706,9001704,14/08/2020,2380,39.62,,West,False,
1707,9001705,14/08/2020,3980,10.56,30/08/2020,South,False,


In [10]:
df.dtypes

auth_id                  int64
auth_date               object
customer_id              int64
amount                 float64
date_claim              object
region                  object
vulnerable_customer     object
notes                   object
dtype: object

In [19]:
for col in df.columns.to_list():
    if col[0:4]=="date":#
        print(col)
        df[col]=pd.to_datetime(df[col],format='%d/%m/%Y')
df.dtypes


date_claim


auth_id                         int64
auth_date                      object
customer_id                     int64
amount                        float64
date_claim             datetime64[ns]
region                         object
vulnerable_customer            object
notes                          object
dtype: object

In [9]:
df[df["auth_id"]==9001703]

Unnamed: 0,auth_id,auth_date,customer_id,amount,date_claim,region,vulnerable_customer,notes
1705,9001703,14/08/2020,2076,-13.82,,East,False,


In [16]:

df.sort_values('auth_date', axis=0, ascending=False, inplace=True)
df[0:10]

Unnamed: 0,auth_id,auth_date,customer_id,amount,exp_date
1707,9001706,2020-06-19 11:29:14.508,24129,-3.82,2020-07-20 11:29:14.508
1706,9001705,2020-06-19 10:20:40.222,19535,15.78,2020-07-20 10:20:40.222
1705,9001704,2020-06-19 09:20:40.222,3541,73.36,2020-07-20 09:20:40.222
1704,9001703,2020-06-19 08:47:56.585,2149,29.25,2020-07-20 08:47:56.585
1703,9001702,2020-06-19 07:35:56.585,1996,47.38,2020-07-20 07:35:56.585
1702,9001701,2020-06-19 06:23:56.585,25135,9.72,2020-07-20 06:23:56.585
1701,9001700,2020-06-19 04:23:56.585,16366,-11.2,2020-07-20 04:23:56.585
1700,9001699,2020-06-19 03:28:33.509,15365,-37.08,2020-07-20 03:28:33.509
1699,9001698,2020-06-19 02:08:33.509,27186,9.22,2020-07-20 02:08:33.509
1698,9001697,2020-06-19 01:03:06.236,18082,39.44,2020-07-20 01:03:06.236


In [13]:
df[['auth_date','amount']].groupby([df['auth_date'].dt.strftime('%Y%m')]).agg({'auth_date':['min','max']},{'amount':['sum']})

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
df[(df.amount>100) & (df.auth_date.dt.month==3)]

This is how you test for duplicates

In [None]:
df[df.auth_id.duplicated(keep=False)]

This is not the best way to test for duplicates, but can do the trick

In [None]:
df2= df.groupby(["auth_id"], as_index = False ).count()
# If you dont include the as_index, the DF will have nested indexes, so to get the data use:
# list= df2.index.get_level_values(1).tolist() 

df2[df2.auth_date>1]


In [None]:
df2.sort_values('auth_date', axis=0, ascending=False, inplace=True)
df2[0:10]

In [None]:
df2.to_excel("./output/101_Output_Grouped.xlsx", sheet_name='Grouped')