Load a simple file in CSV and process it with Pandas library

In [1]:
import pandas as pd 
import numpy as np
from datetime import date,timedelta, datetime
import re # Regular expression library 
import unicodedata # Used to downgrade unicode strings to ascii for cleaning up columns 

ts = datetime.now()
print("process started:", ts)

process started: 2021-03-20 13:50:29.642133


In [50]:
#These are convenience functions to use when printing tests results
def print_red(*args):
    print("\x1b[31m"+" ".join([str(x) for x in args])+"\x1b[0m")
def print_green(*args):
    print("\x1b[32m"+" ".join([str(x) for x in args])+"\x1b[0m")
print_red("testing, this should be printed in colour red")
print_green("testing, this should be printing in colour green")

[31mtesting, this should be printed in colour red[0m
[32mtesting, this should be printing in colour green[0m


In [2]:
folder_path="./output/"
save_to_excel_active=True

#This convenience function is to make saving exceptions to excel much cleaner.

def to_excel(df,name):
    print("Will save to excel: ",folder_path+name+".xlsx")
    print("Rows to save: "+ str(df.shape[0]))
    print("Columns to save: "+ str(df.shape[1]))
    if save_to_excel_active==True:
        df.to_excel(folder_path + name + ".xlsx", index=False,sheet_name=name, float_format="%.2f")
        print("Save completed")
    else:
        print ("Skipped saving")
    print("Timestamp completion: ", datetime.now())
    

In [3]:
df= pd.read_csv("./test_data/101_Demo_Data.csv")

In [9]:
df.shape #shows the number of columns and rows

(1708, 8)

In [5]:
df.columns.to_list()

['Auth_ID',
 'Date Authorisation',
 'Cust ID',
 'Amount',
 'Date Procedure',
 'Region',
 'Vulnerable',
 'Notes']

We turn headers to lower case as python is case sensitive and it is better for consistency

In [6]:
import re
import unicodedata 
def clean_string(s):
    # this is a fairly aggressive cleanup function, which also may be slow as it uses unicode checks and regular expression
    # if you are going to apply something like this to a large dataset you need to consider simpler and faster approaches
    s=unicodedata.normalize('NFKD', s).encode('ascii','ignore').decode("utf-8") # we convert unicode to a standard ascii
    s=s.lower()
    s=re.sub('[^a-z0-9 ]'," ",s) # Replace non letters/digits by space
    s=re.sub(' +',"_",s) # replace any space by underscore, multiple spaces collapsed
    s=re.sub("_$","",s) # last trailing space/underscore trimmed
    s=s.strip() # trim any space 
    return s
test_string="Test accented: àéêöñ \tVarious symbols: £$%#*€#~!.,    \nEnd"
print(test_string)
print(clean_string(test_string))

def clean_list(l):
    # apply the clean_string to a list, but checking that it doesnt create duplicate elements in the process
    # you may want to make this function customisable by having the duplicate check optional or applying some
    # other technique to deal with duplicates

    l_clean =[clean_string(x) for x in l]
    l_clean_no_dupes=[]
    for e in l_clean:
        if not (e in l_clean_no_dupes):
            l_clean_no_dupes.append(e)
        else:
            n=2
            new_e= e +"_"+ str(n)
            while new_e in l_clean_no_dupes:
                n=n+1
                new_e=e+"_"+str(n)
                if n>100000:
                    raise Exception("Sorry, too long trying to find a non duplicated name")
            l_clean_no_dupes.append(new_e)
    return l_clean_no_dupes
clean_list(["one","Two","Three and a half","One","Oñe","O N E"])


Test accented: àéêöñ 	Various symbols: £$%#*€#~!.,    
End
test_accented_aeeon_various_symbols_end


['one', 'two', 'three_and_a_half', 'one_2', 'one_3', 'o_n_e']

In [7]:
df.columns=clean_list(df.columns.to_list())
df[0:10]

Unnamed: 0,auth_id,date_authorisation,cust_id,amount,date_procedure,region,vulnerable,notes
0,9000001,19/05/2020,3710,9.11,12/06/2020,London,False,"Test of delimiter, this should be kept all tog..."
1,9000002,19/05/2020,3378,11.65,21/07/2020,West,False,
2,9000003,19/05/2020,1963,13.52,21/06/2020,West,False,
3,9000004,19/05/2020,3565,77.57,23/05/2020,East,False,
4,9000005,19/05/2020,2249,21.37,26/07/2020,East,False,Customer may be vulnerable
5,9000006,19/05/2020,1669,12.6,26/07/2020,South,False,
6,9000007,19/05/2020,1589,26.89,11/06/2020,East,False,
7,9000008,19/05/2020,1596,43.98,16/06/2020,West,False,
8,9000009,19/05/2020,3507,6.09,06/08/2020,North,False,
9,9000010,19/05/2020,2543,11.36,27/06/2020,South,False,


we remove spurious characters from the title

In [8]:
df.iloc[0] #this is how we list one particular record based on the index

auth_id                                                         9000001
date_authorisation                                           19/05/2020
cust_id                                                            3710
amount                                                             9.11
date_procedure                                               12/06/2020
region                                                           London
vulnerable                                                        False
notes                 Test of delimiter, this should be kept all tog...
Name: 0, dtype: object

This is the simpler way, we just set a new row of titles

In [10]:
new_cols = ['auth_id', 
            'date', 
            'customer_id',
           'amount',
           'date_claim',
           'region',
           'vulnerable_customer',
           'notes']
df.columns = new_cols

df.head() #this is a convenient way of showing the first 5 records

Unnamed: 0,auth_id,date,customer_id,amount,date_claim,region,vulnerable_customer,notes
0,9000001,19/05/2020,3710,9.11,12/06/2020,London,False,"Test of delimiter, this should be kept all tog..."
1,9000002,19/05/2020,3378,11.65,21/07/2020,West,False,
2,9000003,19/05/2020,1963,13.52,21/06/2020,West,False,
3,9000004,19/05/2020,3565,77.57,23/05/2020,East,False,
4,9000005,19/05/2020,2249,21.37,26/07/2020,East,False,Customer may be vulnerable


Yet another way of renaming dataframe columns, the inplace=True is used in Pandas to avoid having to assign the result to a dataframe, e.g. df= df.rename(.....)

In [11]:
df.rename(columns = {'date':'date_auth'},inplace = True)

df.tail() #shows the last 5 records

Unnamed: 0,auth_id,date_auth,customer_id,amount,date_claim,region,vulnerable_customer,notes
1703,9001701,14/08/2020,2212,37.38,,East,False,
1704,9001702,14/08/2020,1187,-7.79,,London,False,
1705,9001703,14/08/2020,2076,-13.82,,East,False,
1706,9001704,14/08/2020,2380,39.62,,West,False,
1707,9001705,14/08/2020,3980,10.56,30/08/2020,South,False,


In [12]:
df.dtypes

auth_id                  int64
date_auth               object
customer_id              int64
amount                 float64
date_claim              object
region                  object
vulnerable_customer     object
notes                   object
dtype: object

In [13]:
for col in df.columns.to_list():
    if col[0:4]=="date":#
        print(col)
        df[col]=pd.to_datetime(df[col],format='%d/%m/%Y')
df.dtypes


date_auth
date_claim


auth_id                         int64
date_auth              datetime64[ns]
customer_id                     int64
amount                        float64
date_claim             datetime64[ns]
region                         object
vulnerable_customer            object
notes                          object
dtype: object

In [14]:
df[df["auth_id"]==9001703]

Unnamed: 0,auth_id,date_auth,customer_id,amount,date_claim,region,vulnerable_customer,notes
1705,9001703,2020-08-14,2076,-13.82,NaT,East,False,


In [15]:

df.sort_values('date_auth', axis=0, ascending=False, inplace=True)
df[0:10]

Unnamed: 0,auth_id,date_auth,customer_id,amount,date_claim,region,vulnerable_customer,notes
1707,9001705,2020-08-14,3980,10.56,2020-08-30,South,False,
1701,9001699,2020-08-14,1072,20.04,NaT,South,False,
1695,9001693,2020-08-14,2368,64.27,2020-10-20,North,False,
1696,9001694,2020-08-14,3583,-14.66,2020-09-21,East,False,
1698,9001696,2020-08-14,2508,-28.21,2020-08-19,West,False,
1699,9001697,2020-08-14,3441,12.01,NaT,West,False,
1700,9001698,2020-08-14,2284,5.37,2020-10-20,North,False,
1697,9001695,2020-08-14,3914,72.66,2020-11-09,London,False,
1702,9001700,2020-08-14,3758,45.89,NaT,East,False,
1703,9001701,2020-08-14,2212,37.38,NaT,East,False,


This is an obscure way of defining the column names for the aggregation, you can add lambda functions there

In [16]:
df[['date_auth','amount']].groupby([df['date_auth'].dt.strftime('%Y%m')]).agg(min_date=('date_auth','min'),max_date=("date_auth",'max'),total=('amount','sum'))

Unnamed: 0_level_0,min_date,max_date,total
date_auth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
202005,2020-05-19,2020-05-31,6967.56
202006,2020-06-01,2020-06-30,17785.36
202007,2020-07-01,2020-07-31,18434.82
202008,2020-08-01,2020-08-14,7799.8


In [17]:
df[(df.amount>100) & (df.date_auth.dt.month==5)]

Unnamed: 0,auth_id,date_auth,customer_id,amount,date_claim,region,vulnerable_customer,notes
206,9000207,2020-05-29,2398,104.12,2020-09-03,East,False,
185,9000186,2020-05-28,1301,100.34,NaT,East,False,


This is how you test for duplicates

In [18]:
df[df.auth_id.duplicated(keep=False)]

Unnamed: 0,auth_id,date_auth,customer_id,amount,date_claim,region,vulnerable_customer,notes
570,9000569,2020-06-17,3869,30.62,2020-07-13,East,False,
571,9000569,2020-06-17,1121,-30.62,2020-09-11,East,False,
404,9000403,2020-06-09,2436,10.7,2020-08-03,South,False,
403,9000403,2020-06-09,1420,10.7,2020-07-13,South,False,
333,9000333,2020-06-05,3050,24.31,2020-08-14,East,False,
332,9000333,2020-06-05,3050,24.31,2020-08-14,East,False,


This is not the best way to test for duplicates, but can do the trick

In [19]:
df2= df.groupby(["auth_id"] ).size().reset_index(name="count")
df2

Unnamed: 0,auth_id,count
0,9000001,1
1,9000002,1
2,9000003,1
3,9000004,1
4,9000005,1
...,...,...
1700,9001701,1
1701,9001702,1
1702,9001703,1
1703,9001704,1


In [20]:
df2.sort_values('count', axis=0, ascending=False, inplace=True)
df2[0:10]

Unnamed: 0,auth_id,count
402,9000403,2
568,9000569,2
332,9000333,2
0,9000001,1
1136,9001137,1
1145,9001146,1
1144,9001145,1
1143,9001144,1
1142,9001143,1
1141,9001142,1


In [21]:
df2.to_excel("./output/101_Output_Grouped.xlsx", sheet_name='Grouped')

Core version: 8.1.2
Pillow version: 8.1.0


FileNotFoundError: [Errno 2] No such file or directory: './output/101_Output_Grouped.xlsx'