### This script converts the Excel report exported from the xPortal2005 fingerprint scanning machine into tidy data for payroll purposes.

In [1]:
import pandas as pd

In [2]:
# sheet_name=None makes read_excel return every sheet as a dict of frames;
# the first three rows of each sheet are skipped. Concatenating the dict and
# resetting the index turns the (sheet, row) keys into ordinary columns.
df = pd.concat(
    pd.read_excel(
        'Attendance report.xls',
        sheet_name=None,
        skiprows=3,
    )
).reset_index()



In [3]:
# Preview the first five rows of the combined raw frame.
df.head()

Unnamed: 0,level_0,level_1,Day,Date,Unnamed: 2,CardNo,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,...,Dept,Unnamed: 32,Div,Unnamed: 34,Unnamed: 35,Job Title,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40
0,Sheet1,0,,,,Work-Recorded / Ammended,,,,,...,,Summary,,,,,,,,
1,Sheet1,1,,,,,IN,,,OUT,...,,,,,,,,,,
2,Sheet1,2,Wednesday,2019/08/28,,4,,,,,...,,Managemen,,N/Available,,Executive,,,,
3,Sheet1,3,,,,,00:00,,,00:00,...,,Absent ...,,,,,,,,
4,Sheet1,4,Wednesday,2019/08/28,,35,,,,,...,,Managemen,,N/Available,,Executive,,,,


### Get the four time-in / time-out pairs

In [4]:
def _get_tito_pair(first_col, second_col, header_label, first_name, second_name):
    """Extract one pair of time columns from the module-level ``df``.

    Parameters
    ----------
    first_col, second_col : str
        Names of the two raw columns to read from ``df``.
    header_label : str
        Text that marks a sub-header row in ``first_col`` (for example
        ``'IN'``); rows whose ``first_col`` contains it are discarded.
    first_name, second_name : str
        Tidy column names given to ``first_col`` and ``second_col``.

    Returns
    -------
    pandas.DataFrame
        Two columns (``first_name``, ``second_name``) with a 0..n-1 index.
    """
    # Keep only rows where both cells of the pair are filled in.
    pair = df[[first_col, second_col]].dropna().reset_index(drop=True)
    # na=False: a non-string cell cannot be the header label, so treat it as
    # data instead of producing a NaN in the mask (which would break `~`).
    is_header = pair[first_col].str.contains(header_label, na=False)
    return (
        pair[~is_header]
        .reset_index(drop=True)
        .rename(columns={first_col: first_name, second_col: second_name})
    )


def get_tito_1():
    """Return the first pair as columns ``In (1)`` and ``Out (1)``."""
    return _get_tito_pair('Unnamed: 4', 'Unnamed: 7', 'IN', 'In (1)', 'Out (1)')

tito_1 = get_tito_1()

def get_tito_2():
    """Return the second pair as columns ``In (2)`` and ``Out (2)``."""
    return _get_tito_pair('Staff No', 'Unnamed: 12', 'IN', 'In (2)', 'Out (2)')

tito_2 = get_tito_2()

def get_tito_3():
    """Return the third pair as columns ``Out (3)`` and ``In (3)``."""
    return _get_tito_pair('Unnamed: 15', 'Unnamed: 18', 'OUT', 'Out (3)', 'In (3)')

tito_3 = get_tito_3()

def get_tito_4():
    """Return the fourth pair as columns ``Out (4)`` and ``In (4)``."""
    return _get_tito_pair('Unnamed: 20', 'Unnamed: 22', 'OUT', 'Out (4)', 'In (4)')

tito_4 = get_tito_4()

### Get the rest of the columns

In [5]:
def get_staff_no(pattern='JL'):
    """Return the staff numbers found in the ``Staff No`` column of ``df``.

    Missing values are dropped and only entries matching ``pattern`` are kept.

    Parameters
    ----------
    pattern : str, default 'JL'
        Pattern passed to ``Series.str.contains``; an entry is kept when it
        matches. The default reproduces the previously hard-coded filter.

    Returns
    -------
    pandas.DataFrame
        A single ``Staff No`` column with a fresh 0..n-1 index.
    """
    staff_col = df[['Staff No']].dropna().reset_index(drop=True)
    # na=False: a non-string cell cannot match the pattern, so it is dropped
    # instead of leaving a NaN in the mask that aborts the boolean indexing.
    is_staff_no = staff_col['Staff No'].str.contains(pattern, na=False)
    return staff_col[is_staff_no].reset_index(drop=True)

# Staff-number column, later joined into the final frame.
staff_no = get_staff_no()

def get_total():
    """Return the ``Total`` column taken from ``Unnamed: 24`` of ``df``.

    Missing values are dropped, then every entry containing one of the
    header texts ``'Total'`` or ``'Work Hour'`` is removed. The result has a
    fresh 0..n-1 index.
    """
    totals = df[['Unnamed: 24']].dropna().rename(columns={'Unnamed: 24': 'Total'})
    # Strip the header texts one after the other.
    for header_text in ('Total', 'Work Hour'):
        totals = totals[~totals['Total'].str.contains(header_text)]
    return totals.reset_index(drop=True)

# Total column, later joined into the final frame.
total = get_total()

def get_branch_normal():
    """Split the ``Branch`` column of ``df`` into two single-column frames.

    After dropping missing values and entries containing ``'Normal'``, the
    remaining values are taken alternately: positions 0, 2, 4, ... stay in a
    ``Branch`` column and positions 1, 3, 5, ... go to a ``Normal`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(branch, normal)``, each with a fresh 0..n-1 index.
    """
    values = df[['Branch']].dropna()
    is_header = values['Branch'].str.contains('Normal')
    values = values[~is_header].reset_index(drop=True)

    branch_part = values.iloc[0::2].reset_index(drop=True)
    normal_part = (
        values.iloc[1::2]
        .rename(columns={'Branch': 'Normal'})
        .reset_index(drop=True)
    )
    return branch_part, normal_part

# Unpack the two frames returned by get_branch_normal(): Branch and Normal.
branch, normal = get_branch_normal()

def get_ot():
    """Return the ``OT`` column taken from ``Unnamed: 29`` of ``df``.

    Missing values are dropped and entries containing the header text
    ``'OT'`` are removed. The result has a fresh 0..n-1 index.
    """
    source_col = 'Unnamed: 29'
    ot_values = df[[source_col]].dropna().reset_index(drop=True)
    is_header = ot_values[source_col].str.contains('OT')
    data_rows = ot_values[~is_header].reset_index(drop=True)
    return data_rows.rename(columns={source_col: 'OT'})

# OT column, later joined into the final frame.
ot = get_ot()

def get_division_summary():
    """Split ``Unnamed: 32`` of ``df`` into two single-column frames.

    After dropping missing values and entries containing ``'Summary'``, the
    remaining values are taken alternately: positions 0, 2, 4, ... become the
    ``Department`` column and positions 1, 3, 5, ... the ``Summary`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(department, summary)``, each with a fresh 0..n-1 index.
    """
    source_col = 'Unnamed: 32'
    values = df[[source_col]].dropna().reset_index(drop=True)
    is_header = values[source_col].str.contains('Summary')
    values = values[~is_header].reset_index(drop=True)

    department_part = (
        values.iloc[0::2]
        .reset_index(drop=True)
        .rename(columns={source_col: 'Department'})
    )
    summary_part = (
        values.iloc[1::2]
        .reset_index(drop=True)
        .rename(columns={source_col: 'Summary'})
    )
    return department_part, summary_part

# Unpack the two frames returned by get_division_summary(): Department and Summary.
division, summary = get_division_summary()

In [6]:
# These columns need no header filtering: drop the empty cells and renumber.
day_date_cardno = (
    df[['Day', 'Date', 'CardNo']]
    .dropna()
    .reset_index(drop=True)
)
staff_name = (
    df[['Staff Name']]
    .dropna()
    .reset_index(drop=True)
)
status = (
    df[['Unnamed: 34']]
    .rename(columns={'Unnamed: 34': 'Status'})
    .dropna()
    .reset_index(drop=True)
)
job_title = (
    df[['Job Title']]
    .dropna()
    .reset_index(drop=True)
)

### Concatenate into the final DataFrame, clean the strings, and group by staff name for export to CSV files

In [7]:
# Every part was extracted independently and is joined purely by row position,
# so all parts must have the same number of rows. pd.concat(axis=1) would
# otherwise pad the shorter parts with NaN and silently misalign the records.
_final_parts = [day_date_cardno, staff_no, staff_name, division, job_title, branch, normal, summary, status,
                tito_1, tito_2, tito_3, tito_4, total, ot]
_part_lengths = [len(part) for part in _final_parts]
if len(set(_part_lengths)) > 1:
    raise ValueError(
        f"Extracted column groups have different row counts {_part_lengths}; "
        "they cannot be aligned row by row."
    )

final_df = pd.concat(_final_parts, axis=1)

In [8]:
# Tidy the staff names for the CSV export: trim surrounding white space and
# remove every '/' character.
final_df['Staff Name'] = (
    final_df['Staff Name']
    .str.strip()
    .str.replace('/', '')
)

In [9]:
# Show full cell contents without truncation. None is the supported way to
# disable the limit; the old -1 sentinel is deprecated and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

In [10]:
# Display the assembled frame as the cell output.
final_df

Unnamed: 0,Day,Date,CardNo,Staff No,Staff Name,Department,Job Title,Branch,Normal,Summary,...,In (1),Out (1),In (2),Out (2),Out (3),In (3),Out (4),In (4),Total,OT
0,Wednesday,2019/08/28,4,JL004,R.Thiagarajan AL S.Ramasamy,Managemen,Executive,HQ,00:00,Absent,...,00:00,00:00,00:00,00:00,00:00,00:00,00:00,00:00,00:00,00:00
1,Wednesday,2019/08/28,35,JL035,Muhammad Fikrie Hadi Bin Samson Azhar,Managemen,Executive,HQ,00:00,Absent,...,00:00,00:00,00:00,00:00,00:00,00:00,00:00,00:00,00:00,00:00
2,Wednesday,2019/08/28,20,JL020,Muhd Rasyid Bin Asiab,Managemen,N/Available,HQ,07:32,EarlyIn+EarlyOut,...,06:48,17:02,00:00,00:00,00:00,00:00,00:00,00:00,07:32,00:00
3,Wednesday,2019/08/28,40,JL040,YUSOF,Managemen,Executive,HQ,07:41,EarlyIn+EarlyOut+EarlyInFromLunch+InCompleteLunch,...,06:51,17:11,00:00,00:00,00:00,13:04,00:00,00:00,07:41,00:00
4,Wednesday,2019/08/28,29,JL029,Sanawiah,Admin,G.Clerk,HQ,07:34,EarlyIn+EarlyOut,...,07:28,17:04,00:00,00:00,00:00,00:00,00:00,00:00,07:34,00:00
5,Wednesday,2019/08/28,14,JL014,Anboalagan AL Suppiah,Managemen,G.Clerk,HQ,07:05,EarlyIn+EarlyOut,...,07:35,16:35,00:00,00:00,00:00,00:00,00:00,00:00,07:05,00:00
6,Wednesday,2019/08/28,33,JL033,Ahmad Azizi Bin Mawal,Managemen,G.Clerk,HQ,08:00,EarlyIn+LateOut,...,07:47,19:00,00:00,00:00,00:00,00:00,00:00,00:00,09:30,00:00
7,Wednesday,2019/08/28,2,JL002,Kieu Gee Vour,Managemen,Manager,HQ,08:00,EarlyIn+LateOut+EarlyInFromLunch+InCompleteLunch,...,07:51,19:04,00:00,00:00,00:00,13:08,00:00,00:00,09:34,00:00
8,Wednesday,2019/08/28,6,JL006,Tham Pek Fah,Managemen,Executive,HQ,07:31,EarlyIn+EarlyOut,...,07:55,17:01,00:00,00:00,00:00,00:00,00:00,00:00,07:31,00:00
9,Wednesday,2019/08/28,28,JL028,Rommi Bin Yatim,Managemen,Executive,HQ,08:00,EarlyIn+LateOut,...,07:57,19:04,00:00,00:00,00:00,00:00,00:00,00:00,09:34,00:00


In [11]:
# Group by staff name and write one CSV file per staff member.
# The loop variable is named `name`, not `staff_name`, so it does not overwrite
# the `staff_name` DataFrame that the concat cell needs when it is re-run.
grouped = final_df.groupby('Staff Name')

for i, (name, data) in enumerate(grouped, 1):
    # Files are numbered in group order: "<i>_<staff name>.csv".
    data.to_csv(f"{i}_{name}.csv")