In [15]:
import pandas as pd
import numpy as np
import cleaning_functions

In [16]:
def extract_domain(tag):
    # Count the number of occurrences of 'Domain:'
    domain_count = tag.count('Domain:')
    
    # If there are multiple 'Domain:', mark as "DUP"
    if domain_count > 1:
        return "DUP"
    # Otherwise, extract the Domain
    else:
        match = pd.Series(tag).str.extract(r'Domain: ([\w\s]+),')
        return match.iloc[0, 0]
    
def extract_type(tag):
    print(f"Processing tag: {tag}")  # Debugging print
    type_count = tag.count('Type:')
    
    if type_count > 1:
        print("Multiple 'Type:' found.")  # Debugging print
        return "DUP"
    elif type_count == 1:
        match = pd.Series(tag).str.extract(r'Type: (.+)$')
        print(f"Extracted match: {match}")  # Debugging print
        if not match.empty and match.iloc[0, 0] is not None:
            return match.iloc[0, 0]
        else:
            print("No match found.")  # Debugging print
            return None
    else:
        print("'Type:' not found.")  # Debugging print
        return None

In [17]:
# GLOBAL VARIABLES
drop_columns = ["Task", "Billable", "Amount ()"]
raw_read_csv = '../data/F2023.csv'
duplicate_types_csv = '../data/double_entry_type.csv'
duplicate_domain_csv = '../data/double_entry_domain.csv'
resave_dups = False

In [18]:
# import data file from semester

time_track_raw_df = pd.read_csv(raw_read_csv)

print(time_track_raw_df.dtypes)

Client          object
Project         object
Task           float64
Description     object
Billable        object
Start date      object
Start time      object
End date        object
End time        object
Duration        object
Tags            object
Amount ()      float64
dtype: object


In [19]:
# remove columns
time_track_dropped = time_track_raw_df.drop(columns=drop_columns, inplace=False)

time_track_dropped.head()

Unnamed: 0,Client,Project,Description,Start date,Start time,End date,End time,Duration,Tags
0,F2023,GenAI & Art,Banquet Slides,2023-08-21,10:57:04,2023-08-21,10:57:39,00:00:35,"Domain: School, Type: Implementing"
1,F2023,GenAI & Art,Banquet Slides,2023-08-21,11:00:43,2023-08-21,11:19:58,00:19:15,"Domain: School, Type: Implementing"
2,F2023,Algorithms,Pre-Quiz,2023-08-21,20:16:02,2023-08-21,20:28:49,00:12:47,"Domain: School, Type: Practice"
3,F2023,SWE,C01A,2023-08-21,20:39:26,2023-08-21,20:46:14,00:06:48,"Domain: School, Type: Writing"
4,F2023,SWE,C01S: OOP Survey,2023-08-21,20:46:53,2023-08-21,20:49:40,00:02:47,"Domain: School, Type: Assessment"


In [20]:
# typecast column duration to timedelta
time_track_dropped["Duration"] = pd.to_timedelta(time_track_dropped["Duration"])

# typecast column date to datetime
time_track_dropped["Start date"] = pd.to_datetime(time_track_dropped["Start date"])
time_track_dropped["End date"] = pd.to_datetime(time_track_dropped["End date"])

# typecast Tags
time_track_dropped['Tags'] = time_track_dropped['Tags'].astype(str)

print(time_track_dropped.dtypes)

time_track_typed = time_track_dropped

Client                  object
Project                 object
Description             object
Start date      datetime64[ns]
Start time              object
End date        datetime64[ns]
End time                object
Duration       timedelta64[ns]
Tags                    object
dtype: object


In [21]:
# separate Domain and Type

# Extracting Domain and Type into separate columns
time_track_typed['Type'] = time_track_typed['Tags'].apply(extract_type)
time_track_typed['Domain'] = time_track_typed['Tags'].apply(extract_domain)

time_track_typed.head()

Processing tag: Domain: School, Type: Implementing
Extracted match:               0
0  Implementing
Processing tag: Domain: School, Type: Implementing
Extracted match:               0
0  Implementing
Processing tag: Domain: School, Type: Practice
Extracted match:           0
0  Practice
Processing tag: Domain: School, Type: Writing
Extracted match:          0
0  Writing
Processing tag: Domain: School, Type: Assessment
Extracted match:             0
0  Assessment
Processing tag: Domain: School, Type: Reading
Extracted match:          0
0  Reading
Processing tag: Domain: School, Type: Reading
Extracted match:          0
0  Reading
Processing tag: Domain: School, Type: Writing
Extracted match:          0
0  Writing
Processing tag: Domain: School, Type: Implementing
Extracted match:               0
0  Implementing
Processing tag: Domain: School, Type: Reading
Extracted match:          0
0  Reading
Processing tag: Domain: School, Type: Reading
Extracted match:          0
0  Reading
Processi

Unnamed: 0,Client,Project,Description,Start date,Start time,End date,End time,Duration,Tags,Type,Domain
0,F2023,GenAI & Art,Banquet Slides,2023-08-21,10:57:04,2023-08-21,10:57:39,0 days 00:00:35,"Domain: School, Type: Implementing",Implementing,School
1,F2023,GenAI & Art,Banquet Slides,2023-08-21,11:00:43,2023-08-21,11:19:58,0 days 00:19:15,"Domain: School, Type: Implementing",Implementing,School
2,F2023,Algorithms,Pre-Quiz,2023-08-21,20:16:02,2023-08-21,20:28:49,0 days 00:12:47,"Domain: School, Type: Practice",Practice,School
3,F2023,SWE,C01A,2023-08-21,20:39:26,2023-08-21,20:46:14,0 days 00:06:48,"Domain: School, Type: Writing",Writing,School
4,F2023,SWE,C01S: OOP Survey,2023-08-21,20:46:53,2023-08-21,20:49:40,0 days 00:02:47,"Domain: School, Type: Assessment",Assessment,School


In [22]:
# look for double-entry tags and remove them

if (time_track_typed.loc[time_track_typed['Domain'] == "DUP"].size > 0): 
    print("WARNING: Double entry in Domain")
    print(f"\tEntries will be removed and placed into {duplicate_domain_csv}")
    print("\tThe above .csv will be re-included at the beginning of the analysis script")

    
    # filter out entries where Domain == DUP and put in its own csv
    if (resave_dups):
        time_track_duplicated = time_track_typed.loc[time_track_typed['Domain'] == "DUP"]
        time_track_duplicated.to_csv(duplicate_domain_csv)


    # remove entries where Domain == DUP
    time_track_typed = time_track_typed.loc[time_track_typed['Domain'] != "DUP"]

if (time_track_typed.loc[time_track_typed['Type'] == "DUP"].size > 0): 
    print("Double entry in Type")
    print(f"\tEntries will be removed and placed into {duplicate_types_csv}")
    print("\tThe above .csv will be re-included at the beginning of the analysis script")
    
    # filter out entries where Type == DUP and put in its own csv
    if (resave_dups):
        time_track_duplicated = time_track_typed.loc[time_track_typed['Type'] == "DUP"]
        time_track_duplicated.to_csv(duplicate_types_csv)

    # remove entries where Type == DUP
    time_track_typed = time_track_typed.loc[time_track_typed['Type'] != "DUP"]



Double entry in Type
	Entries will be removed and placed into ../data/double_entry_type.csv
	The above .csv will be re-included at the beginning of the analysis script


In [23]:
# save cleaned data to csv
time_track_typed.to_csv('../data/cleaned.csv')