In [1]:
import pandas as pd
import numpy as np

In [8]:
def extract_domain(tag):
    # Count the number of occurrences of 'Domain:'
    domain_count = tag.count('Domain:')
    
    # If there are multiple 'Domain:', mark as "DUP"
    if domain_count > 1:
        return "DUP"
    # Otherwise, extract the Domain
    else:
        match = pd.Series(tag).str.extract(r'Domain: ([\w\s]+),')
        return match.iloc[0, 0]
    
def extract_type(tag):
    print(f"Processing tag: {tag}")  # Debugging print
    type_count = tag.count('Type:')
    
    if type_count > 1:
        print("Multiple 'Type:' found.")  # Debugging print
        return "DUP"
    elif type_count == 1:
        match = pd.Series(tag).str.extract(r'Type: (.+)$')
        print(f"Extracted match: {match}")  # Debugging print
        if not match.empty and match.iloc[0, 0] is not None:
            return match.iloc[0, 0]
        else:
            print("No match found.")  # Debugging print
            return None
    else:
        print("'Type:' not found.")  # Debugging print
        return None

In [19]:
# GLOBAL VARIABLES
SEMESTER = "F2023"
drop_columns = ["Task", "Billable", "Amount ()"]
raw_read_csv = f'../data/{SEMESTER}.csv'
duplicate_types_csv = '../data/double_entry_type.csv'
duplicate_domain_csv = '../data/double_entry_domain.csv'
resave_dups = True

In [20]:
# import data file from semester
time_track_raw_df = pd.read_csv(raw_read_csv)

print(time_track_raw_df.dtypes)
print(time_track_raw_df.count())

Client          object
Project         object
Task           float64
Description     object
Billable        object
Start date      object
Start time      object
End date        object
End time        object
Duration        object
Tags            object
Amount ()      float64
dtype: object
Client         409
Project        419
Task             0
Description    419
Billable       419
Start date     419
Start time     419
End date       419
End time       419
Duration       419
Tags           419
Amount ()        0
dtype: int64


In [21]:
# remove columns
time_track_dropped = time_track_raw_df.drop(columns=drop_columns, inplace=False)

time_track_dropped.head()

Unnamed: 0,Client,Project,Description,Start date,Start time,End date,End time,Duration,Tags
0,,Personal Admin Maintenance,Toggl Config,2023-08-20,12:57:48,2023-08-20,13:11:42,00:13:54,"Domain: School, Type: Implementing"
1,,Personal Admin Maintenance,Climbing Email,2023-08-20,13:12:16,2023-08-20,13:14:02,00:01:46,"Domain: School, Type: Writing"
2,F2023,SWE,Syllabus,2023-08-20,14:00:09,2023-08-20,14:11:34,00:11:25,"Domain: School, Type: Implementing, Type: Reading"
3,F2023,GenAI & Art,Banquet Slides,2023-08-21,10:57:04,2023-08-21,10:57:39,00:00:35,"Domain: School, Type: Implementing"
4,F2023,GenAI & Art,Banquet Slides,2023-08-21,11:00:43,2023-08-21,11:19:58,00:19:15,"Domain: School, Type: Implementing"


In [22]:
# typecast column duration to timedelta
time_track_dropped["Duration"] = pd.to_timedelta(time_track_dropped["Duration"])

# typecast column date to datetime
time_track_dropped["Start date"] = pd.to_datetime(time_track_dropped["Start date"])
time_track_dropped["End date"] = pd.to_datetime(time_track_dropped["End date"])

# typecast Tags
time_track_dropped['Tags'] = time_track_dropped['Tags'].astype(str)

print(time_track_dropped.dtypes)

time_track_typed = time_track_dropped

Client                  object
Project                 object
Description             object
Start date      datetime64[ns]
Start time              object
End date        datetime64[ns]
End time                object
Duration       timedelta64[ns]
Tags                    object
dtype: object


In [23]:
# separate Domain and Type

# Extracting Domain and Type into separate columns
time_track_typed['Type'] = time_track_typed['Tags'].apply(extract_type)
time_track_typed['Domain'] = time_track_typed['Tags'].apply(extract_domain)

time_track_typed.head()

Processing tag: Domain: School, Type: Implementing
Extracted match:               0
0  Implementing
Processing tag: Domain: School, Type: Writing
Extracted match:          0
0  Writing
Processing tag: Domain: School, Type: Implementing, Type: Reading
Multiple 'Type:' found.
Processing tag: Domain: School, Type: Implementing
Extracted match:               0
0  Implementing
Processing tag: Domain: School, Type: Implementing
Extracted match:               0
0  Implementing
Processing tag: Domain: School, Type: Practice
Extracted match:           0
0  Practice
Processing tag: Domain: School, Type: Writing
Extracted match:          0
0  Writing
Processing tag: Domain: School, Type: Assessment
Extracted match:             0
0  Assessment
Processing tag: Domain: School, Type: Reading
Extracted match:          0
0  Reading
Processing tag: Domain: School, Type: Reading
Extracted match:          0
0  Reading
Processing tag: Domain: School, Type: Writing
Extracted match:          0
0  Writing
Pro

Unnamed: 0,Client,Project,Description,Start date,Start time,End date,End time,Duration,Tags,Type,Domain
0,,Personal Admin Maintenance,Toggl Config,2023-08-20,12:57:48,2023-08-20,13:11:42,0 days 00:13:54,"Domain: School, Type: Implementing",Implementing,School
1,,Personal Admin Maintenance,Climbing Email,2023-08-20,13:12:16,2023-08-20,13:14:02,0 days 00:01:46,"Domain: School, Type: Writing",Writing,School
2,F2023,SWE,Syllabus,2023-08-20,14:00:09,2023-08-20,14:11:34,0 days 00:11:25,"Domain: School, Type: Implementing, Type: Reading",DUP,School
3,F2023,GenAI & Art,Banquet Slides,2023-08-21,10:57:04,2023-08-21,10:57:39,0 days 00:00:35,"Domain: School, Type: Implementing",Implementing,School
4,F2023,GenAI & Art,Banquet Slides,2023-08-21,11:00:43,2023-08-21,11:19:58,0 days 00:19:15,"Domain: School, Type: Implementing",Implementing,School


In [24]:
# look for double-entry tags and remove them

if (time_track_typed.loc[time_track_typed['Domain'] == "DUP"].size > 0): 
    print("WARNING: Double entry in Domain")
    print(f"\tEntries will be removed and placed into {duplicate_domain_csv}")
    print("\tThe above .csv will be re-included at the beginning of the analysis script")

    
    # filter out entries where Domain == DUP and put in its own csv
    if (resave_dups):
        time_track_duplicated = time_track_typed.loc[time_track_typed['Domain'] == "DUP"]
        time_track_duplicated.to_csv(duplicate_domain_csv)


    # remove entries where Domain == DUP
    time_track_typed = time_track_typed.loc[time_track_typed['Domain'] != "DUP"]

else: 
    print("No double entries in Domain")
    # save empty dataframe to csv
    time_track_duplicated = pd.DataFrame(columns=time_track_typed.columns)
    time_track_duplicated.to_csv(duplicate_domain_csv)

if (time_track_typed.loc[time_track_typed['Type'] == "DUP"].size > 0): 
    print("Double entry in Type")
    print(f"\tEntries will be removed and placed into {duplicate_types_csv}")
    print("\tThe above .csv will be re-included at the beginning of the analysis script")
    
    # filter out entries where Type == DUP and put in its own csv
    if (resave_dups):
        time_track_duplicated = time_track_typed.loc[time_track_typed['Type'] == "DUP"]
        time_track_duplicated.to_csv(duplicate_types_csv)

    # remove entries where Type == DUP
    time_track_typed = time_track_typed.loc[time_track_typed['Type'] != "DUP"]
    
else: 
    print("No double entries in Entry")
    # save empty dataframe with columns to csv
    time_track_duplicated = pd.DataFrame(columns=time_track_typed.columns)
    time_track_duplicated.to_csv(duplicate_types_csv)

No double entries in Domain
Double entry in Type
	Entries will be removed and placed into ../data/double_entry_type.csv
	The above .csv will be re-included at the beginning of the analysis script


# Post-Duplication Step

In [11]:
t = input(f"Check {duplicate_domain_csv} and {duplicate_types_csv} for double entries. Press enter to continue.")

In [25]:
# ENSURE THAT ALL DUPLICATED ENTRIES HAVE BEEN TAKEN CARE OF
print(f"All entries in {duplicate_domain_csv} and {duplicate_types_csv} will be incorporated into the final analysis")

# types
try:    
    duplicate_types_df = pd.read_csv(duplicate_types_csv)
    print(f"Found {duplicate_types_csv}. Incorporating into final analysis")
except:
    print(f"Could not find {duplicate_types_csv}. Creating empty dataframe")

    # create empty dataframe with columns of clean_df  
    duplicate_types_df = pd.DataFrame(columns=time_track_typed.columns)

# domains
try: 
    duplicate_domains_df = pd.read_csv(duplicate_domain_csv)
    print(f"Found {duplicate_domain_csv}. Incorporating into final analysis")
except: 
    print(f"Could not find {duplicate_domain_csv}. Creating empty dataframe")

    # create empty dataframe with columns of clean_df  
    duplicate_domains_df = pd.DataFrame(columns=time_track_typed.columns)

All entries in ../data/double_entry_domain.csv and ../data/double_entry_type.csv will be incorporated into the final analysis
Found ../data/double_entry_type.csv. Incorporating into final analysis
Found ../data/double_entry_domain.csv. Incorporating into final analysis


In [26]:
# combine clean and duplicates frames
combined_df = pd.concat([time_track_typed, duplicate_types_df, duplicate_domains_df], ignore_index=True)

In [27]:
# replace NaNs in type and domain with filler
combined_df['Type'] = combined_df['Type'].fillna('Not Specified')
combined_df['Domain'] = combined_df['Domain'].fillna('Not Specified')

In [28]:
# load class_info.csv into info_df
class_df = pd.read_csv('../data/ClassInfo.csv')

# cross reference classes
combined_df = combined_df.loc[combined_df['Project'].isin(class_df['Toggl Name'])]

# remove unnamed: 0 column
if 'Unnamed: 0' in combined_df.columns:
    combined_df.drop(columns=['Unnamed: 0'], inplace=True)

display(combined_df.head())

Unnamed: 0,Client,Project,Description,Start date,Start time,End date,End time,Duration,Tags,Type,Domain
2,F2023,GenAI & Art,Banquet Slides,2023-08-21 00:00:00,10:57:04,2023-08-21 00:00:00,10:57:39,0 days 00:00:35,"Domain: School, Type: Implementing",Implementing,School
3,F2023,GenAI & Art,Banquet Slides,2023-08-21 00:00:00,11:00:43,2023-08-21 00:00:00,11:19:58,0 days 00:19:15,"Domain: School, Type: Implementing",Implementing,School
4,F2023,Algorithms,Pre-Quiz,2023-08-21 00:00:00,20:16:02,2023-08-21 00:00:00,20:28:49,0 days 00:12:47,"Domain: School, Type: Practice",Practice,School
5,F2023,SWE,C01A,2023-08-21 00:00:00,20:39:26,2023-08-21 00:00:00,20:46:14,0 days 00:06:48,"Domain: School, Type: Writing",Writing,School
6,F2023,SWE,C01S: OOP Survey,2023-08-21 00:00:00,20:46:53,2023-08-21 00:00:00,20:49:40,0 days 00:02:47,"Domain: School, Type: Assessment",Assessment,School


In [29]:
# save cleaned data to csv
combined_df.to_csv(f'../data/{SEMESTER}_cleaned.csv')