In [48]:
import pandas as pd
import numpy as np

In [49]:
def extract_domain(tag):
    # Count the number of occurrences of 'Domain:'
    domain_count = tag.count('Domain:')
    
    # If there are multiple 'Domain:', mark as "DUP"
    if domain_count > 1:
        return "DUP"
    # Otherwise, extract the Domain
    else:
        match = pd.Series(tag).str.extract(r'Domain: ([\w\s]+),')
        return match.iloc[0, 0]
    
def extract_type(tag):
    print(f"Processing tag: {tag}")  # Debugging print
    type_count = tag.count('Type:')
    
    if type_count > 1:
        print("Multiple 'Type:' found.")  # Debugging print
        return "DUP"
    elif type_count == 1:
        match = pd.Series(tag).str.extract(r'Type: (.+)$')
        print(f"Extracted match: {match}")  # Debugging print
        if not match.empty and match.iloc[0, 0] is not None:
            return match.iloc[0, 0]
        else:
            print("No match found.")  # Debugging print
            return None
    else:
        print("'Type:' not found.")  # Debugging print
        return None

In [50]:
# GLOBAL VARIABLES
SEMESTER = "S2024"
drop_columns = ["Task", "Billable", "Amount ()"]
raw_read_csv = f'../data/{SEMESTER}.csv'
duplicate_types_csv = '../data/double_entry_type.csv'
duplicate_domain_csv = '../data/double_entry_domain.csv'
resave_dups = True

In [51]:
# import data file from semester
time_track_raw_df = pd.read_csv(raw_read_csv)

print(time_track_raw_df.dtypes)
print(time_track_raw_df.count())

Client          object
Project         object
Task           float64
Description     object
Billable        object
Start date      object
Start time      object
End date        object
End time        object
Duration        object
Tags            object
Amount ()      float64
dtype: object
Client         345
Project        366
Task             0
Description    366
Billable       366
Start date     366
Start time     366
End date       366
End time       366
Duration       366
Tags           365
Amount ()        0
dtype: int64


In [52]:
# remove columns
time_track_dropped = time_track_raw_df.drop(columns=drop_columns, inplace=False)

time_track_dropped.head()

Unnamed: 0,Client,Project,Description,Start date,Start time,End date,End time,Duration,Tags
0,,Practicum - My Practicum,Practicum Proposal,2024-01-07,19:02:44,2024-01-07,20:13:46,01:11:02,"Domain: School, Type: Research, Type: Writing"
1,,Practicum - My Practicum,Practicum Proposal,2024-01-07,20:24:29,2024-01-07,20:52:59,00:28:30,"Domain: School, Type: Research, Type: Writing"
2,,Practicum - My Practicum,Practicum Proposal,2024-01-08,09:29:40,2024-01-08,10:58:47,01:29:07,"Domain: School, Type: Research, Type: Writing"
3,,Practicum - My Practicum,Practicum Proposal,2024-01-08,11:03:51,2024-01-08,11:15:54,00:12:03,"Domain: School, Type: Research, Type: Writing"
4,S2024,Social Media and Misinformation,ClickUp Scheduling,2024-01-09,13:52:29,2024-01-09,14:46:40,00:54:11,"Domain: School, Type: Implementing"


In [53]:
# typecast column duration to timedelta
time_track_dropped["Duration"] = pd.to_timedelta(time_track_dropped["Duration"])

# typecast column date to datetime
time_track_dropped["Start date"] = pd.to_datetime(time_track_dropped["Start date"])
time_track_dropped["End date"] = pd.to_datetime(time_track_dropped["End date"])

# typecast Tags
time_track_dropped['Tags'] = time_track_dropped['Tags'].astype(str)

print(time_track_dropped.dtypes)

time_track_typed = time_track_dropped

Client                  object
Project                 object
Description             object
Start date      datetime64[ns]
Start time              object
End date        datetime64[ns]
End time                object
Duration       timedelta64[ns]
Tags                    object
dtype: object


In [54]:
# separate Domain and Type

# Extracting Domain and Type into separate columns
time_track_typed['Type'] = time_track_typed['Tags'].apply(extract_type)
time_track_typed['Domain'] = time_track_typed['Tags'].apply(extract_domain)

time_track_typed.head()

Processing tag: Domain: School, Type: Research, Type: Writing
Multiple 'Type:' found.
Processing tag: Domain: School, Type: Research, Type: Writing
Multiple 'Type:' found.
Processing tag: Domain: School, Type: Research, Type: Writing
Multiple 'Type:' found.
Processing tag: Domain: School, Type: Research, Type: Writing
Multiple 'Type:' found.
Processing tag: Domain: School, Type: Implementing
Extracted match:               0
0  Implementing
Processing tag: Domain: School, Type: Implementing
Extracted match:               0
0  Implementing
Processing tag: Domain: School, Type: Implementing
Extracted match:               0
0  Implementing
Processing tag: Domain: School, Type: Implementing
Extracted match:               0
0  Implementing
Processing tag: Domain: School, Type: Implementing
Extracted match:               0
0  Implementing
Processing tag: Domain: School, Type: Assessment
Extracted match:             0
0  Assessment
Processing tag: Domain: School, Type: Reading
Extracted match:

Unnamed: 0,Client,Project,Description,Start date,Start time,End date,End time,Duration,Tags,Type,Domain
0,,Practicum - My Practicum,Practicum Proposal,2024-01-07,19:02:44,2024-01-07,20:13:46,0 days 01:11:02,"Domain: School, Type: Research, Type: Writing",DUP,School
1,,Practicum - My Practicum,Practicum Proposal,2024-01-07,20:24:29,2024-01-07,20:52:59,0 days 00:28:30,"Domain: School, Type: Research, Type: Writing",DUP,School
2,,Practicum - My Practicum,Practicum Proposal,2024-01-08,09:29:40,2024-01-08,10:58:47,0 days 01:29:07,"Domain: School, Type: Research, Type: Writing",DUP,School
3,,Practicum - My Practicum,Practicum Proposal,2024-01-08,11:03:51,2024-01-08,11:15:54,0 days 00:12:03,"Domain: School, Type: Research, Type: Writing",DUP,School
4,S2024,Social Media and Misinformation,ClickUp Scheduling,2024-01-09,13:52:29,2024-01-09,14:46:40,0 days 00:54:11,"Domain: School, Type: Implementing",Implementing,School


In [55]:
# look for double-entry tags and remove them

if (time_track_typed.loc[time_track_typed['Domain'] == "DUP"].size > 0): 
    print("WARNING: Double entry in Domain")
    print(f"\tEntries will be removed and placed into {duplicate_domain_csv}")
    print("\tThe above .csv will be re-included at the beginning of the analysis script")

    
    # filter out entries where Domain == DUP and put in its own csv
    if (resave_dups):
        time_track_duplicated = time_track_typed.loc[time_track_typed['Domain'] == "DUP"]
        time_track_duplicated.to_csv(duplicate_domain_csv)


    # remove entries where Domain == DUP
    time_track_typed = time_track_typed.loc[time_track_typed['Domain'] != "DUP"]

else: 
    print("No double entries in Domain")
    # save empty dataframe to csv
    time_track_duplicated = pd.DataFrame(columns=time_track_typed.columns)
    time_track_duplicated.to_csv(duplicate_domain_csv)

if (time_track_typed.loc[time_track_typed['Type'] == "DUP"].size > 0): 
    print("Double entry in Type")
    print(f"\tEntries will be removed and placed into {duplicate_types_csv}")
    print("\tThe above .csv will be re-included at the beginning of the analysis script")
    
    # filter out entries where Type == DUP and put in its own csv
    if (resave_dups):
        time_track_duplicated = time_track_typed.loc[time_track_typed['Type'] == "DUP"]
        time_track_duplicated.to_csv(duplicate_types_csv)

    # remove entries where Type == DUP
    time_track_typed = time_track_typed.loc[time_track_typed['Type'] != "DUP"]
    
else: 
    print("No double entries in Entry")
    # save empty dataframe with columns to csv
    time_track_duplicated = pd.DataFrame(columns=time_track_typed.columns)
    time_track_duplicated.to_csv(duplicate_types_csv)

No double entries in Domain


Double entry in Type
	Entries will be removed and placed into ../data/double_entry_type.csv
	The above .csv will be re-included at the beginning of the analysis script


# Post-Duplication Step

In [56]:
t = input(f"Check {duplicate_domain_csv} and {duplicate_types_csv} for double entries. Press enter to continue.")

In [57]:
# ENSURE THAT ALL DUPLICATED ENTRIES HAVE BEEN TAKEN CARE OF
print(f"All entries in {duplicate_domain_csv} and {duplicate_types_csv} will be incorporated into the final analysis")

# types
try:    
    duplicate_types_df = pd.read_csv(duplicate_types_csv)
    print(f"Found {duplicate_types_csv}. Incorporating into final analysis")
except:
    print(f"Could not find {duplicate_types_csv}. Creating empty dataframe")

    # create empty dataframe with columns of clean_df  
    duplicate_types_df = pd.DataFrame(columns=time_track_typed.columns)

# domains
try: 
    duplicate_domains_df = pd.read_csv(duplicate_domain_csv)
    print(f"Found {duplicate_domain_csv}. Incorporating into final analysis")
except: 
    print(f"Could not find {duplicate_domain_csv}. Creating empty dataframe")

    # create empty dataframe with columns of clean_df  
    duplicate_domains_df = pd.DataFrame(columns=time_track_typed.columns)

All entries in ../data/double_entry_domain.csv and ../data/double_entry_type.csv will be incorporated into the final analysis
Found ../data/double_entry_type.csv. Incorporating into final analysis
Found ../data/double_entry_domain.csv. Incorporating into final analysis


In [58]:
# combine clean and duplicates frames
combined_df = pd.concat([time_track_typed, duplicate_types_df, duplicate_domains_df], ignore_index=True)

In [59]:
# replace NaNs in type and domain with filler
combined_df['Type'] = combined_df['Type'].fillna('Not Specified')
combined_df['Domain'] = combined_df['Domain'].fillna('Not Specified')

In [60]:
# load class_info.csv into info_df
class_df = pd.read_csv('../data/ClassInfo.csv')
display(class_df)

Unnamed: 0,Semester,Year,Professor,Department Code,Department Number,Class Name,Toggl Name,Credits
0,Fall,2023,Mehmet Belviranli,CSCI,442,Operating Systems,OS,3
1,Fall,2023,Dinesh Mehta,CSCI,406,Algorithms,Algorithms,3
2,Fall,2023,Kathleen Kelly,CSCI,306,Software Engineering,SWE,3
3,Fall,2023,"Lincoln Carr,Courtney Halls",HNRS,435A,Generative AI and Graphic Novels,GenAI & Art,3
4,Spring,2023,Renee Falconer,CHGN,121,Principles of Chemistry I,Chemistry 1,4
5,Spring,2023,Qi Han,CSCI,341,Computer Organization,Computer Organization,3
6,Spring,2023,Amelia Read,CSCI,403,Database Management,Database Management,3
7,Spring,2023,Melanie Brandt,HNRS,315,Explorations in Modern World,ExpModernWorld,3
8,Spring,2023,Jonathan Cullison,HASS,327,Music Technology,Music Technology,3
9,Spring,2024,Dinesh Mehta,CSCI,498D,Network Sciences,Network Sciences,3


In [61]:
# cross reference classes
combined_df = combined_df.loc[combined_df['Project'].isin(class_df['Toggl Name'])]

# remove unnamed: 0 column
if 'Unnamed: 0' in combined_df.columns:
    combined_df.drop(columns=['Unnamed: 0'], inplace=True)

display(combined_df.head())

Unnamed: 0,Client,Project,Description,Start date,Start time,End date,End time,Duration,Tags,Type,Domain
0,S2024,Social Media and Misinformation,ClickUp Scheduling,2024-01-09 00:00:00,13:52:29,2024-01-09 00:00:00,14:46:40,0 days 00:54:11,"Domain: School, Type: Implementing",Implementing,School
1,S2024,Cryptography,ClickUp Scheduling,2024-01-09 00:00:00,14:45:00,2024-01-09 00:00:00,15:07:24,0 days 00:22:24,"Domain: School, Type: Implementing",Implementing,School
2,S2024,Cryptography,Crypt Zotero,2024-01-10 00:00:00,09:35:58,2024-01-10 00:00:00,09:47:46,0 days 00:11:48,"Domain: School, Type: Implementing",Implementing,School
3,S2024,Network Sciences,Network Zotero,2024-01-10 00:00:00,14:58:06,2024-01-10 00:00:00,14:58:19,0 days 00:00:13,"Domain: School, Type: Implementing",Implementing,School
4,S2024,Network Sciences,Network Zotero,2024-01-10 00:00:00,15:00:20,2024-01-10 00:00:00,15:14:20,0 days 00:14:00,"Domain: School, Type: Implementing",Implementing,School


In [62]:
print(combined_df['Project'].unique())

['Social Media and Misinformation' 'Cryptography' 'Network Sciences'
 'Beyond CPUs' 'Practicum - My Practicum']


In [63]:
# save cleaned data to csv
combined_df.to_csv(f'../data/{SEMESTER}_cleaned.csv')