In [None]:
%pip install pandas numpy openpyxl

In [16]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

In [3]:
# Load transcription data and CATA dictionary
tx_file = "data/CSL_Laptop_Group1_word_level_transcriptions.csv"
dict_file = "data/cata-dict.xlsx"

In [18]:
transcription_df = pd.read_csv(tx_file)
dictionary_df = pd.read_excel(dict_file, sheet_name='Marks_v2')

In [9]:
transcription_df.head()

Unnamed: 0,start,end,text,speaker
0,1.684,2.784,"Okay,",SPEAKER_00
1,5.506,5.967,nevermind.,SPEAKER_01
2,5.987,6.027,Do,SPEAKER_01
3,9.549,11.35,you,SPEAKER_01
4,18.235,18.675,see,SPEAKER_01


In [19]:
# only save the words and diction_code columns
dictionary_df = dictionary_df[['words', 'diction_code']]
dictionary_df.head()

Unnamed: 0,words,diction_code
0,(:,P33_IAM
1,i like*,P30_INTER
2,(should not) like,N30_INTER
3,they like*,P30_INTER
4,we like*,P30_INTER


In [23]:
# Extract category-wise words, handling wildcards and context-based words
category_words = defaultdict(set)  # Stores direct words (key: category, value: set of words)
category_patterns = defaultdict(list)  # Stores regex patterns and context-based words

for _, row in dictionary_df.iterrows():
    word = str(row['words']).strip().lower()  # Normalize word
    category = row['diction_code']
    
    # Skip negative diction codes
    if category[0]== 'N':
        continue

    # Handle wildcard "*"
    if '*' in word:
        regex_pattern = re.sub(r'\*', r'.*', word)  # Convert * to regex pattern
        category_patterns[category].append(re.compile(regex_pattern))
    else:
        category_words[category].add(word)  # Store as a normal word

In [25]:
# View the categories and their words
for category, words in category_words.items():
    print(f"Category: {category}, Words: {list(words)[:5]}")  # Display first 5 words for brevity

for category, patterns in category_patterns.items():
    print(f"Category: {category}, Patterns: {len(patterns)} patterns")
    for pattern in patterns[:5]:
        print(f"  Pattern: {pattern.pattern}")

Category: P33_IAM, Words: ['kind', 'optimistic', 'cuter', 'comforting', 'commiserates']
Category: P30_INTER, Words: ['peacefully', 'amity', 'dude', 'heartening', 'lovelier']
Category: P10_TRANS, Words: ['designed', 'directions', 'depiction', 'quests', 'gauge']
Category: P100_OVERALL, Words: ['write', 'situation', 'tryin', 'major', 'equilize']
Category: P21_AMP, Words: ['producing', 'achieves', 'achievement', 'updating', 'accomplishes']
Category: P20_ACT, Words: ['avails', 'refill', 'remote', 'executed', 'react']
Category: P11_TMA, Words: ['adjustable', 'routes', 'ad-libbing', 'stage', 'subject']
Category: P13_TSF, Words: ['developes', 'deadline', 'scheme', 'elements', 'schemes']
Category: P31_ICM, Words: ['moderation', 'reconciles', 'arbitrators', 'conform', 'assents']
Category: P23_ATM, Words: ['takes-care', 'takes-over', 'aids', 'conducting', 'workload-sharing']
Category: P24_ACO, Words: ['combine', 'intermixed', 'conjoined', 'grouping', 'divides']
Category: P32_IMO, Words: ['gorgeou