In [50]:
# Install the third-party packages this notebook relies on into the kernel's
# environment: pandas and numpy are imported below, and openpyxl is needed by
# pd.read_excel to open the .xlsx dictionary.
%pip install pandas numpy openpyxl

You should consider upgrading via the '/Users/henryhuang/3rd Year/research/brain-hci/team-text-analysis/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [51]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

In [52]:
# Input file locations (relative to the directory the notebook is run from).
# tx_file   - word-level transcription CSV, read with pd.read_csv in the next cell
# dict_file - CATA dictionary workbook, read with pd.read_excel in the next cell
tx_file = "data/CSL_Laptop_Group1_word_level_transcriptions.csv"
dict_file = "data/cata-dict.xlsx"

In [53]:
# Read the transcript CSV and the 'Marks_v2' sheet of the dictionary workbook
# into DataFrames.
transcript_df = pd.read_csv(tx_file)
dictionary_df = pd.read_excel(dict_file, sheet_name='Marks_v2')

In [54]:
# Preview the first rows of the transcript as loaded (before any preprocessing).
transcript_df.head()

Unnamed: 0,start,end,text,speaker
0,1.684,2.784,"Okay,",SPEAKER_00
1,5.506,5.967,nevermind.,SPEAKER_01
2,5.987,6.027,Do,SPEAKER_01
3,9.549,11.35,you,SPEAKER_01
4,18.235,18.675,see,SPEAKER_01


In [55]:
# Reduce the dictionary to the two columns used downstream: the entry text
# ('words') and the category label it belongs to ('diction_code').
dictionary_df = dictionary_df.loc[:, ['words', 'diction_code']]
dictionary_df.head()

Unnamed: 0,words,diction_code
0,(:,P33_IAM
1,i like*,P30_INTER
2,(should not) like,N30_INTER
3,they like*,P30_INTER
4,we like*,P30_INTER


In [56]:
# Split the dictionary into two lookups, both keyed by category code:
#   category_words[cat]    -> set of literal entries (matched by set membership)
#   category_patterns[cat] -> list of compiled regexes for entries containing '*'
category_words = defaultdict(set)
category_patterns = defaultdict(list)

for raw_word, raw_category in zip(dictionary_df['words'], dictionary_df['diction_code']):
    # Skip incomplete rows: a missing category cannot be indexed or compared,
    # and a missing word would otherwise be stored as the literal string "nan".
    if pd.isna(raw_word) or pd.isna(raw_category):
        continue

    word = str(raw_word).strip().lower()  # Normalize word
    category = str(raw_category)
    if not word or not category:
        continue

    # Skip negative diction codes (codes starting with 'N')
    if category.startswith('N'):
        continue

    if '*' in word:
        # Escape the literal pieces so characters such as '(', '+', '.' or '?'
        # are matched verbatim, turn each '*' into '.*', and anchor the end with
        # \Z so that leading/inner wildcards (e.g. '*ly') are not satisfied by a
        # mere prefix of the token when the pattern is used with .match().
        literal_parts = (re.escape(part) for part in word.split('*'))
        regex_pattern = '.*'.join(literal_parts) + r'\Z'
        category_patterns[category].append(re.compile(regex_pattern))
    else:
        category_words[category].add(word)  # Store as a normal word

# NOTE(review): multi-word entries such as "i like*" are kept as-is; they can
# only match if the text they are compared against contains the whole phrase.

In [57]:
# Sanity check: show a small sample of what was collected for every category.
PREVIEW_LIMIT = 5

# Literal entries: up to five arbitrary members of each set.
for cat_name, vocab in category_words.items():
    sample_words = list(vocab)[:PREVIEW_LIMIT]
    print(f"Category: {cat_name}, Words: {sample_words}")

# Wildcard entries: how many patterns exist, followed by the first five sources.
for cat_name, compiled in category_patterns.items():
    print(f"Category: {cat_name}, Patterns: {len(compiled)} patterns")
    for regex in compiled[:PREVIEW_LIMIT]:
        print(f"  Pattern: {regex.pattern}")

Category: P33_IAM, Words: ['kind', 'optimistic', 'cuter', 'comforting', 'commiserates']
Category: P30_INTER, Words: ['peacefully', 'amity', 'dude', 'heartening', 'lovelier']
Category: P10_TRANS, Words: ['designed', 'directions', 'depiction', 'quests', 'gauge']
Category: P100_OVERALL, Words: ['write', 'situation', 'tryin', 'major', 'equilize']
Category: P21_AMP, Words: ['producing', 'achieves', 'achievement', 'updating', 'accomplishes']
Category: P20_ACT, Words: ['avails', 'refill', 'remote', 'executed', 'react']
Category: P11_TMA, Words: ['adjustable', 'routes', 'ad-libbing', 'stage', 'subject']
Category: P13_TSF, Words: ['developes', 'deadline', 'scheme', 'elements', 'schemes']
Category: P31_ICM, Words: ['moderation', 'reconciles', 'arbitrators', 'conform', 'assents']
Category: P23_ATM, Words: ['takes-care', 'takes-over', 'aids', 'conducting', 'workload-sharing']
Category: P24_ACO, Words: ['combine', 'intermixed', 'conjoined', 'grouping', 'divides']
Category: P32_IMO, Words: ['gorgeou

In [58]:
# Preprocess transcript: keep the four columns used downstream, drop rows with
# missing values, and replace each 'text' cell by its list of lowercase tokens.
def _to_tokens(value):
    """Return the lowercase alphanumeric word tokens of one 'text' cell.

    A value that is already a list is returned unchanged, so re-running this
    cell on an already-processed frame is a no-op instead of a TypeError.
    Non-string values are converted with str() before tokenizing.
    """
    if isinstance(value, list):
        return value
    return re.findall(r'\b\w+\b', str(value).lower())


transcript_df = transcript_df[['start', 'end', 'text', 'speaker']].dropna()
transcript_df['text'] = transcript_df['text'].apply(_to_tokens)

# NOTE(review): \w+ splits on hyphens and apostrophes, so hyphenated dictionary
# entries (e.g. 'takes-care') cannot be matched by a single token.
transcript_df.head()

Unnamed: 0,start,end,text,speaker
0,1.684,2.784,[okay],SPEAKER_00
1,5.506,5.967,[nevermind],SPEAKER_01
2,5.987,6.027,[do],SPEAKER_01
3,9.549,11.35,[you],SPEAKER_01
4,18.235,18.675,[see],SPEAKER_01


In [62]:
# Sliding-window parameters (seconds, same unit as the 'start'/'end' columns).
window_size = 30    # length of each window
step_size   = 15    # distance between consecutive window starts; with a 30 s window, neighbours overlap by 15 s
max_time    = transcript_df['end'].max()  # latest 'end' value in the transcript; bounds the window starts

In [70]:
# Containers for the windowed analysis.
# Window start times: 0, step_size, 2*step_size, ... (strictly below max_time).
time_points = np.arange(0, max_time, step_size)

# One dict per (window, speaker) is appended here by the counting loop.
speaker_time_series = []

# Every category code that has at least one literal word or wildcard pattern.
all_categories = set(category_words).union(category_patterns)
all_categories

{'P100_OVERALL',
 'P10_TRANS',
 'P11_TMA',
 'P12_TGS',
 'P13_TSF',
 'P20_ACT',
 'P21_AMP',
 'P22_ASM',
 'P23_ATM',
 'P24_ACO',
 'P30_INTER',
 'P31_ICM',
 'P32_IMO',
 'P33_IAM'}

In [77]:
# Build one record per (window, speaker) with the number of dictionary hits in
# every category.

def _count_category_hits(text):
    """Return {category: hit count} for a single transcript 'text' cell.

    ``text`` may be a raw string or an already-tokenized list/tuple of words.
    Each token adds at most 1 to a category, even when it is matched by several
    entries (literal and/or wildcard) of that same category.
    """
    if isinstance(text, (list, tuple)):
        text = ' '.join(str(tok) for tok in text)
    tokens = re.findall(r'\b\w+\b', str(text).lower())

    hits = defaultdict(int)
    for token in tokens:
        for cat in all_categories:
            # .get() avoids creating empty entries in the defaultdict lookups.
            is_literal = token in category_words.get(cat, ())
            if is_literal or any(pat.match(token) for pat in category_patterns.get(cat, ())):
                hits[cat] += 1
    return dict(hits)


# Start from an empty list so that re-running this cell replaces the records
# instead of appending a second copy of every window.
speaker_time_series = []

# Tokenize and match every transcript row once; overlapping windows reuse the
# per-row result instead of repeating the regex work.
row_hits = [_count_category_hits(text) for text in transcript_df['text']]
row_starts = transcript_df['start'].to_numpy()
row_ends = transcript_df['end'].to_numpy()
row_speakers = transcript_df['speaker'].to_numpy()

for t in time_points:
    window_start = t
    window_end   = t + window_size

    # Rows overlapping this window: start < window_end and end >= window_start
    overlapping = np.flatnonzero(
        (row_starts < window_end) & (row_ends >= window_start)
    )

    # speaker_category[speaker][category] = count within this window
    speaker_category = defaultdict(lambda: defaultdict(int))
    for idx in overlapping:
        for cat, n_hits in row_hits[idx].items():
            speaker_category[row_speakers[idx]][cat] += n_hits

    # Emit a row for every speaker with at least one dictionary hit in this
    # window. Speakers who spoke but matched nothing get no row. Categories
    # without hits are filled with 0 so that all rows share the same columns.
    for speaker, cat_counts in speaker_category.items():
        row_dict = {
            'speaker': speaker,
            'window_start': window_start,
            'window_end':   window_end
        }
        for cat in all_categories:
            row_dict[cat] = cat_counts.get(cat, 0)

        speaker_time_series.append(row_dict)

In [73]:
# Turn the collected records into a frame ordered by window start, then speaker.
speaker_time_series_df = pd.DataFrame(speaker_time_series).sort_values(
    by=['window_start', 'speaker']
)
speaker_time_series_df.head()

Unnamed: 0,speaker,window_start,window_end,P100_OVERALL,P31_ICM,P21_AMP,P10_TRANS,P23_ATM,P30_INTER,P13_TSF,P32_IMO,P20_ACT,P12_TGS,P11_TMA,P24_ACO,P33_IAM,P22_ASM
0,SPEAKER_00,0.0,30.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,SPEAKER_01,0.0,30.0,1,1,0,0,0,1,0,0,1,0,0,0,0,0
2,SPEAKER_01,15.0,45.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
3,SPEAKER_01,30.0,60.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,SPEAKER_03,30.0,60.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [75]:
# Collapse the per-speaker rows into one row per window: category counts are
# summed across speakers and the contributing speakers are listed in one string.
def speaker_agg(x):
    """Join the distinct speaker labels of a group, sorted, separated by ', '."""
    return ', '.join(sorted(x.unique()))


# Every column except the grouping keys and the speaker label is a count column.
count_columns = [
    col for col in speaker_time_series_df.columns
    if col not in ('window_start', 'window_end', 'speaker')
]
agg_spec = dict.fromkeys(count_columns, 'sum')
agg_spec['speaker'] = speaker_agg

windows = speaker_time_series_df.groupby(['window_start', 'window_end'])
group_time_series_df = windows.agg(agg_spec).reset_index()

# Persist the group-level series next to the notebook.
group_time_series_df.to_csv("time_series_group_level.csv", index=False)

print("Speaker-level Time Series:")
print(speaker_time_series_df.head(10))

print("\nGroup-level Time Series:")
print(group_time_series_df.head(10))

Speaker-level Time Series:
      speaker  window_start  window_end  P100_OVERALL  P31_ICM  P21_AMP  \
0  SPEAKER_00           0.0        30.0             1        0        0   
1  SPEAKER_01           0.0        30.0             1        1        0   
2  SPEAKER_01          15.0        45.0             1        0        0   
3  SPEAKER_01          30.0        60.0             0        0        0   
4  SPEAKER_03          30.0        60.0             0        0        0   
5  SPEAKER_01          45.0        75.0             5        0        1   
6  SPEAKER_03          45.0        75.0             0        0        0   
7  SPEAKER_01          60.0        90.0             5        0        1   
8  SPEAKER_01          75.0       105.0             1        0        1   
9  SPEAKER_01          90.0       120.0             1        0        1   

   P10_TRANS  P23_ATM  P30_INTER  P13_TSF  P32_IMO  P20_ACT  P12_TGS  P11_TMA  \
0          0        0          0        0        0        0       