<a href="https://colab.research.google.com/github/j00lee/SignLingo/blob/main/Gloss_Frequency_Exploration_and_Initial_Dataset_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preliminary Dataset Selection
We want to investigate which gloss words appear in the English-gloss dataset so that we can focus our gloss-sign dataset on those words.

# Counting the Gloss Frequencies of the English-gloss Dataset

In [None]:
from collections import defaultdict
import pandas as pd

In [None]:
# Attach Google Drive so the project files under MyDrive can be read and written
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Load the English-gloss CSV
csv_path = '/content/drive/MyDrive/ASL Project/slt_data_clean.csv'
df = pd.read_csv(csv_path)

# Build the frequency dictionary: one count per whitespace-separated token
# found in the 'input' column.
gloss_freq = defaultdict(int)

# dropna()/astype(str): pandas loads an empty cell as NaN and a purely numeric
# cell as a number; neither has .strip(), so without this a single such row
# would abort the whole count.
for sentence in df['input'].dropna().astype(str):
    for word in sentence.strip().split():
        gloss_freq[word] += 1

# Turn the dictionary into a table, most frequent gloss first
gloss_freq_df = pd.DataFrame(list(gloss_freq.items()), columns=['Gloss', 'Frequency'])
gloss_freq_df = gloss_freq_df.sort_values(by='Frequency', ascending=False)

# Save into gloss_freq_exploration/, which is the folder the comparison step
# further down loads gloss_freq.csv from. Writing it anywhere else means the
# comparison only works after the file has been moved by hand.
save_path = '/content/drive/MyDrive/ASL Project/gloss_freq_exploration/gloss_freq.csv'
os.makedirs(os.path.dirname(save_path), exist_ok=True)
gloss_freq_df.to_csv(save_path, index=False)

print(f"Gloss frequency dictionary saved to {save_path} ✅")

Gloss frequency dictionary saved to /content/drive/MyDrive/ASL Project/gloss_freq.csv ✅


# Counting the Gloss Frequencies of the Gloss-sign Dataset

In [None]:
from collections import defaultdict
import pandas as pd

In [None]:
# Step 1: Attach Google Drive (the split CSVs live under MyDrive)
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Step 2: Define CSV file paths
train_path = '/content/drive/MyDrive/ASL Project/splits/train.csv'
val_path = '/content/drive/MyDrive/ASL Project/splits/val.csv'
test_path = '/content/drive/MyDrive/ASL Project/splits/test.csv'

# Step 3: Load the CSVs.
# The split files are comma-separated, so sep=',' is the right delimiter.
# The column listing printed below is the check: with the wrong separator
# every row would collapse into a single fused column.
train_df = pd.read_csv(train_path, sep=',')
val_df = pd.read_csv(val_path, sep=',')
test_df = pd.read_csv(test_path, sep=',')

# Sanity check: each split must expose a 'Gloss' column for the counting below
for split_df in (train_df, val_df, test_df):
    print(split_df.columns)

# Step 4: Function to build gloss frequency dictionary
def build_gloss_freq(df):
    """Count how many rows of ``df`` carry each gloss label.

    Args:
        df: DataFrame with a 'Gloss' column.

    Returns:
        A ``defaultdict(int)`` mapping each gloss (surrounding whitespace
        stripped, otherwise kept whole) to the number of rows it appears in.
        Rows whose gloss is missing are skipped.
    """
    gloss_freq = defaultdict(int)
    # dropna(): a blank CSV cell is loaded as NaN/None, which has no .strip()
    # and would otherwise abort the whole count.
    for gloss in df['Gloss'].dropna():
        # Keep the full gloss name, no splitting on whitespace
        gloss_freq[str(gloss).strip()] += 1
    return gloss_freq

# Step 5: One frequency dictionary per split (train, then val, then test)
train_gloss_freq, val_gloss_freq, test_gloss_freq = map(
    build_gloss_freq, (train_df, val_df, test_df)
)

# Step 6: Sum the per-split counts into a single combined dictionary
combined_gloss_freq = defaultdict(int)

for split_freq in (train_gloss_freq, val_gloss_freq, test_gloss_freq):
    for label, count in split_freq.items():
        combined_gloss_freq[label] += count

# Step 7: Save each dictionary to CSV
def save_gloss_freq(freq_dict, save_path):
    """Write a gloss -> count mapping to ``save_path`` as a CSV.

    The file has the columns 'Gloss' and 'Frequency', is ordered from the
    highest count to the lowest, and carries no index column.
    """
    rows = list(freq_dict.items())
    table = pd.DataFrame(rows, columns=['Gloss', 'Frequency'])
    table.sort_values(by='Frequency', ascending=False).to_csv(save_path, index=False)
import os

# Paths to save
train_save_path = '/content/drive/MyDrive/ASL Project/train_gloss_freq.csv'
val_save_path = '/content/drive/MyDrive/ASL Project/val_gloss_freq.csv'
test_save_path = '/content/drive/MyDrive/ASL Project/test_gloss_freq.csv'
# The combined table goes into gloss_freq_exploration/ because that is the
# folder the comparison step further down reads all_gloss_freq.csv from.
# Writing it to the project root would leave that step without its input
# unless the file were moved by hand.
combined_save_path = '/content/drive/MyDrive/ASL Project/gloss_freq_exploration/all_gloss_freq.csv'
os.makedirs(os.path.dirname(combined_save_path), exist_ok=True)

# Save them
save_gloss_freq(train_gloss_freq, train_save_path)
save_gloss_freq(val_gloss_freq, val_save_path)
save_gloss_freq(test_gloss_freq, test_save_path)
save_gloss_freq(combined_gloss_freq, combined_save_path)

print("✅ Saved gloss frequencies for train, val, test, and combined datasets.")

Index(['Participant ID', 'Video file', 'Gloss', 'ASL-LEX Code'], dtype='object')
Index(['Participant ID', 'Video file', 'Gloss', 'ASL-LEX Code'], dtype='object')
Index(['Participant ID', 'Video file', 'Gloss', 'ASL-LEX Code'], dtype='object')
✅ Saved gloss frequencies for train, val, test, and combined datasets.


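# Compare the Gloss Vocabularies
We compare which glosses occur in each dataset (set membership, not counts), after removing trailing digits from the sign-video glosses.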

In [None]:
import pandas as pd
import re

# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Load both gloss frequency files
gloss_freq_path = '/content/drive/MyDrive/ASL Project/gloss_freq_exploration/gloss_freq.csv'
all_gloss_freq_path = '/content/drive/MyDrive/ASL Project/gloss_freq_exploration/all_gloss_freq.csv'

# keep_default_na=False + dtype=str on 'Gloss': by default read_csv converts
# cells spelled like "NA", "NULL" or "NaN" into float NaN. A gloss with such a
# spelling would then stop being a string, which breaks the regex cleaning and
# the sorted() export that follow. Loading the column as plain text keeps
# every gloss exactly as it was written.
gloss_df = pd.read_csv(gloss_freq_path, dtype={'Gloss': str}, keep_default_na=False)
all_gloss_df = pd.read_csv(all_gloss_freq_path, dtype={'Gloss': str}, keep_default_na=False)

# Step 3: Prepare gloss sets
gloss_set = set(gloss_df['Gloss'])

# Process all_gloss_df to remove trailing digits
def clean_gloss(gloss):
    """Return ``gloss`` without the run of digits at its end, if any.

    Example: 'BOOK2' -> 'BOOK'. Digits that are not at the end are kept.
    """
    trailing_digits = re.compile(r'\d+$')
    return trailing_digits.sub('', gloss)

# Strip trailing digits from every sign-video gloss, then collect the
# distinct cleaned names
all_gloss_df['Cleaned Gloss'] = all_gloss_df['Gloss'].map(clean_gloss)
all_gloss_set = set(all_gloss_df['Cleaned Gloss'])

# Step 4: Compare sets
common_glosses = gloss_set & all_gloss_set      # present in both files
only_in_gloss_freq = gloss_set - all_gloss_set  # gloss_freq.csv only
only_in_all_gloss = all_gloss_set - gloss_set   # all_gloss_freq.csv only

# Step 5: Report the size of each set
print(f"Total glosses in English-gloss dataset: {len(gloss_set)}")
print(f"Total glosses in sign-video dataset: {len(all_gloss_set)}")
print(f"Common glosses: {len(common_glosses)}")
print(f"Glosses only in English-gloss dataset: {len(only_in_gloss_freq)}")
print(f"Glosses only in sign-video dataset: {len(only_in_all_gloss)}")

# (Optional) Show up to ten examples from each one-sided set
print("\nSome glosses only in English-gloss dataset:")
print(list(only_in_gloss_freq)[:10])

print("\nSome glosses only in sign-video dataset:")
print(list(only_in_all_gloss)[:10])


Mounted at /content/drive
Total glosses in English-gloss dataset: 4690
Total glosses in sign-video dataset: 2301
Common glosses: 1216
Glosses only in English-gloss dataset: 3474
Glosses only in sign-video dataset: 1085

Some glosses only in English-gloss dataset:
['SHE-MEET-I', 'T-T-Y', 'FIFTY', 'E-L-F', 'THANKSGIVING++', 'SUPER', 'FIRE++', 'THINK-SAME', 'LB', 'COME-TO']

Some glosses only in sign-video dataset:
['TOBACCO', 'WEWILLSEE', 'ICECREAM', 'VALIDATE', 'KING', 'CAMEL', 'STICKY', 'FROMTHENON', 'PHILOSOPHY', 'DONTMIND']


In [None]:
# === Step 6: Save to TXT files
output_folder = '/content/drive/MyDrive/ASL Project/gloss_freq_exploration/'

# Each set is written alphabetically, one gloss per line
txt_outputs = (
    ('only_in_gloss_freq.txt', only_in_gloss_freq),  # only in English-gloss dataset
    ('only_in_all_gloss.txt', only_in_all_gloss),    # only in sign-video dataset (after cleaning)
    ('common_glosses.txt', common_glosses),          # shared by both
)

for filename, glosses in txt_outputs:
    with open(output_folder + filename, 'w') as f:
        f.writelines(gloss + '\n' for gloss in sorted(glosses))

# Selecting Our Dataset
We keep only the sign videos whose gloss (after removing trailing digits) is one of the 1216 glosses shared by both datasets.

In [None]:
import pandas as pd
import re
import os

# === Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# === Step 2: Define Paths
base_folder = '/content/drive/MyDrive/ASL Project/'
splits_folder = base_folder + 'splits/'
filtered_folder = base_folder + 'filtered splits/'
# This must name the file the TXT save step actually writes:
# common_glosses.txt, one gloss per line with no header row (so there is no
# 'Gloss' column for read_csv to find).
common_glosses_path = base_folder + 'gloss_freq_exploration/common_glosses.txt'

# Create the 'filtered splits' folder if it doesn't exist
os.makedirs(filtered_folder, exist_ok=True)

train_path = splits_folder + 'train.csv'
val_path = splits_folder + 'val.csv'
test_path = splits_folder + 'test.csv'

# === Step 3: Load common glosses
# Plain line reading (not read_csv) so that no gloss is reinterpreted as a
# missing value or split on a delimiter; blank lines are ignored.
with open(common_glosses_path) as gloss_file:
    common_glosses_set = {line.strip() for line in gloss_file if line.strip()}

# Same data as a one-column table, kept for anything that expects the frame
common_glosses_df = pd.DataFrame({'Gloss': sorted(common_glosses_set)})

# === Step 4: Function to clean gloss (remove trailing digits)
def clean_gloss(gloss):
    """Return ``gloss`` without the run of digits at its end, if any.

    Example: 'BOOK2' -> 'BOOK'. Digits that are not at the end are kept.
    """
    trailing_digits = re.compile(r'\d+$')
    return trailing_digits.sub('', gloss)

# === Step 5: Function to filter a dataset
def filter_dataset(df):
    """Keep only the rows of ``df`` whose gloss is one of the common glosses.

    A row is kept when its 'Gloss' value, with trailing digits removed by
    ``clean_gloss``, is a member of ``common_glosses_set``.

    Args:
        df: DataFrame with a 'Gloss' column. It is not modified.

    Returns:
        A new DataFrame (a copy) with the matching rows, the original columns
        and the original index labels.
    """
    # Compute the cleaned glosses as a standalone Series instead of a
    # temporary column, so the caller's frame never gains a 'Cleaned Gloss'
    # column as a side effect.
    cleaned_glosses = df['Gloss'].apply(clean_gloss)

    # Boolean mask aligned with df's index: True where the gloss is shared
    keep_mask = cleaned_glosses.isin(common_glosses_set)

    return df[keep_mask].copy()

# === Step 6: Load datasets (train, val, test — in that order)
train_df, val_df, test_df = (
    pd.read_csv(split_path) for split_path in (train_path, val_path, test_path)
)

# === Step 7: Filter datasets down to the common glosses
train_filtered, val_filtered, test_filtered = map(
    filter_dataset, (train_df, val_df, test_df)
)

# === Step 8: Save filtered datasets into the 'filtered splits' folder
filtered_outputs = (
    ('train_filtered.csv', train_filtered),
    ('val_filtered.csv', val_filtered),
    ('test_filtered.csv', test_filtered),
)
for out_name, filtered_split in filtered_outputs:
    filtered_split.to_csv(filtered_folder + out_name, index=False)

# === Step 9: Count number of videos (one row per video)
print("✅ Filtered datasets saved!\n")
print("📊 Number of videos after filtering:")
print(f"Train: {len(train_filtered)} videos")
print(f"Val: {len(val_filtered)} videos")
print(f"Test: {len(test_filtered)} videos")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Filtered datasets saved!

📊 Number of videos after filtering:
Train: 21240 videos
Val: 5446 videos
Test: 17639 videos
