# Code-Switching Predictability Analysis Pipeline


## Setup


In [1]:
!git clone https://github.com/jacobposchl/bison-word-predictability.git
!cd bison-word-predictability && pip install -r requirements.txt


Cloning into 'bison-word-predictability'...
remote: Enumerating objects: 875, done.[K
remote: Counting objects: 100% (100/100), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 875 (delta 56), reused 72 (delta 40), pack-reused 775 (from 1)[K
Receiving objects: 100% (875/875), 8.06 MiB | 18.17 MiB/s, done.
Resolving deltas: 100% (500/500), done.
Collecting pympi-ling>=1.71 (from -r requirements.txt (line 1))
  Downloading pympi_ling-1.71-py3-none-any.whl.metadata (3.0 kB)
Collecting pycantonese>=3.0.0 (from -r requirements.txt (line 2))
  Downloading pycantonese-3.4.0-py3-none-any.whl.metadata (6.8 kB)
Collecting python-Levenshtein>=0.21.0 (from -r requirements.txt (line 8))
  Downloading python_levenshtein-0.27.3-py3-none-any.whl.metadata (3.9 kB)
Collecting transformers<4.57,>=4.56.2 (from -r requirements.txt (line 13))
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40

In [2]:
from google.colab import files
import shutil
from pathlib import Path


In [3]:
import os

# Work from inside the cloned repository so the relative `scripts/...` and
# `results/...` paths used by the cells below resolve.
REPO_DIR = 'bison-word-predictability'

# Only change directory when we are not already inside the checkout; an
# unconditional chdir raises FileNotFoundError the second time the cell is run.
if os.path.basename(os.getcwd()) != REPO_DIR:
    os.chdir(REPO_DIR)


Import any files already processed...

## 1. Preprocessing


In [None]:
# Run the preprocessing stage as a shell command (path is relative to the current
# working directory).
# NOTE(review): the error hint printed by the matching script further down refers to
# scripts/preprocessing/preprocess.py — confirm which path actually exists in the repo.
!python scripts/preprocess/preprocess.py


In [None]:
# Bundle the preprocessing outputs into a zip archive and send it to the browser.
preprocessing_zip = 'preprocessing_results.zip'
preprocessing_dir = 'results/preprocessing'
if Path(preprocessing_dir).exists():
    # make_archive appends ".zip" itself, so the base name is given without a suffix.
    shutil.make_archive('preprocessing_results', 'zip', preprocessing_dir)
    files.download(preprocessing_zip)
    print(f"Downloaded {preprocessing_zip}")
else:
    # Report the skip explicitly; a silent no-op hides a failed or missing stage.
    print(f"Nothing to download: {preprocessing_dir} does not exist (did preprocessing run successfully?)")

## 2. Matching


In [4]:
# Run the POS window matching stage as a shell command.
# Per the script's own error output, it expects
# results/preprocessing/cantonese_translated_WITHOUT_fillers.csv to exist, so the
# preprocessing results must be in place before this cell is executed.
!python scripts/matching/matching.py


INFO: Starting POS window matching analysis...
ERROR: File not found: Translated sentences CSV not found: results/preprocessing/cantonese_translated_WITHOUT_fillers.csv
ERROR: Please ensure preprocessing has been run first
ERROR: Run: python scripts/preprocessing/preprocess.py


In [None]:
# Bundle the matching outputs into a zip archive and send it to the browser.
matching_zip = 'matching_results.zip'
matching_dir = 'results/matching'
if Path(matching_dir).exists():
    # make_archive appends ".zip" itself, so the base name is given without a suffix.
    shutil.make_archive('matching_results', 'zip', matching_dir)
    files.download(matching_zip)
    print(f"Downloaded {matching_zip}")
else:
    # Report the skip explicitly; a silent no-op hides a failed or missing stage.
    print(f"Nothing to download: {matching_dir} does not exist (did matching run successfully?)")


## 3. Surprisal Analysis


In [None]:
# Run the surprisal stage as a shell command with `--model masked`.
# NOTE(review): presumably this consumes the matching outputs and selects a masked
# language model — confirm against scripts/surprisal/surprisal.py.
!python scripts/surprisal/surprisal.py --model masked


In [None]:
# Bundle the surprisal outputs into a zip archive and send it to the browser.
surprisal_zip = 'surprisal_results.zip'
surprisal_dir = 'results/surprisal'
if Path(surprisal_dir).exists():
    # make_archive appends ".zip" itself, so the base name is given without a suffix.
    shutil.make_archive('surprisal_results', 'zip', surprisal_dir)
    files.download(surprisal_zip)
    print(f"Downloaded {surprisal_zip}")
else:
    # Report the skip explicitly; a silent no-op hides a failed or missing stage.
    print(f"Nothing to download: {surprisal_dir} does not exist (did the surprisal step run successfully?)")

In [None]:
# Run the surprisal stage as a shell command with `--model autoregressive`.
# NOTE(review): presumably this consumes the matching outputs and selects an
# autoregressive language model — confirm against scripts/surprisal/surprisal.py.
!python scripts/surprisal/surprisal.py --model autoregressive


In [None]:
# Re-bundle the surprisal outputs and send the archive to the browser.
# The archive name is the same as in the earlier surprisal download cell, so the
# zip file on disk is rebuilt from whatever results/surprisal contains now.
surprisal_zip = 'surprisal_results.zip'
surprisal_dir = 'results/surprisal'
if Path(surprisal_dir).exists():
    # make_archive appends ".zip" itself, so the base name is given without a suffix.
    shutil.make_archive('surprisal_results', 'zip', surprisal_dir)
    files.download(surprisal_zip)
    print(f"Downloaded {surprisal_zip}")
else:
    # Report the skip explicitly; a silent no-op hides a failed or missing stage.
    print(f"Nothing to download: {surprisal_dir} does not exist (did the surprisal step run successfully?)")

## 4. Regression Analysis


In [None]:
# Run the regression stage as a shell command with `--model masked`.
# NOTE(review): presumably this analyses the surprisal values produced by the
# masked-model run — confirm against scripts/regression/regression.py.
!python scripts/regression/regression.py --model masked


In [None]:
# Bundle the regression outputs into a zip archive and send it to the browser.
regression_zip = 'regression_results.zip'
regression_dir = 'results/regression'
if Path(regression_dir).exists():
    # make_archive appends ".zip" itself, so the base name is given without a suffix.
    shutil.make_archive('regression_results', 'zip', regression_dir)
    files.download(regression_zip)
    print(f"Downloaded {regression_zip}")
else:
    # Report the skip explicitly; a silent no-op hides a failed or missing stage.
    print(f"Nothing to download: {regression_dir} does not exist (did the regression step run successfully?)")

In [None]:
# Run the regression stage as a shell command with `--model autoregressive`.
# NOTE(review): presumably this analyses the surprisal values produced by the
# autoregressive-model run — confirm against scripts/regression/regression.py.
!python scripts/regression/regression.py --model autoregressive


In [None]:
# Re-bundle the regression outputs and send the archive to the browser.
# The archive name is the same as in the earlier regression download cell, so the
# zip file on disk is rebuilt from whatever results/regression contains now.
regression_zip = 'regression_results.zip'
regression_dir = 'results/regression'
if Path(regression_dir).exists():
    # make_archive appends ".zip" itself, so the base name is given without a suffix.
    shutil.make_archive('regression_results', 'zip', regression_dir)
    files.download(regression_zip)
    print(f"Downloaded {regression_zip}")
else:
    # Report the skip explicitly; a silent no-op hides a failed or missing stage.
    print(f"Nothing to download: {regression_dir} does not exist (did the regression step run successfully?)")