# Data Loading, etc

## AAPI Set

The set was created from the census: all of the ethnicities recorded in 2020, plus added hierarchical taxonomy terms.

In [1]:
# Load the Colab Drive helper and mount Google Drive at /content/drive so the
# project files referenced below (under /content/drive/MyDrive/...) can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!ls /content/drive/MyDrive/Cultural-Analytics

aapiGroups.pkl		       mixed.000000006.jsonl.gz
Ethnicities_Kaiona_Cult.ipynb  mixed.000000007.jsonl.gz
kevin_sandbox.ipynb	       mixed.000000008.jsonl.gz
mixed.000000000.jsonl.gz       mixed.000000009.jsonl.gz
mixed.000000001.jsonl.gz       mixed.000000010.jsonl.gz
mixed.000000002.jsonl.gz       notebookEDA.ipynb
mixed.000000003.jsonl.gz       notebook_kaiona.ipynb
mixed.000000004.jsonl.gz       Untitled
mixed.000000005.jsonl.gz


Getting my Asian keyword set (made in another notebook).

In [3]:
# Load the pickled AAPI keyword collection and turn it into a set.
import pickle

file_path = '/content/drive/MyDrive/Cultural-Analytics/aapiGroups.pkl'

try:
    # SECURITY: pickle.load can execute arbitrary code stored in the file.
    # Only load pickles that you produced yourself; never one from an
    # untrusted source.
    with open(file_path, 'rb') as f:
        loaded_data = pickle.load(f)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found")
    # Stop here: without the file `aapi_groups_set` is never defined, and every
    # later cell that uses it would fail with a much less helpful NameError.
    raise
else:
    aapi_groups_set = set(loaded_data)
    print(f"the set has {len(aapi_groups_set)} elements")

the set has 66 elements


In [4]:
list(aapi_groups_set)[:5]

['melanesian', 'south asian', 'pakistani', 'southeast-asian', 'indonesian']

Processed via Dolma; 1M sample of the web data.

In [6]:
import pandas as pd
import os

directory_path = '/content/drive/MyDrive/Cultural-Analytics/'

# Only files whose name ends with this suffix are loaded. It is narrower than
# '.jsonl.gz' on purpose or by accident: a name such as 'mixed.000000001.jsonl.gz'
# does not end with it. Widen the suffix to '.jsonl.gz' to load every shard.
shard_suffix = '00000.jsonl.gz'

# holds one DataFrame per loaded shard
all_dfs = []

# os.listdir returns names in arbitrary order; sort so the row order of the
# concatenated frame is the same on every run.
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith(shard_suffix):
        continue
    file_path = os.path.join(directory_path, filename)
    try:
        # each line of the gzipped file is one JSON record
        df = pd.read_json(file_path, compression='gzip', lines=True)
        all_dfs.append(df)
        print(f"Successfully loaded '{filename}' with {len(df)} rows.")
    except FileNotFoundError:
        print(f"Error: The file '{filename}' was not found. Skipping.")
    except Exception as e:
        # best effort: report the shard that failed and keep going
        print(f"An error occurred while loading '{filename}': {e}")

# concatenate the loaded shards into one frame
if all_dfs:
    df_c4_sample = pd.concat(all_dfs, ignore_index=True)
    # the messages name the variable that is actually created (df_c4_sample)
    print(f"\nSuccessfully concatenated all files into df_c4_sample with {len(df_c4_sample)} rows and {len(df_c4_sample.columns)} columns.")
    print("Head of df_c4_sample:")
    display(df_c4_sample.head())
else:
    print("No .jsonl.gz files were found or loaded.")

Successfully loaded 'mixed.000000000.jsonl.gz' with 50000 rows.

Successfully concatenated all files into df_c4_full with 50000 rows and 5 columns.
Head of df_c4_full:


Unnamed: 0,id,text,source,aapi_score,aapi_spans
0,46527e7290971649da9c3c7f4b06973a,membership fees and with many users renewing t...,c4,1,"[{'start': 0, 'end': 1888, 'type': 'aapi_keywo..."
1,02b2ce66b5fcefb1855519e62934e7f0,Jomsom Muktinath Trek is an exciting trek in t...,c4,1,"[{'start': 0, 'end': 1442, 'type': 'aapi_keywo..."
2,314c7898da0aa284c75aeb268e16a78a,So many things happening this weekend!\nThe hi...,c4,1,"[{'start': 0, 'end': 699, 'type': 'aapi_keywor..."
3,c8788690c723fc64bbbeabdcac706082,Sellers who are non-residents (eg. foreigners ...,c4,1,"[{'start': 0, 'end': 1250, 'type': 'aapi_keywo..."
4,3ade6ffb1c5a72436505305600bdf87f,Aman Ullah is with Ro Nay San Lwin and 93 othe...,c4,2,"[{'start': 0, 'end': 7166, 'type': 'aapi_keywo..."


In [None]:
import gzip
import json
import spacy
from tqdm.auto import tqdm

nlp = spacy.load("en_core_web_sm")

input_path = "/content/drive/MyDrive/Cultural-Analytics/mixed.000000000.jsonl.gz"
output_path = "tokens.jsonl"

# Stream the gzipped JSONL shard and write one {"tokens": [...]} record per
# document, so the whole shard never has to sit in memory.
with gzip.open(input_path, "rt", encoding="utf-8") as f_in, \
        open(output_path, "w", encoding="utf-8") as f_out:

    # stream texts; skip blank lines, which json.loads would reject
    texts = (json.loads(line)["text"] for line in f_in if line.strip())

    # Only token texts are written, so run just the tokenizer instead of the
    # full pipeline (tagger/parser/NER output would be thrown away).
    for doc in tqdm(nlp.tokenizer.pipe(texts, batch_size=100), desc="tokenizing"):

        tokens = [t.text for t in doc]

        f_out.write(json.dumps({"tokens": tokens}) + "\n")


tokenizing: 0it [00:00, ?it/s]

In [8]:
# small subset of data that I have obtained
df_c4_sample

Unnamed: 0,id,text,source,aapi_score,aapi_spans
0,46527e7290971649da9c3c7f4b06973a,membership fees and with many users renewing t...,c4,1,"[{'start': 0, 'end': 1888, 'type': 'aapi_keywo..."
1,02b2ce66b5fcefb1855519e62934e7f0,Jomsom Muktinath Trek is an exciting trek in t...,c4,1,"[{'start': 0, 'end': 1442, 'type': 'aapi_keywo..."
2,314c7898da0aa284c75aeb268e16a78a,So many things happening this weekend!\nThe hi...,c4,1,"[{'start': 0, 'end': 699, 'type': 'aapi_keywor..."
3,c8788690c723fc64bbbeabdcac706082,Sellers who are non-residents (eg. foreigners ...,c4,1,"[{'start': 0, 'end': 1250, 'type': 'aapi_keywo..."
4,3ade6ffb1c5a72436505305600bdf87f,Aman Ullah is with Ro Nay San Lwin and 93 othe...,c4,2,"[{'start': 0, 'end': 7166, 'type': 'aapi_keywo..."
...,...,...,...,...,...
49995,0f4c61bc59e2e53292517b4de2a945b2,"Srinagar, Mar 31: Call it entertainment or bro...",c4,1,"[{'start': 0, 'end': 2478, 'type': 'aapi_keywo..."
49996,2d49dcacf1ef62e28faf84a19ba32cdf,Prof. K. P. Hewagamage servers as a senior lec...,c4,1,"[{'start': 0, 'end': 8366, 'type': 'aapi_keywo..."
49997,9e4f36a44d8b9d87fad12ab0afa8bcf5,Boundaries among the three tiers are further d...,c4,1,"[{'start': 0, 'end': 4185, 'type': 'aapi_keywo..."
49998,5508f8a7ca45b555e71e42cd7bea8f29,Yuki Kihara's 2016 work Der Papālagi (The Whit...,c4,1,"[{'start': 0, 'end': 1161, 'type': 'aapi_keywo..."


In [9]:
print(df_c4_sample['text'].iloc[3000])

exit stage of funding in 2016.
In the Asia-Pacific region, Fintech investments were concentrated in Australia, China and Singapore, and in 2015, skyrocketed to reach US$3.46b - a four-fold increase from 2014 to 2015. Asian Fintechs garnered strong support, with funding growing by 413%, whilst North American funding grew by 59%. While China has been the best performer so far, Australia is poised for exponential growth.
Frost & Sullivan’s latest study, Fintech in Australia – Trends, Forecasts and Analysis 2015 – 2020 forecasts that the Australian Fintech Sector will grow at a CAGR of 76.36% and reach A$4.2 billion by 2020; of which A$1 billion will be completely new added value to the Australian economy. In 2015, the total market size of the Australian Fintech Sector was estimated at A$247.2m. Frost & Sullivan anticipates sharp growth in the Fintech market in 2016 and 2017, followed by steady increases through to 2020.
In 2015, investments in the Australian Fintech market totalled A$438 

cool, data is obtained

# Tokenize

In [10]:
import math
import operator

from collections import Counter

import spacy
from tqdm import tqdm

In [11]:
nlp = spacy.load('en_core_web_sm')

Creating full tokens for my ethnicities, i.e. keeping "south asian" as one whole token instead of it being cut off as "south".

In [12]:
# Register every term in aapi_groups_set as a tokenizer special case whose only
# piece (ORTH) is the full term, so the term is meant to stay a single token.
# NOTE(review): special cases are presumably matched on the exact string, so a
# lowercase term may not cover capitalised text ("South Asian") - TODO confirm.
# NOTE(review): verify that terms containing a space are honoured by the
# tokenizer in the installed spaCy version - TODO confirm.
for token_text in aapi_groups_set:
    nlp.tokenizer.add_special_case(token_text, [{'ORTH': token_text}])

# Reports the number of terms registered (the size of the set).
print(f"Added {len(aapi_groups_set)} special cases to the tokenizer.")

Added 66 special cases to the tokenizer.


In [13]:
# sample for tesing
tiny_df = df_c4_sample.iloc[:200]

In [14]:
# Builds a spaCy pipe over the text column (batch size 50).
# NOTE(review): nlp.pipe is presumably lazy, so nothing is processed here, and
# token_vals is not read by any cell visible in this part of the notebook -
# confirm whether this cell is still needed.
token_vals = nlp.pipe(df_c4_sample['text'], batch_size=50)

In [15]:
# Run the spaCy pipeline over every text and store the results in a new
# 'spacy_doc' column; tqdm wraps the stream so progress is shown while pandas
# consumes it.
doc_stream = nlp.pipe(df_c4_sample['text'], batch_size=90)
doc_progress = tqdm(doc_stream, total=len(df_c4_sample))
df_c4_sample['spacy_doc'] = doc_progress

print(df_c4_sample.head())

  0%|          | 180/50000 [00:40<3:06:20,  4.46it/s]


KeyboardInterrupt: 

In [None]:
# Count how often each AAPI term shows up as a named entity. Entity text is
# lowercased first so the membership test against aapi_groups_set is
# case-insensitive.
aapi_entity_counts = Counter(
    entity_name
    for parsed in tqdm(df_c4_sample['spacy_doc'], desc="Counting AAPI entities")
    for entity_name in (span.text.lower() for span in parsed.ents)
    if entity_name in aapi_groups_set
)

# Show the six most frequent terms.
print("top AAPI entity counts:")
for entity, count in aapi_entity_counts.most_common(6):
    print(f"{entity}: {count}")

In [None]:
# Qualitative quick glance: for each AAPI term, grab the first text in the
# sample that mentions it.
import re

# Pull the column into a list once instead of re-walking the Series (and
# re-lowercasing every text) for each of the terms.
sample_texts = df_c4_sample['text'].tolist()

example_sentences = {}

for ethnicity in aapi_groups_set:
    # Case-insensitive whole-term match. The lookarounds stop a term from
    # matching inside a longer word, which a plain substring test would allow.
    term_pattern = re.compile(
        r"(?<!\w)" + re.escape(ethnicity) + r"(?!\w)",
        re.IGNORECASE,
    )
    # first matching text, or a placeholder when the term never appears
    example_sentences[ethnicity] = next(
        (text for text in sample_texts if term_pattern.search(text)),
        "no example in data",
    )

print("example sentences for each AAPI ethnicity group:")
for ethnicity, sentence in example_sentences.items():
    print(f"\n{ethnicity.capitalize()}: {sentence[:100]}...") # truncate for display if very long

# Log-odds

Creating a log odds for processing

In [None]:
def logodds(counter1, counter2, display=25):
    """calcualte log odds"""
    vocab=dict(counter1)
    vocab.update(dict(counter2))
    count1_sum=sum(counter1.values())
    count2_sum=sum(counter2.values())


    #smoothing

    ranks={}
    alpha=0.01
    alphaV=len(vocab)*alpha

    for word in vocab:

        log_odds_ratio=math.log( (counter1[word] + alpha) / (count1_sum+alphaV-counter1[word]-alpha) ) - math.log( (counter2[word] + alpha) / (count2_sum+alphaV-counter2[word]-alpha) )
        variance=1./(counter1[word] + alpha) + 1./(counter2[word] + alpha)

        ranks[word]=log_odds_ratio/math.sqrt(variance)

    sorted_x = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)

    print("Most category 1:")
    for k,v in sorted_x[:display]:
        print("%.3f\t%s" % (v,k))

    print("\nMost category 2:")
    for k,v in reversed(sorted_x[-display:]):
        print("%.3f\t%s" % (v,k))

# Syntactic relations

In [None]:
from IPython.utils.tempdir import TemporaryWorkingDirectory
from collections import Counter, defaultdict

## Verb subject content

In [None]:
def aapi_subject_verbs(texts, aapi_groups_set):
    """Collect, per AAPI term, the verb lemmas syntactically attached to it.

    A token is considered only when its entity type is "NORP" and its
    lowercased text is in ``aapi_groups_set``. For each such token two things
    are counted:
      * the lemma of its head, when the head is a verb with one of the Penn
        verb tags (VB, VBD, VBG, VBN, VBP, VBZ);
      * the lemma of every direct child that is a verb.

    Args:
        texts: Iterable of parsed documents (each an iterable of tokens).
        aapi_groups_set: Lowercase group terms to look for.

    Returns:
        dict mapping each matched term to a Counter of verb lemmas. A term that
        matched but had no attached verbs maps to an empty Counter.
    """
    verb_tags = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}
    verbs_by_group = {}

    for parsed in texts:
        for tok in parsed:
            term = tok.text.lower()

            # keep only tokens tagged as a group entity AND in our term list
            if not (tok.ent_type_ == "NORP" and term in aapi_groups_set):
                continue

            verb_counts = verbs_by_group.setdefault(term, Counter())

            # (A) the verb governing the group term
            governor = tok.head
            if governor.pos_ == "VERB" and governor.tag_ in verb_tags:
                verb_counts[governor.lemma_] += 1

            # (B) verbs hanging directly off the group term
            verb_counts.update(
                dependent.lemma_
                for dependent in tok.children
                if dependent.pos_ == "VERB"
            )

    return verbs_by_group

Small log-odds test with Chinese and Japanese

In [None]:
japanese_verbs = aapi_subject_verbs(df_c4_sample['spacy_doc'], {'japanese'})
print("japanese verbs counter:", japanese_verbs['japanese'])

In [None]:
chinese_verbs = aapi_subject_verbs(df_c4_sample['spacy_doc'], {'chinese'})
print("chinese verbs counter:", chinese_verbs['chinese'])

In [None]:
print("\n--- Log-odds for Verbs: Japanese vs. Chinese ---")
logodds(japanese_verbs['japanese'], chinese_verbs['chinese'], display=5)

In [None]:
all_ethnicity_verbs = aapi_subject_verbs(df_c4_sample['spacy_doc'], aapi_groups_set)

## Adjectives

In [None]:
def aapi_subject_adjs(texts, aapi_groups_set):
    """Collect, per AAPI term, the adjectives that describe it.

    A token is considered only when its entity type is "NORP" and its
    lowercased text is in ``aapi_groups_set``. Two kinds of adjectives are
    counted (by lowercased lemma):
      * (A) direct ``amod`` children of the term;
      * (B) ``acomp`` / ``attr`` adjective children of the term's head, when
        the term is that head's ``nsubj`` / ``nsubjpass`` and the head is an
        AUX or VERB.
    Adjectives containing a compass/region fragment (north, south, east, west,
    central) are skipped in both cases.

    Args:
        texts: Iterable of parsed documents (each an iterable of tokens).
        aapi_groups_set: Lowercase group terms to look for.

    Returns:
        dict mapping each matched term to a Counter of adjective lemmas. A term
        that matched but had no adjectives maps to an empty Counter.
    """
    # fragments that mark a geographic qualifier rather than a description
    stop_adj = ("north", "south", "east", "west", "central")

    def _is_directional(adjective):
        """True when the adjective contains any geographic fragment."""
        return any(fragment in adjective for fragment in stop_adj)

    ethnicity_dict = {}

    for doc in texts:
        for token in doc:
            subject_lower = token.text.lower()

            if token.ent_type_ != "NORP" or subject_lower not in aapi_groups_set:
                continue

            adj_counts = ethnicity_dict.setdefault(subject_lower, Counter())

            # (A) Adjectives that directly modify the term (amod)
            # e.g. "hardworking Asian workers", "talented Japanese engineers"
            for child in token.children:
                if child.pos_ == "ADJ" and child.dep_ == "amod":
                    adjective = child.lemma_.lower()
                    if not _is_directional(adjective):
                        adj_counts[adjective] += 1

            # (B) Predicate adjectives reached through the head
            # e.g. "Asian students are hardworking"
            head = token.head
            if head.pos_ in {"AUX", "VERB"} and token.dep_ in {"nsubj", "nsubjpass"}:
                for sibling in head.children:
                    if sibling.pos_ == "ADJ" and sibling.dep_ in {"acomp", "attr"}:
                        # Filter on THIS adjective. Reusing the variable left
                        # over from loop (A) would test the wrong word, or fail
                        # with NameError when the term has no children.
                        adjective = sibling.lemma_.lower()
                        if not _is_directional(adjective):
                            adj_counts[adjective] += 1

    return ethnicity_dict


In [None]:
# Adjective counts for the single term 'japanese' (dict: term -> Counter).
japanese_adj = aapi_subject_adjs(df_c4_sample['spacy_doc'], {'japanese'})
# label matches what is printed: these are adjective counts
print("japanese adjective counter:", japanese_adj)

In [None]:

# Adjective counts for the single term 'chinese' (dict: term -> Counter).
chinese_adj = aapi_subject_adjs(df_c4_sample['spacy_doc'], {'chinese'})
# label matches what is printed: these are adjective counts
print("chinese adjective counter:", chinese_adj)

In [None]:
print("\n--- Log-odds for Adjectives: Japanese vs. Chinese ---")
logodds(japanese_adj['japanese'], chinese_adj['chinese'], display=5)

In [None]:
all_ethnicity_adjs = aapi_subject_adjs(df_c4_sample['spacy_doc'], aapi_groups_set)

# Making a function to log-odds across all AAPI

In [None]:
import numpy as np
import operator
from collections import Counter

def combine_counts(group_counters_dict):
    """Merge the per-group Counters into one Counter of summed counts.

    Args:
        group_counters_dict: dict of group name -> Counter. Only the values
            are used; the group names are ignored.

    Returns:
        A new Counter holding, for each key, the sum of its counts across all
        groups. The input Counters are left untouched.
    """
    totals = Counter()
    for per_group in group_counters_dict.values():
        totals.update(per_group)
    return totals

def log_odds_for_group(group_counters, target_group, display = 10):
    """
    Print a log-odds comparison of one or more target groups against all
    remaining groups combined.

    The counts of the target groups are merged into one table, the counts of
    every other group into a second table, and both are handed to ``logodds``,
    which prints the ranking.

    Args:
        group_counters (dict): Group name (str) -> Counter of word counts.
            The dict is only read; it is not modified.
        target_group (str or iterable of str): The group name, or several
            group names, forming the target side of the comparison.
        display (int): Number of words to print per side (passed to ``logodds``).

    Returns:
        None. Output is printed. If any target group is missing from
        ``group_counters`` a message is printed and nothing is computed.
    """
    # echo the inputs so the comparison being run is visible in the output
    print(group_counters.keys())

    print(target_group)

    # A bare string is one group name; iterating it would walk its characters.
    if isinstance(target_group, str):
        target_names = [target_group]
    else:
        target_names = list(target_group)

    # Validate everything first so a missing name never leaves a half-built split.
    for group in target_names:
        if group not in group_counters:
            print(f"Group {group} not found in group_counters.")
            return

    # Build both sides as new dicts instead of popping from the caller's dict,
    # so callers no longer need to pass a defensive copy.
    wanted = set(target_names)
    target_group_counters = {name: group_counters[name] for name in wanted}
    other_group_counters = {
        name: counter
        for name, counter in group_counters.items()
        if name not in wanted
    }

    logodds(combine_counts(target_group_counters), combine_counts(other_group_counters), display=display)

    return


East Asia vs. Asia

In [None]:
log_odds_for_group(all_ethnicity_verbs.copy(), {'chinese'})

Southeast Asia vs. Asia

In [None]:
log_odds_for_group(all_ethnicity_adjs.copy(), {'indonesian', 'malaysian','filipino', 'singaporean','cambodian'})

# OLMo

In [None]:
# Install ai2-olmo if not already installed
!pip install ai2-olmo --quiet

# Import necessary components from ai2_olmo
from ai2_olmo import OLMo, OLMoTokenizer

print("Loading OLMo model and tokenizer from ai2_olmo...")

try:
    # Define the model path (use the allenai/OLMo-1B-hf checkpoint which is compatible)
    # Note: ai2_olmo typically expects local paths to model weights or a Hugging Face model ID.
    # For simplicity, we'll try to load a known Hugging Face model directly if ai2_olmo supports it,
    # otherwise, you might need to download weights first.
    olmo_model_path = "allenai/OLMo-1B-hf" # Using the Hugging Face compatible version

    # Load the tokenizer
    # ai2_olmo might have its own tokenizer class or expect a transformers tokenizer
    # Let's try with OLMoTokenizer if it's part of ai2_olmo for the specified path
    tokenizer = OLMoTokenizer.from_pretrained(olmo_model_path)
    print(f"Tokenizer for '{olmo_model_path}' loaded successfully.")

    # Load the model
    # ai2_olmo.OLMo.from_pretrained expects a path or model ID.
    # It often handles device placement internally or through arguments.
    model = OLMo.from_pretrained(olmo_model_path)
    print(f"Model '{olmo_model_path}' loaded successfully.")

    # Tie weights to address the Accelerate warning
    model.tie_weights()

    # Move model to GPU if available (ai2_olmo might do this automatically or offer a .to_gpu() method)
    if torch.cuda.is_available():
        model.cuda() # ai2_olmo models might have a .cuda() method or be moved with .to('cuda')
        print("Model moved to GPU.")
    else:
        print("GPU not available, model running on CPU.")

    print("OLMo model and tokenizer are ready for use.")

    # Example: Generate some text
    prompt = "The quick brown fox jumps over the lazy"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device) # Ensure inputs are on the same device as model

    # Generate text (adjust max_new_tokens, num_beams for different generation styles)
    print("Generating text...")
    generated_tokens = model.generate(**inputs, max_new_tokens=50, num_beams=1, do_sample=True, temperature=0.7)
    generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

    print("\n--- Generated Text ---")
    print(generated_text)
    print("----------------------")

except Exception as e:
    print(f"Error using ai2_olmo: {e}")
    print("Please ensure ai2-olmo is correctly installed and the model ID/path is valid.")

# Task
Iterate through each AAPI ethnicity group in `aapi_groups_set` and find one example sentence from the `df_c4_sample` DataFrame where that ethnicity is mentioned, then print the ethnicity and its corresponding example sentence.

## find_example_sentences_for_ethnicities

### Subtask:
Iterate through the AAPI ethnicity groups and find one example sentence from the `df_c4_sample` for each ethnicity where it is mentioned.
