In [None]:
import pandas as pd

# German word-frequency list built from OpenSubtitles (the CISTEM variant, going by
# the file name), read straight from the upstream GitHub repository.
url = "https://raw.githubusercontent.com/olastor/german-word-frequencies/refs/heads/main/opensubtitles/opensubtitles_cistem_freq.csv"

# Read the table and order it from most to least frequent in a single chained
# expression, so the cell yields the same `df` no matter how often it is re-run.
df = pd.read_csv(url).sort_values(by="freq", ascending=False)
# Optional local copy of the raw table:
# df.to_csv("opensubtitles_cistem_freq.csv", index=False)


In [None]:
import sys
import os

# Put the parent directory (the RQ1 directory) on the import path so that
# packages living next to this notebook's folder can be imported.
rq1_dir = os.path.abspath("../")
if rq1_dir not in sys.path:  # do not stack duplicate entries when the cell is re-run
    sys.path.append(rq1_dir)

# A notebook executes as a top-level script with no parent package, so a relative
# import (`from ..WordNonword.classification import ...`) fails with
# "ImportError: attempted relative import with no known parent package".
# With the parent directory on sys.path the package is imported absolutely instead.
from WordNonword.classification import WordNonwordClassifier

# Alternative checkpoints that can be swapped in:
# model_name = "google/gemma-3-12b-it"
# model_name = "google/gemma-3-12b-pt"
# model_name = "meta-llama/Llama-2-7b-chat-hf"
model_name = "Tower-Babel/Babel-9B-Chat"

# Per the original author's note, the language argument is required by the class
# but is not used in the model name; only the classifier's tokenizer is used below.
# NOTE(review): "English" is passed here while the word list is German - confirm it is irrelevant.
word_nonword_cls = WordNonwordClassifier("English", model_name)

In [None]:
from tqdm import tqdm


def _tokens_or_empty(word):
    """Return the model tokenizer's tokens for `word`, or [] when the word is missing (NaN)."""
    if pd.isna(word):
        return []  # or ['<unk>'], or skip, depending on your use case
    return word_nonword_cls.tokenizer.tokenize(str(word))


# One token list per row, in row order, with a progress bar over the word column.
tokenized_words = [_tokens_or_empty(word) for word in tqdm(df['word'])]
df['word_tokens'] = tokenized_words

# Persist the table including the new token column so later cells can reload it.
df.to_csv("opensubtitles_cistem_freq.csv", index=False)

In [None]:
import pandas as pd

# Checkpoint: pick the table up from the CSV on disk rather than from kernel state.
# Values come back as parsed by read_csv, so list-valued cells are plain strings here.
tokenized_csv = "opensubtitles_cistem_freq.csv"
df = pd.read_csv(tokenized_csv)

In [None]:
import ast  # literal_eval is used below; without this import the cell raises NameError on a fresh kernel

# After the CSV round trip each `word_tokens` cell is the string repr of a token
# list, so it is parsed back into a list before its length is taken.
# NOTE: the column keeps its existing 'word_toens_len' spelling because a later
# cell filters on exactly that name.
df['word_toens_len'] = df['word_tokens'].apply(lambda x: len(ast.literal_eval(x)))

# How many words split into 1, 2, 3, ... tokens.
df['word_toens_len'].value_counts()

In [None]:
import stanza

# German stanza pipeline limited to tokenization, POS tagging and lemmatization,
# with GPU use requested.
pipeline_options = {
    "lang": "de",
    "processors": "tokenize,pos,lemma",
    "use_gpu": True,
}
nlp = stanza.Pipeline(**pipeline_options)

In [None]:
from tqdm import tqdm


def _sentences_or_none(word):
    """Run the stanza pipeline on `word`; return its sentences, or None if anything fails."""
    try:
        return nlp(word).sentences
    except Exception as e:
        # Best effort: report the failure and keep going so one bad word does not stop the run.
        print(f"Error processing word '{word}': {e}")
        return None


# Keep words that map to exactly one model token and whose frequency is not 1.
is_single_token = df['word_toens_len'] == 1
is_not_hapax = df['freq'] != 1
df_single_token = df[is_single_token & is_not_hapax].reset_index(drop=True)

# One entry per kept word, in row order (None marks a failed analysis).
stanza_results = [_sentences_or_none(word) for word in tqdm(df_single_token['word'])]
df_single_token['stanza'] = stanza_results

In [None]:
# Sanity check: how many sentences stanza produced per word.
# Words whose analysis failed were stored as None, and len(None) raises TypeError,
# so those rows are counted separately (they appear as NaN) instead of crashing the cell.
sentence_counts = df_single_token['stanza'].apply(
    lambda sent_list: len(sent_list) if sent_list is not None else None
)
print(sentence_counts.value_counts(dropna=False))

# UPOS tag of the first word of the first sentence; None when the analysis is
# missing, has no sentences, or the first sentence has no words.
df_single_token['upos_first'] = df_single_token['stanza'].apply(
    lambda sent_list: sent_list[0].words[0].upos if sent_list and sent_list[0].words else None
)

# Drop every word containing a character other than ASCII letters, German umlauts or ß.
# na=False treats non-string/missing words as "no match", so they are kept, as before.
# .copy() makes the result an independent frame, so assigning columns to it later
# (including when this cell is re-run) does not write into a slice.
df_single_token = df_single_token[~df_single_token['word'].str.contains(r'[^a-zA-ZäöüÄÖÜß]', na=False)].copy()

In [None]:
# Frequency of each first-word UPOS tag among the remaining words.
upos_counts = df_single_token['upos_first'].value_counts()
upos_counts

In [None]:
lang = "German"
model_name = "Tower-Babel/Babel-9B-Chat"

# Leave out words tagged PROPN (proper noun) or X (other), then renumber the rows.
excluded_upos = ["PROPN","X"]
is_excluded = df_single_token['upos_first'].isin(excluded_upos)
word_list = df_single_token[~is_excluded].reset_index(drop=True)

# The file name carries the model's short name (text after the last '/') and the language.
# NOTE(review): the destination is an absolute, machine-specific path - adjust it when running elsewhere.
word_list.to_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordIdentity/opensubtitles_single_token_list_{model_name.split('/')[-1]}_{lang}.csv", index=False)