# coling2025_ling_analysis.ipynb
### Author: Amber Charlotte Converse
### Purpose: This file contains cells for processing text from data_master_text.csv to data files containing statistics on linguistic features.

In [None]:
# Run if running in Google Colab to link Google Drive file system
# (mounts Drive at /content/drive; skip this cell when not running in Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Library installs (if not already installed)
# NOTE: lines starting with "!" are IPython/Jupyter shell escapes, so this cell only runs inside a notebook.
#!pip install numpy==1.24
!pip install pandas
#!pip install -U pip setuptools wheel
# SpaCy is pinned to 3.7.6; the two transformer pipelines downloaded here are the ones loaded later via spacy.load().
!pip install spacy==3.7.6
!python3 -m spacy download en_core_web_trf
!python3 -m spacy download es_dep_news_trf
# farasapy provides FarasaPOSTagger and FarasaStemmer, used by the Arabic cells.
!pip install farasapy
# NOTE(review): nltk is imported by this notebook but not installed here - presumably preinstalled in the runtime; confirm.

In [None]:
# Required imports
import numpy as np
import pandas as pd
import spacy
import nltk
import time
import pickle
import re
import json
import requests
from multiprocessing import Pool
from farasa.pos import FarasaPOSTagger
from farasa.stemmer import FarasaStemmer

In [None]:
# Define file system:
# Fill these in before running the cells below. The three directory values are joined to
# file names with "/" inside f-strings, so they need no trailing slash.

# CSV file read with pd.read_csv; the cells below use its "id" column and its text columns.
path_to_data_master = ""
# Directory holding the "{lang}_spacy.pickle" files written and read by the cells below.
path_to_pickles = ""
# Directory holding the "{lang}_50k.txt", "{lang}_5k.txt" and "political_{lang}_5k.txt" vocabulary lists.
path_to_lang_vocab_lists = ""
# Directory the analysis CSV files are written to and read back from.
path_to_analysis_data = ""

In [None]:
def generate_spacy(text):
  '''
  Run the currently loaded SpaCy pipeline on one text and return the result as json, ready for pickling.

  Progress reporting relies on module-level state that the caller must set up before calling apply:
  i (row counter, starts at 0), num_rows (number of rows in the column) and start_time (time.time() at the start).
  The pipeline itself is read from the global nlp.

  :param: text (string): the sentence/paragraph to be processed using SpaCy.
  :return: SpaCy document in json format; the json of an empty document if processing raised
  '''
  global i
  global num_rows
  i += 1

  # Print a progress line and a time estimate every 500 rows
  if i % 500 == 0:
    now = time.time()
    percent_done = i / num_rows * 100
    print(f"{percent_done}% done.")
    seconds_per_row = (now - start_time) / i
    minutes_left = seconds_per_row * (num_rows - i) / 60
    print(f"Estimated time remaining: {minutes_left} minutes")

  try:
    doc_json = nlp(text).to_json()
  except Exception as e:
    # Best effort: report the failing row and keep the column aligned with an empty document
    print(f"Error on line {i}: {e}")
    doc_json = nlp("").to_json()
  return doc_json

In [None]:
# Run this cell to generate SpaCy documents for all columns specified in the array langs in data_master.csv
df = pd.read_csv(path_to_data_master)

# Define SpaCy model to use: should be en_core_web_trf for English and es_dep_news_trf for Spanish
nlp = spacy.load("Language model to use here")

# All columns to be processed (should be all one language, otherwise the nlp model will fail or process incorrectly)
langs = ["columns", "to", "process"]

# Progress-reporting state read by generate_spacy()
i = 0
num_rows = len(df)
start_time = time.time()

for lang in langs:
  # Restart the row counter AND the clock for every column: the time estimate in
  # generate_spacy() divides the elapsed time by i, so both must describe the same column.
  i = 0
  start_time = time.time()
  res = list(df[lang].apply(generate_spacy))
  with open(f"{path_to_pickles}/{lang}_spacy.pickle", 'wb') as f:
    pickle.dump(res, f)
  print(f"Saved {lang}.")

In [None]:
def count_pos(doc, parts_of_speech=[r"NOUN.*", r"VERB.*"]):
  '''
  Count how many tokens of the sentence carry each requested part of speech.

  :param: doc (SpaCy Doc): the document to be analyzed
  :param: parts_of_speech ([Str]): regex strings matched (with re.match) against each token's pos_ tag, by default nouns and verbs

  :return: a list with one count per regex, in the order of parts_of_speech; a list of None values if the document text is empty
  '''
  if doc.text == "":
    return [None] * len(parts_of_speech)
  tags = [token.pos_ for token in doc]
  # One tally per pattern: the number of tags the pattern matches from the start
  return [sum(1 for tag in tags if re.match(pattern, tag)) for pattern in parts_of_speech]

def count_lemma(doc):
  '''
  Count the distinct lemmas that occur in the sentence represented by doc.

  :param: doc (SpaCy Doc): the document to be analyzed

  :return: the number of unique lemma_ values in the document, or None if the document text is empty
  '''
  if doc.text == "":
    return None
  unique_lemmas = {token.lemma_ for token in doc}
  return len(unique_lemmas)

def count_all_lemma(doc):
  '''
  Record every lemma and word of the document in the corpus-wide sets of the current language.

  The sets live in the global dictionaries lemmas and words and are selected with the global lang.
  Tokens that are empty once punctuation is removed are skipped; everything is stored lower-cased.

  :param: doc (SpaCy Doc): the document to be analyzed

  :return: None (effect is addition to the sets in lemmas and words)
  '''
  if doc.text == "":
    return
  for token in doc:
    # Skip tokens that consist of punctuation only
    if re.sub(r"[^\w\s]", "", token.text) == "":
      continue
    lemmas[lang].add(token.lemma_.lower())
    words[lang].add(token.text.lower())

def measure_rarity(doc):
  '''
  Measure the proportion of word tokens in the text which are rare in a general corpus and in a political (genre) corpus.

  A token is rare when its lower-cased text is missing from the global set general_common_tokens
  (respectively genre_common_tokens). Tokens that are empty once punctuation is removed are ignored.

  :param: doc (SpaCy Doc): the document to be analyzed

  :return: float, float: the proportion of rare tokens compared to the general corpus and compared to the political corpus;
           None, None if the document contains no word tokens (empty or punctuation only), so that no division by zero occurs
  '''
  general_rare_token_count = 0
  genre_rare_token_count = 0
  word_count = 0
  for token in doc:
    # Punctuation-only tokens do not take part in the proportion
    if re.sub(r"[^\w\s]", "", token.text) == "":
      continue
    word_count += 1
    lowered = token.text.lower()
    if lowered not in general_common_tokens:
      general_rare_token_count += 1
    if lowered not in genre_common_tokens:
      genre_rare_token_count += 1
  if word_count == 0:
    # generate_spacy() stores an empty document for rows it failed on; report those as missing
    return None, None
  return general_rare_token_count / word_count, genre_rare_token_count / word_count

In [None]:
# Analyze noun, verb, and lemma counts
main_lang = "es" # Define language to analyze

# SpaCy pipeline name and corpus columns for each language handled by this cell.
# Arabic has no entry here: no SpaCy pipeline is loaded for it in this cell.
spacy_setups = {
  "en": ("en_core_web_trf",
         ["en", "es_en_DEEP", "es_en_DEEPL", "es_en_GOOGLE", "es_en_TRANSFORMERS",
          "ar_en_DEEP", "ar_en_DEEPL", "ar_en_GOOGLE", "ar_en_TRANSFORMERS"]),
  "es": ("es_dep_news_trf",
         ["es", "en_es_DEEP", "en_es_DEEPL", "en_es_GOOGLE", "en_es_TRANSFORMERS"]),
}
if main_lang in spacy_setups:
  model_name, langs = spacy_setups[main_lang]
  nlp = spacy.load(model_name)

# Load the pickled json documents and rebuild SpaCy Doc objects from them, one column per corpus
df = pd.DataFrame()

for lang in langs:
  with open(f"{path_to_pickles}/{lang}_spacy.pickle", 'rb') as f:
    df[lang] = pickle.load(f)
  df[lang] = df[lang].apply(lambda x: spacy.tokens.Doc(nlp.vocab).from_json(x))

master_df = pd.read_csv(path_to_data_master)
out_df = pd.DataFrame(data={"id":master_df["id"]})

for lang in langs:
  # count_pos returns [noun_count, verb_count] per row; transpose into two columns
  pos_counts = list(df[lang].apply(count_pos))
  noun_counts, verb_counts = zip(*pos_counts)
  out_df[f"{lang}_noun_counts"] = noun_counts
  out_df[f"{lang}_verb_counts"] = verb_counts
  out_df[f"{lang}_lemma_counts"] = df[lang].apply(count_lemma)

out_df.to_csv(f"{path_to_analysis_data}/{main_lang}_counts.csv", index=False)

In [None]:
# Count all unique lemmas and words in the corpus
# One empty set per corpus column; count_all_lemma fills the sets of the current global lang
lemmas = {}
words = {}
for lang in langs:
  lemmas[lang] = set()
  words[lang] = set()

corpus_counts_df = pd.DataFrame(data={"label":["unique lemmas","unique words"]})

for lang in langs:
  df[lang].apply(count_all_lemma)

# Row order matches the "label" column: unique lemmas first, unique words second
for lang in lemmas.keys():
  unique_lemma_total = len(lemmas[lang])
  unique_word_total = len(words[lang])
  corpus_counts_df[lang] = [unique_lemma_total, unique_word_total]

corpus_counts_df.to_csv(f"{path_to_analysis_data}/{main_lang}_corpus_unique_counts.csv", index=False)

In [None]:
# Create Vocab Files
# Text files are opened as UTF-8 explicitly so the lists (Arabic included) do not depend on the platform default encoding.

# 1) General vocabulary: keep the first token of each of the first 5000 lines of every 50k frequency list.
# The loop iterates a literal so that the notebook-wide langs list used by the analysis cells is not overwritten.
for lang in ["en","es","ar"]:
  with open(f"{path_to_lang_vocab_lists}/{lang}_50k.txt", 'r', encoding="utf-8") as in_file:
    top_lines = in_file.read().split("\n")[:5000]
  with open(f"{path_to_lang_vocab_lists}/{lang}_5k.txt", 'w', encoding="utf-8") as out_file:
    out_file.write("\n".join([line.split()[0] for line in top_lines]))

# 2) Political (genre) vocabulary: count lower-cased token frequencies over every corpus of a language.
freqs = {"en": {}, "es": {}}
for lang in ["en","es"]:
  if lang == "en":
    sub_langs = ["en", "es_en_DEEP", "es_en_DEEPL", "es_en_GOOGLE", "es_en_TRANSFORMERS", \
                  "ar_en_DEEP", "ar_en_DEEPL", "ar_en_GOOGLE", "ar_en_TRANSFORMERS"]
  elif lang == "es":
    sub_langs = ["es", "en_es_DEEP", "en_es_DEEPL", "en_es_GOOGLE", "en_es_TRANSFORMERS"]

  for sub_lang in sub_langs:
    # NOTE(review): the vocab of whichever nlp pipeline is currently loaded is used for both languages - confirm this is intended.
    with open(f"{path_to_pickles}/{sub_lang}_spacy.pickle", 'rb') as f:
      docs = [spacy.tokens.Doc(nlp.vocab).from_json(doc) for doc in pickle.load(f)]

    for doc in docs:
      for token in doc:
        # Ignore tokens that are only punctuation and/or whitespace
        if re.sub(r"[^\w\s]", "", token.text).strip() == "":
          continue
        key = token.text.lower()
        freqs[lang][key] = freqs[lang].get(key, 0) + 1

# Sort dicts by value and keep the 5000 most frequent tokens (ties keep first-seen order).
# The result is stored in top_words so that the words dictionary used by count_all_lemma is not overwritten.
for lang in freqs.keys():
  ranked = sorted(freqs[lang].items(), key=lambda item: item[1], reverse=True)
  top_words = [token_text for token_text, _ in ranked[:5000]]
  with open(f"{path_to_lang_vocab_lists}/political_{lang}_5k.txt", 'w', encoding="utf-8") as f:
    f.write("\n".join(top_words))

In [None]:
# Score rarity for sentences
# Requires main_lang and the matching nlp pipeline to be defined by an earlier cell.

# General tokens from https://github.com/hermitdave/FrequencyWords
# The vocabulary lists are read as UTF-8 explicitly so the result does not depend on the platform default encoding.
with open(f"{path_to_lang_vocab_lists}/{main_lang}_5k.txt", 'r', encoding="utf-8") as f:
  general_common_tokens = set(f.read().split("\n"))
with open(f"{path_to_lang_vocab_lists}/political_{main_lang}_5k.txt", 'r', encoding="utf-8") as f:
  genre_common_tokens = set(f.read().split("\n"))

if main_lang == "en":
  langs = ["en", "es_en_DEEP", "es_en_DEEPL", "es_en_GOOGLE", "es_en_TRANSFORMERS", \
               "ar_en_DEEP", "ar_en_DEEPL", "ar_en_GOOGLE", "ar_en_TRANSFORMERS"]
elif main_lang == "es":
  langs = ["es", "en_es_DEEP", "en_es_DEEPL", "en_es_GOOGLE", "en_es_TRANSFORMERS"]

master_df = pd.read_csv(path_to_data_master)
out_df = pd.DataFrame(data={"id":master_df["id"]})

# Start from an empty frame (as the Arabic rarity cell does) instead of relying on a df left over
# from another cell; every column used below is reloaded here.
df = pd.DataFrame()

for lang in langs:
  with open(f"{path_to_pickles}/{lang}_spacy.pickle", 'rb') as f:
    df[lang] = [spacy.tokens.Doc(nlp.vocab).from_json(doc) for doc in pickle.load(f)]

for lang in langs:
  # measure_rarity returns a (general, genre) pair per row; transpose into two columns
  general_rare_proportion, genre_rare_proportion = zip(*list(df[lang].apply(measure_rarity)))
  out_df[f"{lang}_general_rarity"] = general_rare_proportion
  out_df[f"{lang}_genre_rarity"] = genre_rare_proportion

out_df.to_csv(f"{path_to_analysis_data}/{main_lang}_rarity.csv", index=False)

In [None]:
# Arabic, SpaCy does not work for Arabic to the extent required, so we use Farasa instead, but it requires functions specific to Arabic

def generate_arabic_info(text):
  '''
  Generates a dictionary for a sentence in Arabic using Farasa which contains 1) the text, 2) the part of speech tagged sentence, and 3) the lemmatized sentence.
  Requires three global variables: i (0 at calling apply), num_rows (length of column), and start_time (time.time() at calling apply)
  These global variables are required for progress reporting.
  The Farasa tools are read from the globals pos_tagger and stemmer.

  :param: text (str): the sentence/paragraph to be processed using Farasa.
  :return: dictionary with the keys "text", "pos" and "lemma", or None when text is a float (a missing cell)
  '''
  global i
  global num_rows
  i += 1

  # Print a progress line and a time estimate every 500 rows
  if i % 500 == 0:
    cur_time = time.time()
    print(f"{i / num_rows * 100}% done.")
    print(f"Estimated time remaining: {((cur_time - start_time) / i) * (num_rows - i) / 60} minutes")

  # A float here means there is no text to analyze (presumably NaN from an empty CSV cell)
  if isinstance(text, float):
    return None

  # The unused web-API payload was removed: it read api_key, which is not defined in this notebook
  # and therefore raised a NameError on every row.
  return {
    "text": text,
    "pos": pos_tagger.tag(text),
    "lemma": stemmer.stem(text),
  }

In [None]:
# Parallel Arabic Functions

def parallelize_dataframe(df, func, n_cores, lang="ar"):
    '''
    This function splits a Pandas dataframe row-wise into n_cores chunks, applies func to each chunk
    in a pool of worker processes, and concatenates the results in the original order.

    :param: df (Pandas DataFrame): the dataframe to apply the function to
    :param: func (function): the function to be applied to each chunk; it must be picklable and return a dataframe
    :param: n_cores (int): the number of chunks and of worker processes
    :param: lang (str): the language of the text, default is "ar" (not used by this function; kept for existing callers)

    :return: Pandas DataFrame: the dataframe with the function applied to it
    '''
    # Split row positions rather than the dataframe itself: np.array_split on a DataFrame goes through
    # DataFrame.swapaxes, which pandas has deprecated. Chunk sizes are the same as before.
    chunk_positions = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[positions] for positions in chunk_positions]

    pool = Pool(n_cores)
    try:
        mapped = pool.map(func, df_split)
    finally:
        # Always release the worker processes, including when a worker raised
        pool.close()
        pool.join()
    return pd.concat(mapped)

def generate_arabic_info(text):
  '''
  Generates a dictionary for a sentence in Arabic using Farasa which contains 1) the text, 2) the part of speech tagged sentence, and 3) the lemmatized sentence.
  The Farasa tools are read from the globals pos_tagger and stemmer.

  :param: text (str): the sentence/paragraph to be processed using Farasa.
  :return: dictionary with the keys "text", "pos" and "lemma", or None when text is a float (a missing cell)
  '''
  # A float here means there is no text to analyze (presumably NaN from an empty CSV cell).
  # isinstance also covers float subclasses such as numpy.float64.
  if isinstance(text, float):
    return None

  return {
    "text": text,
    "pos": pos_tagger.tag(text),
    "lemma": stemmer.stem(text),
  }

def generate_arabic_info_from_df(df):
    '''
    Apply generate_arabic_info to one chunk of the dataframe. Meant to be passed to parallelize_dataframe.

    The column to read is chosen by the global lang; the result is stored in a new column named "{lang}_info".

    :param: df (Pandas DataFrame): the dataframe chunk to process (modified in place)
    :return: Pandas DataFrame: the same dataframe with the "{lang}_info" column added
    '''
    global lang
    df[f"{lang}_info"] = df[lang].map(generate_arabic_info)
    return df

In [None]:
# Generate pickles for Arabic
# Parallelized, must be run on HPC
# For every Arabic column, the rows are processed in batches with Farasa and the resulting
# dictionaries (None for skipped rows) are pickled as one list per column.

langs = ["ar", "en_ar_DEEP", "en_ar_DEEPL", "en_ar_GOOGLE", "en_ar_TRANSFORMERS"]

df = pd.read_csv(path_to_data_master)

# Batch state; it is re-initialized at the top of every iteration of the column loop below.
start = 0
limit = 94
total_time_start = time.time()
results = []

# Read as globals by generate_arabic_info.
pos_tagger = FarasaPOSTagger()
stemmer = FarasaStemmer()

# The loop variable must be the module-level name lang: generate_arabic_info_from_df reads it as a global.
for lang in langs:
    start = 0
    # limit is used both as the number of rows per batch and as the number of worker processes.
    limit = 94
    total_time_start = time.time()
    results = []
    while start < len(df):
        start_time = time.time()
        results.append(parallelize_dataframe(df[start:start+limit], generate_arabic_info_from_df, limit, lang=lang))
        end_time = time.time()
        # The printed upper bound start+limit is exclusive and may exceed the number of rows on the last batch.
        print(f'Batch of data of row range {start}-{start+limit} complete in {round(end_time-start_time, 2)} seconds')
        print(f'{round(min((((start+limit) / len(df)) * 100), 100), 2)}% complete')
        start+=limit

    results_df = pd.concat(results)
    total_time_end = time.time()
    print(f'total time taken: {round(total_time_end - total_time_start,2)} second')

    # The file keeps the "_spacy.pickle" suffix although it stores Farasa dictionaries; the Arabic cells below load it under that name.
    with open(f"{path_to_pickles}/{lang}_spacy.pickle", 'wb') as f:
      pickle.dump(list(results_df[f"{lang}_info"]), f)
      print(f"Saved {lang}.")

In [None]:
# Linguistic Analysis Functions overwritten for Farasa dicts

def count_pos(doc, parts_of_speech=[r"^NOUN$", r"^V$"]):
  '''
  Count parts of speech in the sentence represented by doc.

  The tagged sentence doc["pos"] is split on whitespace; for every piece of the form "word/TAGS" the
  tags (separated by "+") are matched against each regex. Pieces without a "/" are ignored.

  :param: doc (dict): the document to be analyzed, or None for a missing row
  :param: parts_of_speech ([Str]): an array of regex strings to define parts of speech to be counted, by default counts nouns and verbs

  :return: an array of integers representing the counts of each part of speech in the sentence, respective to the order of the regex array;
           an array of None values if doc is None
  '''
  if doc is None:
    return [None] * len(parts_of_speech)
  counts = [0] * len(parts_of_speech)
  for token in doc["pos"].split():
    pieces = token.split("/")
    if len(pieces) < 2:
      continue
    # Split the tag part once per token instead of once per pattern
    tags = pieces[1].split("+")
    for i, part_of_speech in enumerate(parts_of_speech):
      for cur_part_of_speech in tags:
        if re.match(part_of_speech, cur_part_of_speech):
          counts[i] += 1
  return counts

def count_lemma(doc):
  '''
  Counts the number of unique lemmas in the sentence represented by doc.

  :param: doc (dict): the document to be analyzed, or None for a missing row

  :return: an integer representing the number of unique whitespace-separated entries of doc["lemma"],
           or None if doc is None or its text is empty
  '''
  if doc is None or doc["text"] == "":
    return None
  return len(set(doc["lemma"].split()))

def count_all_lemma(doc):
  '''
  Adds all lemmas and words to sets for each language in the dictionaries lemmas and words. Used to define total number of unique lemmas and words

  Both doc["text"] and doc["lemma"] are tokenized with nltk.word_tokenize; tokens that are empty once
  punctuation is removed are skipped. The sets are selected with the global lang.

  :param: doc (dict): the document to be analyzed, or None for a missing row

  :return: None (effect is addition to the sets in lemmas and words)
  '''
  if doc is None:
    return
  tokens = nltk.word_tokenize(doc["text"])
  stems = nltk.word_tokenize(doc["lemma"])
  for token in tokens:
    if re.sub(r"[^\w\s]", "", token) != "":
      words[lang].add(token)
  for stem in stems:
    if re.sub(r"[^\w\s]", "", stem) != "":
      lemmas[lang].add(stem)

def measure_rarity(doc):
  '''
  Measure the proportion of word tokens in the text which are rare in a general corpus and in a political (genre) corpus.

  doc["text"] is tokenized with nltk.word_tokenize. A token is rare when it is missing from the global set
  general_common_tokens (respectively genre_common_tokens). Tokens that are empty once punctuation is removed are ignored.

  :param: doc (dict): the document to be analyzed, or None for a missing row

  :return: float, float: the proportion of rare tokens compared to the general corpus and compared to the political corpus;
           None, None if doc is None or the text contains no word tokens, so that no division by zero occurs
  '''
  if doc is None:
    return None, None
  general_rare_token_count = 0
  genre_rare_token_count = 0
  word_count = 0
  for token in nltk.word_tokenize(doc["text"]):
    # Punctuation-only tokens do not take part in the proportion
    if re.sub(r"[^\w\s]", "", token) == "":
      continue
    word_count += 1
    if token not in general_common_tokens:
      general_rare_token_count += 1
    if token not in genre_common_tokens:
      genre_rare_token_count += 1
  if word_count == 0:
    return None, None
  return general_rare_token_count / word_count, genre_rare_token_count / word_count

In [None]:
# Count nouns, verbs, and lemmas in Arabic sentences
# Uses the Farasa versions of count_pos and count_lemma defined above.

nltk.download("punkt")
main_lang = "ar"

if main_lang == "ar":
  langs = ["ar", "en_ar_DEEP", "en_ar_DEEPL", "en_ar_GOOGLE", "en_ar_TRANSFORMERS"]

# One column of Farasa dictionaries (None for skipped rows) per corpus
df = pd.DataFrame()

for lang in langs:
  with open(f"{path_to_pickles}/{lang}_spacy.pickle", 'rb') as f:
    docs = pickle.load(f)
  df[lang] = docs

master_df = pd.read_csv(path_to_data_master)
out_df = pd.DataFrame(data={"id":master_df["id"]})

for lang in langs:
  # count_pos returns [noun_count, verb_count] per row; transpose into two columns
  pos_counts = list(df[lang].apply(count_pos))
  noun_counts, verb_counts = zip(*pos_counts)
  out_df[f"{lang}_noun_counts"] = noun_counts
  out_df[f"{lang}_verb_counts"] = verb_counts
  out_df[f"{lang}_lemma_counts"] = df[lang].apply(count_lemma)

out_df.to_csv(f"{path_to_analysis_data}/{main_lang}_counts.csv", index=False)

In [None]:
# Count unique lemmas and words in Arabic corpora

# One empty set per corpus column; count_all_lemma fills the sets of the current global lang
lemmas = {}
words = {}
for lang in langs:
  lemmas[lang] = set()
  words[lang] = set()

corpus_counts_df = pd.DataFrame(data={"label":["unique lemmas","unique words"]})

for lang in langs:
  df[lang].apply(count_all_lemma)

# Row order matches the "label" column: unique lemmas first, unique words second
for lang in lemmas.keys():
  unique_lemma_total = len(lemmas[lang])
  unique_word_total = len(words[lang])
  corpus_counts_df[lang] = [unique_lemma_total, unique_word_total]

corpus_counts_df.to_csv(f"{path_to_analysis_data}/{main_lang}_corpus_unique_counts.csv", index=False)

In [None]:
# Generate common political vocab list for Arabic

# Token frequencies over every Arabic corpus (original text and machine translations)
freqs = {"ar": {}}
for lang in ["ar"]:

  # The corpus list belongs to the language being processed, so it is set from the loop
  # rather than from main_lang, which may hold another language if this cell is run on its own.
  sub_langs = ["ar", "en_ar_DEEP", "en_ar_DEEPL", "en_ar_GOOGLE", "en_ar_TRANSFORMERS"]

  for sub_lang in sub_langs:
    with open(f"{path_to_pickles}/{sub_lang}_spacy.pickle", 'rb') as f:
      docs = pickle.load(f)

    for doc in docs:
      # Rows that were skipped during Farasa processing are stored as None
      if doc is None:
        continue
      tokens = nltk.word_tokenize(doc["text"])
      for token in tokens:
        # Ignore tokens that are only punctuation and/or whitespace
        if re.sub(r"[^\w\s]", "", token).strip() == "":
          continue
        freqs[lang][token] = freqs[lang].get(token, 0) + 1

# Sort dicts by value and keep the 5000 most frequent tokens (ties keep first-seen order).
# The result is stored in top_words so that the words dictionary used by count_all_lemma is not overwritten.
for lang in freqs.keys():
  ranked = sorted(freqs[lang].items(), key=lambda item: item[1], reverse=True)
  top_words = [token_text for token_text, _ in ranked[:5000]]
  # Written as UTF-8 explicitly: the list is Arabic text and must not depend on the platform default encoding
  with open(f"{path_to_lang_vocab_lists}/political_{lang}_5k.txt", 'w', encoding="utf-8") as f:
    f.write("\n".join(top_words))

In [None]:
# Measure rarity in Arabic sentences
# Uses the Farasa version of measure_rarity defined above.

if main_lang == "ar":
  langs = ["ar", "en_ar_DEEP", "en_ar_DEEPL", "en_ar_GOOGLE", "en_ar_TRANSFORMERS"]

# General tokens from https://github.com/hermitdave/FrequencyWords
# The vocabulary lists are read as UTF-8 explicitly: they contain Arabic text and must not
# depend on the platform default encoding.
with open(f"{path_to_lang_vocab_lists}/{main_lang}_5k.txt", 'r', encoding="utf-8") as f:
  general_common_tokens = set(f.read().split("\n"))
with open(f"{path_to_lang_vocab_lists}/political_{main_lang}_5k.txt", 'r', encoding="utf-8") as f:
  genre_common_tokens = set(f.read().split("\n"))

# One column of Farasa dictionaries (None for skipped rows) per corpus
df = pd.DataFrame()

for lang in langs:
  with open(f"{path_to_pickles}/{lang}_spacy.pickle", 'rb') as f:
    df[lang] = pickle.load(f)

master_df = pd.read_csv(path_to_data_master)
out_df = pd.DataFrame(data={"id":master_df["id"]})

for lang in langs:
  # measure_rarity returns a (general, genre) pair per row; transpose into two columns
  general_rare_proportion, genre_rare_proportion = zip(*list(df[lang].apply(measure_rarity)))
  out_df[f"{lang}_general_rarity"] = general_rare_proportion
  out_df[f"{lang}_genre_rarity"] = genre_rare_proportion

out_df.to_csv(f"{path_to_analysis_data}/{main_lang}_rarity.csv", index=False)

In [None]:
# Generate difference files for raw measurements within languages
# Every machine-translated corpus is compared with the original corpus written in the same (target) language.

# Translated corpora grouped by the language they are written in
within_sub_langs = {
  "en": ["es_en_DEEP", "es_en_DEEPL", "es_en_GOOGLE", "es_en_TRANSFORMERS",
         "ar_en_DEEP", "ar_en_DEEPL", "ar_en_GOOGLE", "ar_en_TRANSFORMERS"],
  "es": ["en_es_DEEP", "en_es_DEEPL", "en_es_GOOGLE", "en_es_TRANSFORMERS"],
  "ar": ["en_ar_DEEP", "en_ar_DEEPL", "en_ar_GOOGLE", "en_ar_TRANSFORMERS"],
}

for lang in ["en", "es", "ar"]:
  sub_langs = within_sub_langs[lang]

  # Count differences: translated corpus minus original corpus
  df = pd.read_csv(f"{path_to_analysis_data}/{lang}_counts.csv")

  master_df = pd.read_csv(path_to_data_master)
  out_df = pd.DataFrame(data={"id":master_df["id"]})

  for sub_lang in sub_langs:
    for measure in ["noun_counts", "verb_counts", "lemma_counts"]:
      out_df[f"{sub_lang}_to_{lang}_difference_{measure}"] = df[f"{sub_lang}_{measure}"] - df[f"{lang}_{measure}"]

  out_df.to_csv(f"{path_to_analysis_data}/{lang}_counts_difference.csv", index=False)

  # Rarity differences: translated corpus minus original corpus
  df = pd.read_csv(f"{path_to_analysis_data}/{lang}_rarity.csv")

  master_df = pd.read_csv(path_to_data_master)
  out_df = pd.DataFrame(data={"id":master_df["id"]})

  for sub_lang in sub_langs:
    for measure in ["general_rarity", "genre_rarity"]:
      out_df[f"{sub_lang}_to_{lang}_difference_{measure}"] = df[f"{sub_lang}_{measure}"] - df[f"{lang}_{measure}"]

  out_df.to_csv(f"{path_to_analysis_data}/{lang}_rarity_difference.csv", index=False)

In [None]:
# Generate difference files for raw measurements between languages
# Every machine translation is compared with the original corpus it was translated from.

# Translations grouped by source language; each inner list shares one target language
between_sub_langs = {
  "en": [["en_es_DEEP", "en_es_DEEPL", "en_es_GOOGLE", "en_es_TRANSFORMERS"],
         ["en_ar_DEEP", "en_ar_DEEPL", "en_ar_GOOGLE", "en_ar_TRANSFORMERS"]],
  "es": [["es_en_DEEP", "es_en_DEEPL", "es_en_GOOGLE", "es_en_TRANSFORMERS"]],
  "ar": [["ar_en_DEEP", "ar_en_DEEPL", "ar_en_GOOGLE", "ar_en_TRANSFORMERS"]],
}

for lang in ["en", "es", "ar"]:
  sub_langs = between_sub_langs[lang]

  # Count differences: translation minus source-language original
  origin_df = pd.read_csv(f"{path_to_analysis_data}/{lang}_counts.csv")

  master_df = pd.read_csv(path_to_data_master)
  out_df = pd.DataFrame(data={"id":master_df["id"]})

  for to_langs in sub_langs:
    # The second "_"-separated field of a column name is the target language, which names the file to read
    target_code = to_langs[0].split('_')[1]
    to_df = pd.read_csv(f"{path_to_analysis_data}/{target_code}_counts.csv")

    for to_lang in to_langs:
      for measure in ["noun_counts", "verb_counts", "lemma_counts"]:
        out_df[f"{to_lang}_difference_{measure}"] = to_df[f"{to_lang}_{measure}"] - origin_df[f"{lang}_{measure}"]

  out_df.to_csv(f"{path_to_analysis_data}/from_{lang}_counts_difference.csv", index=False)

  # Rarity differences: translation minus source-language original
  # (these column names carry a "_to_{lang}" part that the count columns above do not have)
  origin_df = pd.read_csv(f"{path_to_analysis_data}/{lang}_rarity.csv")

  master_df = pd.read_csv(path_to_data_master)
  out_df = pd.DataFrame(data={"id":master_df["id"]})

  for to_langs in sub_langs:
    target_code = to_langs[0].split('_')[1]
    to_df = pd.read_csv(f"{path_to_analysis_data}/{target_code}_rarity.csv")

    for to_lang in to_langs:
      for measure in ["general_rarity", "genre_rarity"]:
        out_df[f"{to_lang}_to_{lang}_difference_{measure}"] = to_df[f"{to_lang}_{measure}"] - origin_df[f"{lang}_{measure}"]

  out_df.to_csv(f"{path_to_analysis_data}/from_{lang}_rarity_difference.csv", index=False)

In [None]:
# Compare overall averages of differences between corpora in rarity score
# Prints, for each translated corpus, the mean of (translated rarity - original rarity) as a percentage.
main_lang = "es"

if main_lang == "en":
  langs = ["es_en_DEEP", "es_en_DEEPL", "es_en_GOOGLE", "es_en_TRANSFORMERS", \
               "ar_en_DEEP", "ar_en_DEEPL", "ar_en_GOOGLE", "ar_en_TRANSFORMERS"]
elif main_lang == "es":
  langs = ["en_es_DEEP", "en_es_DEEPL", "en_es_GOOGLE", "en_es_TRANSFORMERS"]
elif main_lang == "ar":
  langs = ["en_ar_DEEP", "en_ar_DEEPL", "en_ar_GOOGLE", "en_ar_TRANSFORMERS"]

df = pd.read_csv(f"{path_to_analysis_data}/{main_lang}_rarity.csv")

for measurement in ["general", "genre"]:
  print(f"{measurement}:")
  for lang in langs:
    translated_column = f'{lang}_{measurement}_rarity'
    original_column = f'{main_lang}_{measurement}_rarity'
    row_differences = df.apply(lambda row: row[translated_column] - row[original_column], axis=1)
    average_percent = round((row_differences.mean()) * 100, 2)
    print(f"{lang}: {average_percent}%")
  print()

In [None]:
# Merge all files into one
# Every per-language analysis file is left-joined on "id" onto the id column of the first counts file.

langs = ["en", "es", "ar"]

# Raw measurements
count_dfs = [pd.read_csv(f"{path_to_analysis_data}/{lang}_counts.csv") for lang in langs]
rarity_dfs = [pd.read_csv(f"{path_to_analysis_data}/{lang}_rarity.csv") for lang in langs]

# Differences within a language
count_difference_within_dfs = [pd.read_csv(f"{path_to_analysis_data}/{lang}_counts_difference.csv") for lang in langs]
rarity_difference_within_dfs = [pd.read_csv(f"{path_to_analysis_data}/{lang}_rarity_difference.csv") for lang in langs]

# Differences between languages
count_difference_between_dfs = [pd.read_csv(f"{path_to_analysis_data}/from_{lang}_counts_difference.csv") for lang in langs]
rarity_difference_between_dfs = [pd.read_csv(f"{path_to_analysis_data}/from_{lang}_rarity_difference.csv") for lang in langs]

all_df_groups = [
  count_dfs,
  rarity_dfs,
  count_difference_within_dfs,
  rarity_difference_within_dfs,
  count_difference_between_dfs,
  rarity_difference_between_dfs,
]

linguistics_data_master_df = pd.DataFrame(data={"id":count_dfs[0]["id"]})

for df_group in all_df_groups:
  for df in df_group:
    # Keep one row per id on the right-hand side so the left join does not multiply rows
    df = df.drop_duplicates(subset="id")
    linguistics_data_master_df = linguistics_data_master_df.merge(df, how="left", on="id")

linguistics_data_master_df.to_csv(f"{path_to_analysis_data}/linguistics_data_master.csv", index=False)

In [None]:
def score_general_rarity(doc):
  '''
  Measure the proportion of word tokens in the text which are rare in a general corpus.

  A token is rare when its lower-cased text is missing from the global set general_common_tokens.
  Tokens that are empty once punctuation is removed are ignored.

  :param: doc (SpaCy Doc): the document to be analyzed

  :return: float: the proportion of rare tokens in the sentence as compared to a general corpus,
           or None if the document contains no word tokens (empty or punctuation only), so that no division by zero occurs
  '''
  general_rare_token_count = 0
  word_count = 0
  for token in doc:
    # Punctuation-only tokens do not take part in the proportion
    if re.sub(r"[^\w\s]", "", token.text) == "":
      continue
    word_count += 1
    if token.text.lower() not in general_common_tokens:
      general_rare_token_count += 1
  if word_count == 0:
    return None
  return general_rare_token_count / word_count

def score_general_rarity_from_df(col):
  '''
  Score every document of a column with score_general_rarity. Meant to be applied to a dataframe column by column.

  :param: col (Pandas Series): the column of SpaCy documents to score
  :return: Pandas Series: one general rarity score per document
  '''
  scores = col.apply(score_general_rarity)
  return scores

def score_general_rarity_arabic(doc):
  '''
  Measure the proportion of word tokens in the text which are rare in a general corpus for Arabic.

  doc["text"] is tokenized with nltk.word_tokenize. A token is rare when it is missing from the global set
  general_common_tokens. Tokens that are empty once punctuation is removed are ignored.

  :param: doc (dict): the document to be analyzed, or None for a missing row

  :return: float: the proportion of rare tokens in the sentence as compared to a general corpus,
           or None if doc is None or the text contains no word tokens, so that no division by zero occurs
  '''
  if doc is None:
    return None
  general_rare_token_count = 0
  word_count = 0
  for token in nltk.word_tokenize(doc["text"]):
    # Punctuation-only tokens do not take part in the proportion
    if re.sub(r"[^\w\s]", "", token) == "":
      continue
    word_count += 1
    if token not in general_common_tokens:
      general_rare_token_count += 1
  if word_count == 0:
    return None
  return general_rare_token_count / word_count

def score_general_rarity_arabic_from_df(col):
  '''
  Score every document of a column with score_general_rarity_arabic. Meant to be applied to a dataframe column by column.

  :param: col (Pandas Series): the column of Farasa dictionaries to score
  :return: Pandas Series: one general rarity score per document
  '''
  scores = col.apply(score_general_rarity_arabic)
  return scores

In [None]:
# Get average general rarity as size of common tokens list increases
# For every vocabulary size in range(size_start, size_end, size_step) the general rarity of every
# sentence is recomputed against the first `size` entries of the 50k frequency list of its language.

nltk.download("punkt")

size_start = 100
size_end = 30000
size_step = 250

master_df = pd.read_csv(path_to_data_master)
text_df = pd.DataFrame(data={"id":master_df["id"]})
out_df = pd.DataFrame(data={"id":master_df["id"]})

associated_langs = {"en":["en", "es_en_DEEP", "es_en_DEEPL", "es_en_GOOGLE", "es_en_TRANSFORMERS", \
                  "ar_en_DEEP", "ar_en_DEEPL", "ar_en_GOOGLE", "ar_en_TRANSFORMERS"],
                    "es":["es", "en_es_DEEP", "en_es_DEEPL", "en_es_GOOGLE", "en_es_TRANSFORMERS"],
                    "ar":["ar", "en_ar_DEEP", "en_ar_DEEPL", "en_ar_GOOGLE", "en_ar_TRANSFORMERS"]}

# Load every corpus: SpaCy documents for English and Spanish, Farasa dictionaries for Arabic
for lang in associated_langs.keys():
  if lang == "en":
    nlp = spacy.load("en_core_web_trf")
  elif lang == "es":
    nlp = spacy.load("es_dep_news_trf")

  for sub_lang in associated_langs[lang]:
    if lang in ["en", "es"]:
      with open(f"{path_to_pickles}/{sub_lang}_spacy.pickle", 'rb') as f:
        text_df[sub_lang] = [spacy.tokens.Doc(nlp.vocab).from_json(doc) for doc in pickle.load(f)]
    else:
      with open(f"{path_to_pickles}/{sub_lang}_spacy.pickle", 'rb') as f:
        text_df[sub_lang] = pickle.load(f)

# Read and parse each ranked frequency list once, instead of once per vocabulary size.
# Only the first size_end lines can ever be needed; blank lines are skipped so a trailing newline cannot fail.
# The files are read as UTF-8 explicitly so the result does not depend on the platform default encoding.
ranked_tokens = {}
for lang in ["en", "es", "ar"]:
  with open(f"{path_to_lang_vocab_lists}/{lang}_50k.txt", 'r', encoding="utf-8") as f:
    ranked_tokens[lang] = [line.split()[0] for line in f.read().split("\n")[:size_end] if line.strip()]

for i in range(size_start, size_end, size_step):
  for lang in ["en", "es", "ar"]:

    # The scoring functions read this global set
    general_common_tokens = set(ranked_tokens[lang][:i])

    scoring_function = score_general_rarity_arabic_from_df if lang == "ar" else score_general_rarity_from_df

    temp_df = text_df[associated_langs[lang]].apply(scoring_function)
    temp_df.rename(columns={sub_lang:f"{sub_lang}_general_rarity_{i}" for sub_lang in associated_langs[lang]}, inplace=True)
    temp_df["id"] = master_df["id"]
    # Keep one row per id on the right-hand side so the left join does not multiply rows
    temp_df = temp_df.drop_duplicates(subset="id")
    out_df = pd.merge(out_df, temp_df, how="left", on="id")

out_df.to_csv(f"{path_to_analysis_data}/rarity_raw_different_vocab_sizes.csv", index=False)

# Summary: mean rarity of every corpus at every vocabulary size
summary_df = pd.DataFrame(data={"val":list(range(size_start, size_end, size_step))})

for lang in ["en", "es", "ar"]:
  for sub_lang in associated_langs[lang]:
    column = []
    for i in range(size_start, size_end, size_step):
      column.append(out_df[f"{sub_lang}_general_rarity_{i}"].mean())
    summary_df[f"{sub_lang}_general_rarity"] = column

summary_df.to_csv(f"{path_to_analysis_data}/rarity_different_vocab_sizes_summary.csv", index=False)

In [None]:
# Collect the English sentences whose general rarity equals 1, together with their text
df = pd.read_csv(f"{path_to_analysis_data}/en_rarity.csv")

master_df = pd.read_csv(path_to_data_master)[["id", "en", "es_en_DEEPL"]]

# Take the ids from the master file so both frames share the same key
df["id"] = master_df["id"]

is_fully_rare = df["en_general_rarity"] == 1
df = df[is_fully_rare]

df = df.merge(master_df, how="left", on="id")

In [None]:
# Save the rows selected in the previous cell (the dataframe index is written as well).
# NOTE(review): this is a hard-coded Google Drive path, while other outputs use path_to_analysis_data - confirm this is intended.
df.to_csv("/content/drive/My Drive/coling2025/value_one_general_sentences.csv")

In [None]:
# Collect the sentences whose original Spanish rarity is more than 0.3 below that of the DeepL translation, together with their text
df = pd.read_csv(f"{path_to_analysis_data}/es_rarity.csv")

master_df = pd.read_csv(path_to_data_master)[["id", "en", "es", "en_es_DEEPL"]]

# Take the ids from the master file so both frames share the same key
df["id"] = master_df["id"]

rarity_gap = df["es_general_rarity"] - df["en_es_DEEPL_general_rarity"]
df = df[rarity_gap < -0.3]

df = df.merge(master_df, how="left", on="id")

In [None]:
# Save the rows selected in the previous cell (the dataframe index is written as well).
# NOTE(review): this is a hard-coded Google Drive path, while other outputs use path_to_analysis_data - confirm this is intended.
df.to_csv("/content/drive/My Drive/coling2025/increase_in_rarity_es.csv")