# Read in files

In [4]:
import pandas as pd 
import json 
from src.helpers import text2list

from collections import Counter


# Fixed seed so the 50% subsamples drawn below are the same on every run.
RANDOM_SEED = 42

# Cleaned corpora in JSON Lines format (date range as encoded in the file names).
arxiv_fn = "../data/clean/arxiv_2018-01-01_2025-05-20_cs__.jsonl"
nyt_fn = "../data/clean/nyt_2018-01-01_2025-05-20.jsonl"


# Term lists are loaded with the project helper `text2list`.
# NOTE(review): presumably returns a list of str (the lists are concatenated with `+`
# and iterated term by term further down) -- confirm in src.helpers.
ai_terms = text2list("../data/clean/ai_terms.txt")

# Legacy flattening code, kept for reference; it treats `ai_terms` as a dict of lists.
# all_terms = list(ai_terms.values())
# all_terms = [item for sublist in all_terms for item in sublist]
# all_terms = [x.lower() for x in all_terms if isinstance(x, str)]
# all_terms = list(set(all_terms))

atus_roles = text2list("../data/clean/atus_roles.txt")
onet_roles = text2list("../data/clean/onet_roles.txt")
nouns = ['advice', 'feedback', 'support', 'guidance', 'encouragement', 'trust', 'communication', 'interaction', 'collaboration', 'relationship', 'connection', 'understanding', 'empathy', 'mentorship', 'network', 'rapport', 'bond', 'influence', 'cooperation', 'engagement']

ai_compound_roles = text2list("../data/clean/ai_compound_roles.txt")
ai_compound_nouns = text2list("../data/clean/ai_compound_nouns.txt")

ai_compound = ai_compound_roles + ai_compound_nouns

# Work on a reproducible 50% sample of each corpus.
arxiv_df = pd.read_json(arxiv_fn, lines=True).sample(frac=0.5, random_state=RANDOM_SEED)
# fillna("") keeps a row's remaining text when one field is missing; without it a single
# missing field turns the whole concatenation into NaN.
arxiv_df['text'] = arxiv_df['title'].fillna("") + " " + arxiv_df['abstract'].fillna("")

nyt_df = pd.read_json(nyt_fn, lines=True).sample(frac=0.5, random_state=RANDOM_SEED)
nyt_df['text'] = (
    nyt_df['headline'].fillna("")
    + " " + nyt_df['abstract'].fillna("")
    + " " + nyt_df['snippet'].fillna("")
)


# Count words

In [5]:
from flashtext import KeywordProcessor
from collections import Counter
import pandas as pd
import swifter 

class FastFlashTextCounter:
    """Keyword counter that keeps one flashtext ``KeywordProcessor`` per named word list."""

    def __init__(self, word_lists_dict):
        """Register every keyword of every list in its own case-insensitive processor.

        Args:
            word_lists_dict (dict): Maps a list name to an iterable of keyword strings.
                Each keyword is lower-cased before it is added.
        """
        # list name -> KeywordProcessor loaded with that list's keywords
        self.processors = {}

        for list_name, keywords in word_lists_dict.items():
            matcher = KeywordProcessor(case_sensitive=False)
            for keyword in keywords:
                matcher.add_keyword(keyword.lower())
            self.processors[list_name] = matcher

        print("FastFlashTextCounter initialized with word lists.")

    def count_keywords(self, text, processor_name):
        """Count keyword hits of one word list in a single text.

        Args:
            text: Text to scan; it is converted with ``str`` and lower-cased first.
            processor_name: Name of the word list (a key of ``self.processors``).

        Returns:
            dict: Keyword -> number of occurrences, in order of first occurrence.
            Empty when ``text`` is missing (``pd.isna``) or falsy.

        Raises:
            KeyError: If ``processor_name`` was not registered.
        """
        if pd.isna(text) or not text:
            return {}

        matcher = self.processors[processor_name]
        tallies = {}
        for hit in matcher.extract_keywords(str(text).lower()):
            tallies[hit] = tallies.get(hit, 0) + 1
        return tallies


# Keyword lists per category; the category name becomes the prefix of the
# `<name>_word_counts` / `<name>_sum` columns created below.
word_lists = {
    'ai': ai_terms,
    'social': atus_roles + onet_roles + nouns,
    'ai_compound_roles': ai_compound_roles,
    'ai_compound_nouns': ai_compound_nouns,
}
counter = FastFlashTextCounter(word_lists)


# For each category: per-row keyword dict, then per-row total, first for arXiv and then for NYT.
for name, word_list in word_lists.items():
    print(f"Counting keywords for '{name}'...")
    for corpus_df in (arxiv_df, nyt_df):
        corpus_df[f'{name}_word_counts'] = corpus_df['text'].swifter.apply(
            lambda x: counter.count_keywords(x, name)
        )
        corpus_df[f'{name}_sum'] = corpus_df[f'{name}_word_counts'].swifter.apply(
            lambda x: sum(x.values())
        )
    print(f"Keyword counts for '{name}' completed.")

FastFlashTextCounter initialized with word lists.
Counting keywords for 'ai'...


Python(32482) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Pandas Apply:   0%|          | 0/302359 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/302359 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/149074 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/149074 [00:00<?, ?it/s]

Keyword counts for 'ai' completed.
Counting keywords for 'social'...


Pandas Apply:   0%|          | 0/302359 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/302359 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/149074 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/149074 [00:00<?, ?it/s]

Keyword counts for 'social' completed.
Counting keywords for 'ai_compound_roles'...


Pandas Apply:   0%|          | 0/302359 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/302359 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/149074 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/149074 [00:00<?, ?it/s]

Keyword counts for 'ai_compound_roles' completed.
Counting keywords for 'ai_compound_nouns'...


Pandas Apply:   0%|          | 0/302359 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/302359 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/149074 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/149074 [00:00<?, ?it/s]

Keyword counts for 'ai_compound_nouns' completed.


In [11]:
def sum_dicts(list_of_dicts):
    """
    Add up several count dictionaries key by key.

    The dictionaries do not need to share keys; a key that is absent from a
    dictionary simply contributes nothing for it.

    Args:
        list_of_dicts (list): Dictionaries mapping keys to numeric counts.

    Returns:
        dict: Key-wise totals as ordered by ``sort_dict``; ``{}`` for empty input.
    """
    if list_of_dicts:
        running_totals = Counter()
        for counts in list_of_dicts:
            # Counter.update adds to existing counts instead of overwriting them.
            running_totals.update(counts)
        return sort_dict(dict(running_totals))

    return {}

def sort_dict(d):
    """
    Return a copy of a dictionary ordered by value, largest first.

    Entries with equal values keep their original relative order.

    Args:
        d (dict): Dictionary whose values can be compared with each other.

    Returns:
        dict: New dictionary with the same items in descending value order.
    """
    ranked_items = list(d.items())
    ranked_items.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranked_items)

# Corpus-level totals for the AI compound nouns: one dict per corpus, then both combined.
arxiv_ai_compound_nouns, nyt_ai_compound_nouns = (
    sum_dicts(corpus['ai_compound_nouns_word_counts'].tolist())
    for corpus in (arxiv_df, nyt_df)
)
total_ai_compound_nouns = sum_dicts([arxiv_ai_compound_nouns, nyt_ai_compound_nouns])

# Same aggregation for the AI compound roles.
arxiv_ai_compound_roles, nyt_ai_compound_roles = (
    sum_dicts(corpus['ai_compound_roles_word_counts'].tolist())
    for corpus in (arxiv_df, nyt_df)
)
total_ai_compound_roles = sum_dicts([arxiv_ai_compound_roles, nyt_ai_compound_roles])

In [16]:
# Display the NYT AI-compound-role totals ordered by count, largest first.
sort_dict(nyt_ai_compound_roles)

{}

In [5]:
from flashtext import KeywordProcessor
from collections import Counter
import pandas as pd


class FastFlashTextCounter:
    """Counts single keywords and keyword pairs ("bigrams") with flashtext processors.

    For every named word list two case-insensitive ``KeywordProcessor`` objects are
    kept: one holding the list entries themselves and one holding every
    "<entry i> <entry j>" combination with i < j (list order, space separated).
    """

    def __init__(self, word_lists_dict):
        """Build the per-list processors.

        Args:
            word_lists_dict (dict): Maps a list name to an indexable sequence of
                keyword strings. All keywords and pairs are lower-cased when added.
        """
        # list name -> processor holding the individual entries
        self.word_processors = {}
        # list name -> processor holding the ordered pairs of entries
        self.bigram_processors = {}

        for list_name, entries in word_lists_dict.items():
            singles = KeywordProcessor(case_sensitive=False)
            for entry in entries:
                singles.add_keyword(entry.lower())
            self.word_processors[list_name] = singles

            pairs = KeywordProcessor(case_sensitive=False)
            n_entries = len(entries)
            # Only pairs in list order are generated, i.e. n * (n - 1) / 2 phrases.
            for i in range(n_entries):
                for j in range(i + 1, n_entries):
                    pairs.add_keyword(f"{entries[i]} {entries[j]}".lower())
            self.bigram_processors[list_name] = pairs

    def count_keywords(self, text, processor_name):
        """Count single-entry and pair hits of one word list in a text.

        Args:
            text: Text to scan; converted with ``str`` and lower-cased first.
            processor_name: Name of the word list to use.

        Returns:
            dict: Matched keyword or pair -> number of hits; the two processors are
            run independently on the same text and their tallies are added.
            Empty when ``text`` is missing (``pd.isna``) or falsy.
        """
        if pd.isna(text) or not text:
            return {}

        lowered = str(text).lower()
        tallies = Counter(self.word_processors[processor_name].extract_keywords(lowered))
        # Adding the pair hits on top of the single hits gives the same totals as
        # summing two separate Counters, because every tally is positive.
        tallies.update(self.bigram_processors[processor_name].extract_keywords(lowered))
        return dict(tallies)


# Example usage
# Example usage on a toy word list.
# Note: this rebinds the notebook-level names `word_lists` and `counter`.
word_lists = {'apple': ['apple', 'granny smith']}
counter = FastFlashTextCounter(word_lists)

sample_text = ["I had an apple that was tasty. It was a granny smith apple."]
result = counter.count_keywords(*sample_text, 'apple')  # the list holds exactly one text
print(result)

{'apple': 2, 'granny smith': 1}


In [None]:
# --- Method Implementations ---

# M0: Pure Python - Naive str.count (Substring, Flawed)
def count_words_M0(df_input: pd.DataFrame, word_list_to_count: list[str]) -> list[dict[str, int]]:
    """Baseline: count each term per row with plain ``str.count``.

    This is a substring count, so a term is also counted inside longer words
    (e.g. "cat" inside "concat"); matching is case-sensitive.

    Args:
        df_input: Frame with a ``'text'`` column; each value is converted with ``str``.
        word_list_to_count: Terms to count.

    Returns:
        One dict per row mapping every term to its substring count.
    """
    return [
        {term: text_str.count(term) for term in word_list_to_count}
        for text_str in map(str, df_input['text'])
    ]

# M1: Pure Python - Row-wise re.findall (One Regex Per Term)
def count_words_M1(df_input: pd.DataFrame, word_list_to_count: list[str]) -> list[dict[str, int]]:
    """Count whole-word matches per row using one compiled regex per term.

    Args:
        df_input: Frame with a ``'text'`` column; each value is converted with ``str``.
        word_list_to_count: Terms to count; each is escaped and wrapped in ``\\b``.

    Returns:
        One dict per row mapping every term to its number of matches.
    """
    # Compile once up front so the row loop only runs findall.
    matchers = {
        term: re.compile(r'\b' + re.escape(term) + r'\b')
        for term in word_list_to_count
    }

    results = []
    for raw_text in df_input['text']:
        text_str = str(raw_text)
        results.append({term: len(rx.findall(text_str)) for term, rx in matchers.items()})
    return results

# M2: Pure Python - Row-wise Single Compiled Regex
# One alternation over every escaped term of the global WORD_LIST, anchored on word
# boundaries; the capturing group makes findall return the matched term itself.
COMPILED_REGEX_M2 = re.compile(
    r'\b(' + '|'.join(map(re.escape, WORD_LIST)) + r')\b'
)
def count_words_M2(df_input: pd.DataFrame, word_list_to_count: list[str]) -> list[dict[str, int]]:
    """Count terms per row with a Python loop and the single precompiled regex.

    The pattern is the module-level ``COMPILED_REGEX_M2`` (built from the global
    ``WORD_LIST``), not one derived from ``word_list_to_count``; the argument only
    decides which keys appear in the result. Matches that are not in
    ``word_list_to_count`` are dropped.

    Args:
        df_input: Frame with a ``'text'`` column; each value is converted with ``str``.
        word_list_to_count: Terms reported in every row dict.

    Returns:
        One dict per row mapping every term to its match count (0 if absent).
    """
    pattern = COMPILED_REGEX_M2
    zero_counts = dict.fromkeys(word_list_to_count, 0)

    results = []
    for raw_text in df_input['text']:
        row_counts = zero_counts.copy()
        for word, n_hits in Counter(pattern.findall(str(raw_text))).items():
            if word in row_counts:
                row_counts[word] = n_hits
        results.append(row_counts)
    return results

# M3: Pandas apply - Single Compiled Regex
def count_words_M3(df_input: pd.DataFrame, word_list_to_count: list[str]) -> list[dict[str, int]]:
    """Same counting as M2, but driven by ``Series.apply`` instead of a Python loop.

    Uses the module-level ``COMPILED_REGEX_M2`` (built from the global ``WORD_LIST``);
    ``word_list_to_count`` only decides which keys appear in each row dict.

    Args:
        df_input: Frame with a ``'text'`` column; each value is converted with ``str``.
        word_list_to_count: Terms reported in every row dict.

    Returns:
        One dict per row mapping every term to its match count (0 if absent).
    """
    pattern = COMPILED_REGEX_M2
    zero_counts = dict.fromkeys(word_list_to_count, 0)

    def tally_row(raw_text):
        row_counts = zero_counts.copy()
        for word, n_hits in Counter(pattern.findall(str(raw_text))).items():
            if word in row_counts:
                row_counts[word] = n_hits
        return row_counts

    return df_input['text'].apply(tally_row).tolist()

# M4: Pandas Vectorized - str.findall with Single Compiled Regex (Original Best)
def count_words_M4(df_input: pd.DataFrame, word_list_to_count: list[str]) -> list[dict[str, int]]:
    """Count terms per row with pandas ``str.findall`` and one alternation regex.

    Unlike M2/M3 the pattern is built from ``word_list_to_count`` itself.
    NOTE(review): the original comment says the list is already sorted longest
    first (this matters for overlapping alternatives) -- confirm at the call site.

    Args:
        df_input: Frame with a ``'text'`` column; cast to ``str`` before matching.
        word_list_to_count: Terms to count.

    Returns:
        One dict per row mapping every term to its match count (0 if absent).
    """
    alternation = '|'.join(map(re.escape, word_list_to_count))
    pattern_str = r'\b(' + alternation + r')\b'

    matches_per_row = df_input['text'].astype(str).str.findall(pattern_str)

    zero_counts = dict.fromkeys(word_list_to_count, 0)
    results = []
    for row_matches in matches_per_row:
        row_counts = zero_counts.copy()
        for term, n_hits in Counter(row_matches).items():
            if term in row_counts:
                row_counts[term] = n_hits
        results.append(row_counts)
    return results

# M5: Pandas Vectorized - str.count (One Regex Per Term, then combine)
def count_words_M5(df_input: pd.DataFrame, word_list_to_count: list[str]) -> list[dict[str, int]]:
    """Count terms with one vectorized ``str.count`` pass per term.

    Args:
        df_input: Frame with a ``'text'`` column; cast to ``str`` before matching.
        word_list_to_count: Terms to count; each becomes one column of an
            intermediate counts frame that shares ``df_input``'s index.

    Returns:
        One dict per row (``to_dict(orient='records')``) mapping term to count.
    """
    texts = df_input['text'].astype(str)

    per_term_counts = pd.DataFrame(index=df_input.index)
    for term in word_list_to_count:
        per_term_counts[term] = texts.str.count(r'\b' + re.escape(term) + r'\b')

    return per_term_counts.to_dict(orient='records')

# M6: flashtext Library
# flashtext is optional: record whether it is importable so M6 can be skipped cleanly.
try:
    from flashtext import KeywordProcessor
    FLASHTEXT_LOADED = True
except ImportError:
    FLASHTEXT_LOADED = False
    print("FlashText not installed. Skipping M6.")

def count_words_M6(df_input: pd.DataFrame, word_list_to_count: list[str]) -> list[dict[str, int]]:
    """Count terms per row with a case-sensitive flashtext ``KeywordProcessor``.

    Args:
        df_input: Frame with a ``'text'`` column; each value is converted with ``str``.
        word_list_to_count: Terms to count; each is registered mapped to itself.

    Returns:
        One dict per row mapping every term to its match count (0 if absent).
        When flashtext is not installed, an all-zero dict per row is returned.
    """
    base_dict = {word: 0 for word in word_list_to_count}

    if not FLASHTEXT_LOADED:
        # Build the dummy rows from the requested terms so their keys match what the
        # real path would produce, without depending on any other global.
        return [base_dict.copy() for _ in range(len(df_input))]

    keyword_processor = KeywordProcessor(case_sensitive=True)
    for term in word_list_to_count:
        keyword_processor.add_keyword(term, term)  # map term to itself

    results = []
    for text in df_input['text']:
        current_counts = base_dict.copy()
        # extract_keywords returns the mapped values, i.e. the terms themselves.
        for term, count in Counter(keyword_processor.extract_keywords(str(text))).items():
            if term in current_counts:
                current_counts[term] = count
        results.append(current_counts)
    return results

# M7: CountVectorizer (Custom token_pattern)
# scikit-learn is optional: record whether it is importable so M7 can be skipped cleanly.
try:
    from sklearn.feature_extraction.text import CountVectorizer
    SKLEARN_LOADED = True
except ImportError:
    SKLEARN_LOADED = False
    print("Scikit-learn not installed. Skipping M7.")

def count_words_M7(df_input: pd.DataFrame, word_list_to_count: list[str]) -> list[dict[str, int]]:
    """Count terms per row with ``CountVectorizer`` using the terms as token pattern.

    Args:
        df_input: Frame with a ``'text'`` column; cast to ``str`` before vectorizing.
        word_list_to_count: Terms to count (case-sensitive, ``lowercase=False``).
            NOTE(review): the original comment says the list is already sorted
            longest first -- confirm at the call site.

    Returns:
        One dict per row mapping every term to its count as a plain ``int``.
        When scikit-learn is not installed, an all-zero dict per row is returned.
    """
    if not SKLEARN_LOADED:
        # Dummy rows are keyed by the requested terms, matching the real path's output.
        return [{term: 0 for term in word_list_to_count} for _ in range(len(df_input))]

    # Tokens are exactly the listed terms, so the fitted vocabulary is a subset of them.
    regex_for_cv_tokens = r'\b(?:' + '|'.join(re.escape(term) for term in word_list_to_count) + r')\b'

    cv = CountVectorizer(token_pattern=regex_for_cv_tokens, lowercase=False)

    # NOTE(review): fit_transform is expected to raise ValueError ("empty vocabulary")
    # when no term occurs in any document -- confirm and handle if that can happen.
    X = cv.fit_transform(df_input['text'].astype(str))
    # Vocabulary actually built from the data (only terms that were found).
    fitted_cv_vocab = cv.get_feature_names_out()

    results = []
    for i in range(X.shape[0]):
        # Start from zeros so terms that never occur are still reported.
        row_counts = {term: 0 for term in word_list_to_count}
        doc_vector = X[i]
        if doc_vector.nnz > 0:  # at least one term found in this row
            for term_idx, count in zip(doc_vector.indices, doc_vector.data):
                term = fitted_cv_vocab[term_idx]
                if term in row_counts:
                    # Cast the numpy integer so the result type matches the other methods.
                    row_counts[term] = int(count)
        results.append(row_counts)
    return results


# M8: Pure Python - Tokenize then Match N-grams
def simple_tokenizer(text: str) -> list[str]:
    """Split ``text`` on single space characters and drop the empty pieces.

    Only ``' '`` separates tokens; tabs, newlines and punctuation stay inside tokens.
    """
    return list(filter(None, text.split(' ')))

def count_words_M8(df_input: pd.DataFrame, word_list_to_count: list[str]) -> list[dict[str, int]]:
    """Count terms by tokenizing each row and sliding every term over the tokens.

    Terms and texts are both split on single spaces, so a multi-word term matches
    only as a run of consecutive, exactly equal tokens (punctuation attached to a
    word prevents a match).

    Args:
        df_input: Frame with a ``'text'`` column; each value is converted with ``str``.
        word_list_to_count: Terms (possibly multi-word) to count.

    Returns:
        One dict per row mapping every term to its number of occurrences.
    """
    zero_counts = dict.fromkeys(word_list_to_count, 0)
    # term -> its tokens, split once instead of once per row
    tokens_by_term = {term: term.split(' ') for term in word_list_to_count}

    results = []
    for raw_text in df_input['text']:
        tokens = simple_tokenizer(str(raw_text))
        row_counts = zero_counts.copy()

        for term, term_tokens in tokens_by_term.items():
            width = len(term_tokens)
            # An empty token list (or one shorter than the term) yields an empty range.
            hits = sum(
                1
                for start in range(len(tokens) - width + 1)
                if tokens[start:start + width] == term_tokens
            )
            if hits:
                row_counts[term] = hits

        results.append(row_counts)
    return results

# --- Multiprocessing Helper ---
def process_chunk_M2(text_chunk_list: list[str]) -> list[dict[str, int]]:
    """Worker for the multiprocessing benchmark: M2-style counting on a list of texts.

    The regex and the zero-count template are rebuilt here from the global
    ``WORD_LIST`` on every call, so the function does not rely on a compiled
    pattern created in the parent process.

    Args:
        text_chunk_list: Texts to scan; each value is converted with ``str``.

    Returns:
        One dict per text mapping every ``WORD_LIST`` term to its match count.
    """
    chunk_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, WORD_LIST)) + r')\b')
    zero_counts = dict.fromkeys(WORD_LIST, 0)

    chunk_results = []
    for raw_text in text_chunk_list:
        row_counts = zero_counts.copy()
        for term, n_hits in Counter(chunk_pattern.findall(str(raw_text))).items():
            if term in row_counts:
                row_counts[term] = n_hits
        chunk_results.append(row_counts)
    return chunk_results

def count_words_M9_worker(df_chunk):
    """Run the M2 counter on one DataFrame chunk using the global ``WORD_LIST``."""
    return count_words_M2(df_input=df_chunk, word_list_to_count=WORD_LIST)


def count_words_M9(df_input: pd.DataFrame, word_list_to_count: list[str]) -> list[dict[str, int]]:
    """Parallel variant of M2: split the texts into chunks and count in a process pool.

    ``word_list_to_count`` is not used here; the worker ``process_chunk_M2`` counts
    the terms of the global ``WORD_LIST``.

    Args:
        df_input: Frame with a ``'text'`` column.
        word_list_to_count: Accepted for signature parity with the other methods.

    Returns:
        The per-text dicts of all chunks, concatenated in input order.
    """
    texts = df_input['text'].tolist()
    # At least one text per chunk; a remainder produces one extra, shorter chunk.
    step = max(1, len(texts) // NUM_PROCESSES)
    chunks = [texts[start:start + step] for start in range(0, len(texts), step)]

    with multiprocessing.Pool(processes=NUM_PROCESSES) as pool:
        per_chunk_results = pool.map(process_chunk_M2, chunks)

    merged = []
    for chunk_result in per_chunk_results:
        merged.extend(chunk_result)
    return merged


def count_words_M10_worker(df_chunk):
    """Run the M4 counter on one DataFrame chunk using the global ``WORD_LIST``."""
    return count_words_M4(df_input=df_chunk, word_list_to_count=WORD_LIST)

def count_words_M10(df_input: pd.DataFrame, word_list_to_count: list[str]) -> list[dict[str, int]]:
    """Parallel variant of M4: split the frame row-wise and count in a process pool.

    ``word_list_to_count`` is not used here; ``count_words_M10_worker`` passes the
    global ``WORD_LIST`` to M4.

    Args:
        df_input: Frame with a ``'text'`` column.
        word_list_to_count: Accepted for signature parity with the other methods.

    Returns:
        The per-row dicts of all chunks, concatenated in input order.
    """
    # Split by position with iloc instead of np.array_split(df, n): on a DataFrame
    # array_split goes through DataFrame.swapaxes, which newer pandas deprecates.
    # Chunk sizes follow array_split's rule: the first `extra` chunks get one more row.
    base_size, extra = divmod(len(df_input), NUM_PROCESSES)
    df_chunks = []
    start = 0
    for chunk_no in range(NUM_PROCESSES):
        stop = start + base_size + (1 if chunk_no < extra else 0)
        df_chunks.append(df_input.iloc[start:stop])
        start = stop

    with multiprocessing.Pool(processes=NUM_PROCESSES) as pool:
        list_of_results_chunks = pool.map(count_words_M10_worker, df_chunks)

    return [item for sublist in list_of_results_chunks for item in sublist]

# --- Timing and Execution ---
# Registry of every single-process method, keyed by a descriptive label.
methods_to_time = {
    "M0_Loop_StrCount": count_words_M0,
    "M1_Loop_ReFindall_PerTerm": count_words_M1,
    "M2_Loop_SingleReFindall": count_words_M2,
    "M3_PandasApply_SingleRe": count_words_M3,
    "M4_Pandas_strFindall_SingleRe": count_words_M4,
    "M5_Pandas_strCount_PerTerm": count_words_M5,
    "M8_PyLoop_TokenizeMatch": count_words_M8,  # Potentially very slow
}
# The library-backed methods are registered only when their import succeeded.
methods_to_time.update({"M6_FlashText": count_words_M6} if FLASHTEXT_LOADED else {})
methods_to_time.update({"M7_CountVectorizer": count_words_M7} if SKLEARN_LOADED else {})

# Multiprocessing methods added separately due to potential for long setup/run times
# or if user wants to skip them.
# For a fair comparison, the non-parallelized versions are more direct unless specifically testing parallel overhead.
# Adding them if explicitly requested or as part of a comprehensive test.
# methods_to_time["M9_MP_M2"] = count_words_M9
# methods_to_time["M10_MP_M4"] = count_words_M10


# Method label -> elapsed seconds (filled by the timing loop).
timings = {}
# Optional sanity check of the output format on a few rows before timing:
# results_m4_sample = count_words_M4(df_main.head(), WORD_LIST)
# check_output(results_m4_sample, WORD_LIST)


print(f"\n--- Starting Benchmark on {NUM_ROWS} rows ---")
print(f"Using WORD_LIST: {WORD_LIST}\n")

# Methods timed in this run. Only the per-term regex loop (M1) and the two
# multiprocessing variants are enabled; add any of the following to widen the run:
#   "M0_Loop_StrCount": count_words_M0            (likely very slow and flawed)
#   "M2_Loop_SingleReFindall": count_words_M2
#   "M3_PandasApply_SingleRe": count_words_M3
#   "M4_Pandas_strFindall_SingleRe": count_words_M4   (expected best)
#   "M5_Pandas_strCount_PerTerm": count_words_M5
#   "M6_FlashText": count_words_M6                (only if FLASHTEXT_LOADED)
#   "M7_CountVectorizer": count_words_M7          (only if SKLEARN_LOADED)
#   "M8_PyLoop_TokenizeMatch": count_words_M8     (likely very slow)
methods_to_run = {
    "M1_Loop_ReFindall_PerTerm": count_words_M1,  # Likely very slow
    "M9_MP_M2": count_words_M9,
    "M10_MP_M4": count_words_M10,
}


# Time every selected method once on the full frame and validate its output.
for name, method_func in methods_to_run.items():
    print(f"Timing {name}...")
    start_time = time.perf_counter()
    try:
        results = method_func(df_main, WORD_LIST)
        end_time = time.perf_counter()
        elapsed_time = end_time - start_time
        timings[name] = elapsed_time
        print(f"{name} took: {elapsed_time:.4f} seconds.")

        # Structural validation; it runs after the clock has stopped.
        output_ok = check_output(results, WORD_LIST)
        if not output_ok:
             print(f"!! Output validation failed for {name}")
        # Every input row must yield exactly one result dict.
        expected_rows, actual_rows = len(df_main), len(results)
        if actual_rows != expected_rows:
            print(f"!! Length mismatch for {name}: expected {len(df_main)}, got {len(results)}")

    except Exception as e:
        # Keep benchmarking the remaining methods; mark this one as failed.
        print(f"Error during {name}: {e}")
        timings[name] = "Error"

print("\n--- Benchmark Results ---")
for name, t in timings.items():
    # A string entry marks a method that raised; numbers are elapsed seconds.
    print(f"{name}: {t}" if isinstance(t, str) else f"{name}: {t:.4f} seconds")

In [None]:
import re
from collections import Counter
from itertools import combinations
import pandas as pd

def create_fast_counter(word_list):
    """
    Create a word counter function backed by one precompiled regex.

    The pattern contains every term (lower-cased) plus every space-joined pair of
    terms in list order, so the number of alternatives grows quadratically with
    the list length. Matching is case-insensitive and anchored on word boundaries.

    Args:
        word_list (list): Terms to count. Empty or whitespace-only entries are
            ignored, because they would make the pattern match empty strings or
            single spaces and produce spurious counts.

    Returns:
        function: ``count_matches(text) -> dict`` mapping each lower-cased match
        to its number of occurrences; ``{}`` for missing or empty text, and
        always ``{}`` when no usable term was given.
    """
    # Lower-case the terms and drop blank ones.
    words_lower = [word.lower() for word in word_list if word and word.strip()]

    # 1-grams plus all 2-grams built from ordered pairs of terms.
    ngrams = words_lower + [' '.join(pair) for pair in combinations(words_lower, 2)]

    if not ngrams:
        # An empty alternation would match the empty string at every word boundary.
        def count_nothing(text):
            return {}

        return count_nothing

    # Longest first: regex alternation takes the first alternative that matches,
    # so a pair must be tried before the single terms it starts with.
    ngrams.sort(key=len, reverse=True)

    pattern = re.compile(r'\b(?:' + '|'.join(re.escape(ngram) for ngram in ngrams) + r')\b', re.IGNORECASE)

    def count_matches(text):
        if pd.isna(text) or not text:
            return {}
        return dict(Counter(match.lower() for match in pattern.findall(str(text))))

    return count_matches

# Create optimized counters
# Build the regex-based counters from the term lists loaded at the top of the notebook.
# NOTE(review): this cell originally passed `all_terms` and `roles`, neither of which is
# defined in the visible cells (so it raised NameError on a fresh kernel). They are
# replaced here by the lists that feed the same-named 'ai' / 'social' columns in the
# flashtext cell -- confirm this is the intended vocabulary.
# Caution: create_fast_counter adds every pair of terms to its pattern, so long lists
# produce a very large regex.
fast_ai_counter = create_fast_counter(ai_terms)
fast_social_counter = create_fast_counter(atus_roles + onet_roles + nouns)

# Apply with plain Series.apply; this overwrites any existing columns of the same name.
arxiv_df['ai_word_counts'] = arxiv_df['text'].apply(fast_ai_counter)
arxiv_df['social_word_counts'] = arxiv_df['text'].apply(fast_social_counter)