# POS tagging lists

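This notebook uses spaCy part-of-speech (POS) tagging to build word-frequency lists per category (nouns, adjectives, auxiliary verbs, verbs, adverbs, pronouns, and interjections). The lists are used for further bias identification.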

In [1]:
import pandas as pd
import numpy as np
import spacy

In [2]:
# Load the cleaned posts that will be POS-tagged further down.
df = pd.read_csv(
    filepath_or_buffer='../data/clean/clean_posts_for_POS.csv',
)

In [3]:
# Preview the first five rows to check the loaded columns.
df.head(n=5)

Unnamed: 0,PostId,POS_tagging_text
0,80074859,While in Camden today I had the privilege to v...
1,80069324,I had a great visit this afternoon with the Wi...
2,80072838,"On November 29, the State Canvassing Board cer..."
3,80076411,The State Canvassing Board certified the 2022 ...
4,80107630,: To all Baltimore County Election Judges who ...


In [6]:
# Number of rows (posts) in the frame.
df.shape[0]

22311

In [4]:
# Missing values per column (isnull is an alias of isna).
df.isnull().sum()

PostId              0
POS_tagging_text    0
dtype: int64

In [5]:
import spacy
import pandas as pd
import re
from collections import Counter

# Load the spaCy pipeline with the NER and parser components disabled.
# Model alternatives noted by the author: en_core_web_trf, en_core_web_lg.
nlp = spacy.load(
    "en_core_web_trf",
    disable=["ner", "parser"],
)

# Enhanced POS categorization
def extract_pos(doc):
    """Group the lower-cased tokens of ``doc`` by coarse part-of-speech tag.

    Args:
        doc: An iterable of tokens, each exposing ``text`` and ``pos_``
            attributes (for example a spaCy ``Doc``).

    Returns:
        A 7-tuple of lists, in this order: nouns (``NOUN`` and ``PROPN``),
        adjectives (``ADJ``), auxiliary verbs (``AUX``), verbs (``VERB``),
        adverbs (``ADV``), pronouns (``PRON``) and other (``INTJ`` only).
        Tokens carrying any other tag are dropped. Words keep their document
        order and duplicates are retained.
    """
    # Maps each handled POS tag to the name of the list that collects it.
    tag_to_bucket = {
        "NOUN": "nouns",
        "PROPN": "nouns",
        "ADJ": "adjectives",
        "AUX": "aux_verbs",
        "VERB": "verbs",
        "ADV": "adverbs",
        "PRON": "pronouns",
        "INTJ": "other",
    }
    buckets = {
        name: []
        for name in (
            "nouns", "adjectives", "aux_verbs", "verbs",
            "adverbs", "pronouns", "other",
        )
    }

    for token in doc:
        lowered = token.text.lower()
        bucket_name = tag_to_bucket.get(token.pos_)
        if bucket_name is not None:
            buckets[bucket_name].append(lowered)

    return (
        buckets["nouns"],
        buckets["adjectives"],
        buckets["aux_verbs"],
        buckets["verbs"],
        buckets["adverbs"],
        buckets["pronouns"],
        buckets["other"],
    )

# Process texts and collect one 7-tuple of word lists per post.
# nlp.pipe streams the texts through the pipeline in batches, which is
# spaCy's recommended way to process many texts; calling nlp(text) once per
# row via Series.apply runs the model one document at a time.
pos_columns = ['nouns', 'adjectives', 'aux_verbs', 'verbs', 'adverbs', 'pronouns', 'other']
pos_results = [extract_pos(doc) for doc in nlp.pipe(df['POS_tagging_text'])]

# Split into separate columns (one list-valued column per category). The
# tuples are expanded directly, so no intermediate 'pos_results' column has to
# be added to df and dropped again afterwards.
df[pos_columns] = pd.DataFrame(pos_results, index=df.index, columns=pos_columns)

# Generate CSVs for each category: df column name -> file name prefix
category_mapping = {
    'adjectives': 'Adjective',
    'nouns': 'Noun',
    'aux_verbs': 'Auxiliary_Verb',
    'verbs': 'Verb',
    'adverbs': 'Adverb',
    'pronouns': 'Pronoun',
    'other': 'Other'
}

for df_col, category in category_mapping.items():
    # Count occurrences of every word in the category across all posts
    word_counts = Counter(word for words in df[df_col] for word in words)

    # most_common() returns (word, count) pairs sorted by descending count and
    # keeps first-seen order for equal counts, so the row order of the CSV is
    # stable (a default sort_values gives no ordering guarantee for ties).
    count_df = pd.DataFrame(word_counts.most_common(), columns=['word', 'count'])

    # Written to the current working directory
    count_df.to_csv(f"{category}_counts.csv", index=False)
    print(f"Created {category}_counts.csv with {len(count_df)} unique words")

  from .autonotebook import tqdm as notebook_tqdm


Created Adjective_counts.csv with 2028 unique words
Created Noun_counts.csv with 11106 unique words
Created Auxiliary_Verb_counts.csv with 58 unique words
Created Verb_counts.csv with 4079 unique words
Created Adverb_counts.csv with 620 unique words
Created Pronoun_counts.csv with 112 unique words
Created Other_counts.csv with 142 unique words
