Testing the filtering of stop words in the dataset. To do this we use the Python Natural Language Toolkit (NLTK): https://www.nltk.org/

In [4]:
import nltk
from nltk.corpus import stopwords
import string
import unicodedata
# Fetch the NLTK data packages this notebook relies on. When a package is
# already present, NLTK just reports that it is up to date.
nltk.download('stopwords')  # word lists read through nltk.corpus.stopwords
# Tokenizer data packages. NOTE(review): presumably only needed by
# word_tokenize; confirm they are still required if only TweetTokenizer is used.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/jikael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jikael/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /home/jikael/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
import pandas as pd
import numpy as np
# Seed and label count for the placeholder topic assignment, so that a
# Restart & Run All reproduces exactly the same topics.
TOPIC_SEED = 0
N_TOPICS = 9

# Load the tab-separated article data.
df = pd.read_csv('../data/vance.tsv', sep='\t')

# Keep only rows whose description is an actual string (this drops missing
# descriptions, which are not str instances).
df['valid'] = df['description'].map(lambda x: isinstance(x, str))

# .copy() detaches the filtered rows from the original frame, so adding
# columns afterwards does not raise SettingWithCopyWarning.
df = df[df['valid']].copy()

# Placeholder topic label in [0, N_TOPICS) for every remaining row.
df['topic'] = np.random.default_rng(TOPIC_SEED).integers(0, N_TOPICS, size=len(df))

In [15]:
# Look at one raw description (second row by position) to see what the text looks like.
df['description'].iloc[1]

"Sen. JD Vance, who says running as Trump's VP pick has cost him friends, is wrapping up his by asking voters to not let political differences come between them."

In [16]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer # use this one because the other one struggles with words like "don't"

# Try the stop-word filter on one hand-written sentence before applying it
# to the real descriptions.
tokenizer = TweetTokenizer()

example_sent = """This is a sample sentence,
                  showing off the stop words filtration. Don't. Trump's"""

stop_words = set(stopwords.words('english'))

# Split the sentence into tokens.
word_tokens = tokenizer.tokenize(example_sent)

# A token survives only when its lower-cased form is not a stop word; the
# survivors are stitched back together with single spaces.
kept_tokens = [token for token in word_tokens if token.lower() not in stop_words]
filtered_sentence = ' '.join(kept_tokens)

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.', "Don't", '.', "Trump's"]
sample sentence , showing stop words filtration . . Trump's


In [None]:
def filter_sentence(sentence, stop_words, tokenizer):
    """Tokenize ``sentence`` and return it without noise tokens and stop words.

    A token is dropped when any of the following holds:

    * it is a single character (this is how lone punctuation marks are
      removed; it also removes one-letter words and single digits),
    * it contains no letter or digit at all, e.g. ``'...'``, ``'....'``,
      ``'--'`` or ``'!!'``,
    * its lower-cased form is in ``stop_words``.

    Args:
        sentence: Text to clean.
        stop_words: Container of lower-case stop words; only membership
            (``in``) is used on it.
        tokenizer: Object with a ``tokenize(text)`` method that returns an
            iterable of string tokens.

    Returns:
        The surviving tokens, in their original order and casing, joined by
        single spaces. An empty string when nothing survives.
    """
    kept_tokens = [
        token
        for token in tokenizer.tokenize(sentence)
        if len(token) > 1
        # Punctuation-only runs of any length, not just the exact token '...'.
        and any(ch.isalnum() for ch in token)
        and token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)

In [99]:
def pre_process_descriptions(df):
    """Add a ``description_processed`` column to ``df`` in place.

    Each value of the ``description`` column is passed through
    ``filter_sentence`` using the English NLTK stop-word list and a
    ``TweetTokenizer``. Nothing is returned; ``df`` itself is modified.
    """
    english_stop_words = set(stopwords.words('english'))
    tweet_tokenizer = TweetTokenizer()

    def _clean(text):
        # One shared stop-word set and tokenizer are reused for every row.
        return filter_sentence(text, english_stop_words, tweet_tokenizer)

    df['description_processed'] = df['description'].map(_clean)

In [100]:
# Adds the 'description_processed' column to df in place; the function returns nothing.
pre_process_descriptions(df)

In [102]:
def get_corpus_by_category(df):
    """Join the processed descriptions of each topic into one long string.

    Args:
        df: DataFrame with a ``topic`` column and a ``description_processed``
            column of strings.

    Returns:
        dict mapping every distinct topic, in order of first appearance, to
        the space-joined ``description_processed`` values of its rows.
    """
    return {
        topic: ' '.join(df.loc[df.topic == topic, 'description_processed'])
        for topic in df.topic.unique()
    }

In [97]:
# Build one joined text per topic from the processed descriptions.
corpus = get_corpus_by_category(df)

In [98]:
# Display the per-topic corpus dict; the whole dict is shown, so the output can be very long.
corpus

 7: "us Sunday night's edition HBO's Last Week Tonight John Oliver made headlines two big different reasons course 14 minute segment John Oliver shifted rundown reasons Donald Trump JD Vance Politico business leaders worried JD Vance They've watched party move away Donald Trump They're worried chosen successor could accelerate shift Business leaders watched growing frustration Donald Republican Vice-Presidential nominee JD Vance recently triggered row making controversial remarks transgender nonbinary people Ohio Senator JD Vance still likes Mark Robinson even boss campaigning North Carolina Friday Republican vice presidential nominee gave quick shout conservative outcast realizing midsentence runni latest news live updates 2024 election Follow Donald Trump Kamala Harris Texas Barack Obama JD Vance North Carolina one data point suggests Donald Trump JD Vance headed resounding victory Nov Read post Donald Trump Class Traitor Par Excellence appeared first Daily Signal Former President Do

Now we can perform tf-idf. Note that this uses the raw count of each word as the term frequency, rather than a normalized frequency.

In [83]:
# Display the per-topic corpus dict that the tf-idf step works from.
corpus

 7: "us Sunday night's edition HBO's Last Week Tonight John Oliver made headlines two big different reasons course 14 minute segment John Oliver shifted rundown reasons Donald Trump JD Vance Politico business leaders worried JD Vance They've watched party move away Donald Trump They're worried chosen successor could accelerate shift Business leaders watched growing frustration Donald Republican Vice-Presidential nominee JD Vance recently triggered row making controversial remarks transgender nonbinary people Ohio Senator JD Vance still likes Mark Robinson even boss campaigning North Carolina Friday Republican vice presidential nominee gave quick shout conservative outcast realizing midsentence runni latest news live updates 2024 election Follow Donald Trump Kamala Harris Texas Barack Obama JD Vance North Carolina one data point suggests Donald Trump JD Vance headed resounding victory Nov Read post Donald Trump Class Traitor Par Excellence appeared first Daily Signal Former President Do

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Two views of the same text: one string per topic, one string per description.
corpus_by_category = get_corpus_by_category(df)
corpus_by_document = df['description_processed'].to_list()

# Learn the vocabulary once, from the individual descriptions ...
document_vectorizer = CountVectorizer()
counts_by_document = document_vectorizer.fit_transform(corpus_by_document).toarray()

# ... and reuse exactly that vocabulary for the per-topic counts, so column j
# refers to the same word in both matrices by construction instead of
# relying on two separate fits happening to agree.
category_vectorizer = CountVectorizer(vocabulary=document_vectorizer.vocabulary_)
counts_by_category = category_vectorizer.fit_transform(corpus_by_category.values()).toarray()

N = len(corpus_by_document)
# Number of descriptions in which each word occurs at least once.
document_counts = (counts_by_document > 0).sum(axis=0)
# idf is computed over individual descriptions, not over topics.
idf_scores = np.log(N / document_counts)
idf_scores.shape

(2979,)

In [145]:
def compute_n_largest_idf_scores(df, n):
    """Return the ``n`` highest-scoring tf-idf words for every topic in ``df``.

    Despite the function name, words are ranked by tf-idf rather than by idf
    alone. The term frequency is the raw count of a word across all processed
    descriptions of a topic; the idf is ``log(N / document_count)``, where
    ``N`` is the number of descriptions and ``document_count`` is the number
    of descriptions containing the word.

    Args:
        df: DataFrame with a ``topic`` column and a ``description_processed``
            column of strings.
        n: Number of words to return per topic. Values of zero or below give
            an empty result for every topic.

    Returns:
        dict mapping each topic to a NumPy array holding at most ``n`` words,
        highest score first.
    """
    corpus_by_category = get_corpus_by_category(df)
    corpus_by_document = df['description_processed'].to_list()

    # A single vectorizer fitted on the descriptions supplies the vocabulary
    # for both matrices, so their columns (and feature_names) are aligned by
    # construction.
    vectorizer = CountVectorizer()
    counts_by_document = vectorizer.fit_transform(corpus_by_document).toarray()
    counts_by_category = vectorizer.transform(list(corpus_by_category.values())).toarray()
    feature_names = vectorizer.get_feature_names_out()

    n_documents = len(corpus_by_document)
    # Number of descriptions in which each word occurs at least once.
    document_counts = (counts_by_document > 0).sum(axis=0)
    # idf is computed over individual descriptions, not over topics.
    idf_scores = np.log(n_documents / document_counts)
    tf_idf_scores = counts_by_category * idf_scores

    # Slicing the reversed ranking with [:n] yields nothing for n == 0,
    # whereas [-n:] on the ascending ranking would yield the whole vocabulary.
    n_words = max(n, 0)

    top_words_by_category = dict()
    for row, category in enumerate(corpus_by_category):
        ranked_indices = tf_idf_scores[row, :].argsort()[::-1]  # highest score first
        top_words_by_category[category] = feature_names[ranked_indices[:n_words]]

    return top_words_by_category

In [149]:
# Print the ten highest-scoring words of every topic, one array per block.
top_words_by_category = compute_n_largest_idf_scores(df, 10)
for top_words in top_words_by_category.values():
    print(top_words, '\n')

['president' 'trump' 'joe' 'former' 'donald' 'kamala' 'tapper' 'harris'
 'rogan' 'vice'] 

['post' 'trump' 'presidential' 'republican' 'donald' 'election' 'vice'
 'said' 'new' 'campaign'] 

['trump' 'chinese' 'presidential' 'donald' 'daily' 'raddatz' 'vice' 'sen'
 'hackers' 'news'] 

['trump' 'election' 'president' 'donald' 'harris' 'back' '2024' 'podcast'
 'mcconnell' 'wilson'] 

['presidential' 'interview' 'kamala' 'vice' 'republican' 'harris'
 'nominee' 'joe' 'candidate' 'trump'] 

['president' 'newsnation' 'trump' 'donald' 'ohio' 'town' 'hall' 'running'
 'former' 'republican'] 

['president' 'interview' 'trump' 'former' 'harris' 'kamala' 'donald'
 'said' 'sen' 'mate'] 

['nominee' 'president' 'former' 'presidential' 'vice' 'republican' 'sen'
 'targeted' 'donald' 'news'] 

['said' 'presidential' 'republican' 'nominee' 'joe' 'former' 'president'
 'trump' 'gop' 'vice'] 

