Testing the filtering of stop words in the dataset. To do this we use the Python Natural Language Toolkit (NLTK): https://www.nltk.org/

In [6]:
import nltk
from nltk.corpus import stopwords
import string
import unicodedata
# Fetch the NLTK data packages at runtime. 'stopwords' backs the
# stopwords.words('english') lookup used when pre-processing descriptions.
nltk.download('stopwords')
# NOTE(review): 'punkt_tab' and 'punkt' are presumably Punkt tokenizer data; the
# cells below tokenize with TweetTokenizer, so confirm whether these are still needed.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/jikael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jikael/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /home/jikael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import pandas as pd
import numpy as np
# Load the tab-separated article table and flag rows whose description is an actual
# string (non-string values, e.g. NaN for empty cells, cannot be tokenized later).
df = pd.read_csv('../data/vance.tsv', sep='\t')
df['valid'] = df['description'].map(lambda x: isinstance(x, str))
# .copy() detaches the filtered frame from the original, so adding columns to it
# below does not raise pandas' SettingWithCopyWarning.
df = df[df['valid']].copy()
# Placeholder topic labels in 0..8 (upper bound is exclusive), drawn from a seeded
# generator so that Restart & Run All reproduces the same assignment.
topic_rng = np.random.default_rng(0)
df['topic'] = topic_rng.integers(0, 9, size=len(df))

In [8]:
# Show the description of the second row (position 1) as a sanity check.
# Select the column first, then the position, so no full row is materialized.
df['description'].iloc[1]

"Sen. JD Vance, who says running as Trump's VP pick has cost him friends, is wrapping up his by asking voters to not let political differences come between them."

In [60]:
from nltk.tokenize import TweetTokenizer # use this one because the other one struggles with words like "don't"

def filter_sentence(sentence, stop_words, tokenizer):
    """Tokenize ``sentence`` and drop punctuation, one-character tokens and stop words.

    Args:
        sentence: Text to clean.
        stop_words: Container of lower-case stop words. Tokens are lower-cased
            only for this membership test, so surviving tokens keep their case.
        tokenizer: Object exposing ``tokenize(str)`` that returns an iterable of
            string tokens.

    Returns:
        The surviving tokens joined by single spaces ('' when none survive).
    """
    kept = []
    for token in tokenizer.tokenize(sentence):
        # One-character tokens (lone punctuation marks, stray letters/digits) are dropped.
        if len(token) <= 1:
            continue
        # A token without any letter or digit is pure punctuation/symbols, e.g.
        # '...', '--' or '?!'. Checking only for '...' would let the others through.
        if not any(ch.isalnum() for ch in token):
            continue
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [61]:
def pre_process_descriptions(df):
    """Add a ``description_processed`` column to ``df`` in place.

    Every value of ``df['description']`` is run through ``filter_sentence`` using
    NLTK's English stop-word list and a ``TweetTokenizer``. Nothing is returned.
    """
    english_stop_words = set(stopwords.words('english'))
    tweet_tokenizer = TweetTokenizer()

    def clean(text):
        # Both helpers are built once above and shared across all rows.
        return filter_sentence(text, english_stop_words, tweet_tokenizer)

    df['description_processed'] = df['description'].map(clean)

In [62]:
# Adds the 'description_processed' column to df in place; the call returns nothing.
pre_process_descriptions(df)

In [64]:
def get_corpus_by_category(df):
    """Build one space-joined text blob per topic.

    Returns a dict keyed by each distinct value of ``df.topic`` (in order of first
    appearance). Each value is every ``description_processed`` string belonging to
    that topic, joined with single spaces.
    """
    return {
        label: ' '.join(df.loc[df.topic == label, 'description_processed'])
        for label in df.topic.unique()
    }

In [65]:
# One concatenated, pre-processed text per topic label (dict: topic -> text).
corpus = get_corpus_by_category(df)

In [66]:
# Inspect the per-topic texts; the output is long because each value is a whole topic's text.
corpus

{7: "Republican vice presidential candidate JD Vance rallied supporters Atlanta taking aim Vice President Kamala Harris criticized Harris response President Joe Biden recent remarks final weeks election become race women votes met new JD Vance once-sarcastic defender nation run childless cat ladies replaced man softer side Recently Vance described bein Sen JD Vance R-OH criticized Vice President Kamala Harris bragging economy United States despite recent data showing 28,000 private sector jobs lost post JD Vance Slams Kamala Harris Bragging Economy Los disgusting President Joe Biden called supporters former President Donald Trump garbage campaign event Vice President Kamala Harris likely tip iceberg according Sen JD Vance said Breitbart Ne 2024 campaign former President Trump JD Vance joining Republican National Committee Georgia GOP filing lawsuits Sunday state federal court challenging several Georgia counties allegedly illegally remaining open Chip Somodevilla Getty Images iStock Re

Now we can compute tf-idf. Note that this assumes we can use the raw count of each word as its term frequency, rather than a normalized (relative) frequency.

In [71]:
from sklearn.feature_extraction.text import CountVectorizer
def get_top_n_tfidf(df, n):
    """Return the ``n`` highest-scoring tf-idf terms for every topic in ``df``.

    Each topic's concatenated text (from ``get_corpus_by_category``) is treated as
    one document. Term frequency is the raw count produced by ``CountVectorizer``;
    idf is ``log(N / d_t)``, where ``N`` is the number of topics and ``d_t`` is the
    number of topics whose text contains the term.

    Returns:
        dict mapping topic -> list of ``[term, score]`` pairs, highest score first.
    """
    topic_texts = get_corpus_by_category(df)
    topic_labels = list(topic_texts.keys())

    count_vectorizer = CountVectorizer()
    counts = count_vectorizer.fit_transform(topic_texts.values()).toarray()
    vocabulary = count_vectorizer.get_feature_names_out()

    n_topics = len(topic_texts)  # numerator of idf
    topics_containing_term = (counts > 0).sum(axis=0)  # denominator of idf, per term
    idf = np.log(n_topics / topics_containing_term)
    scores = counts * idf  # idf broadcasts across the topic rows

    top_terms = dict()
    for label, topic_scores in zip(topic_labels, scores):
        # argsort is ascending, so reverse it and keep the first n indices.
        best = np.argsort(topic_scores)[::-1][:n]
        top_terms[label] = [[vocabulary[j], topic_scores[j]] for j in best]

    return top_terms

In [72]:
# Top 10 tf-idf terms per topic (dict: topic -> list of [term, score]).
d = get_top_n_tfidf(df, 10)

In [73]:
# Display the topic -> [[term, score], ...] mapping.
d

{7: [['graham', 10.986122886681098],
  ['lindsey', 10.986122886681098],
  ['speaking', 7.520386983881371],
  ['shapiro', 6.591673732008658],
  ['comparison', 6.591673732008658],
  ['speaker', 6.591673732008658],
  ['johnson', 6.591673732008658],
  ['pales', 6.591673732008658],
  ['whatever', 6.591673732008658],
  ['mcconnell', 5.493061443340549]],
 3: [['watched', 4.512232190328822],
  ['agency', 4.512232190328822],
  ['move', 4.394449154672439],
  ['disgraceful', 4.394449154672439],
  ['responds', 4.394449154672439],
  ['card', 4.394449154672439],
  ['caring', 4.394449154672439],
  ['worried', 4.394449154672439],
  ['health', 4.394449154672439],
  ['they', 4.394449154672439]],
 6: [['stay', 4.394449154672439],
  ['native', 4.394449154672439],
  ['lawrence', 4.394449154672439],
  ['austin', 4.394449154672439],
  ['bronx', 4.394449154672439],
  ['koul', 4.394449154672439],
  ['boy', 4.394449154672439],
  ['construction', 4.394449154672439],
  ['barbershop', 4.394449154672439],
  ['sites

In [74]:
# Print each topic's top-10 [term, score] list followed by a blank line.
# The scores are recomputed by calling get_top_n_tfidf again in this cell.
for v in get_top_n_tfidf(df, 10).values():
    print(v, '\n')

[['graham', 10.986122886681098], ['lindsey', 10.986122886681098], ['speaking', 7.520386983881371], ['shapiro', 6.591673732008658], ['comparison', 6.591673732008658], ['speaker', 6.591673732008658], ['johnson', 6.591673732008658], ['pales', 6.591673732008658], ['whatever', 6.591673732008658], ['mcconnell', 5.493061443340549]] 

[['watched', 4.512232190328822], ['agency', 4.512232190328822], ['move', 4.394449154672439], ['disgraceful', 4.394449154672439], ['responds', 4.394449154672439], ['card', 4.394449154672439], ['caring', 4.394449154672439], ['worried', 4.394449154672439], ['health', 4.394449154672439], ['they', 4.394449154672439]] 

[['stay', 4.394449154672439], ['native', 4.394449154672439], ['lawrence', 4.394449154672439], ['austin', 4.394449154672439], ['bronx', 4.394449154672439], ['koul', 4.394449154672439], ['boy', 4.394449154672439], ['construction', 4.394449154672439], ['barbershop', 4.394449154672439], ['sites', 4.394449154672439]] 

[['level', 6.591673732008658], ['target