Testing stop-word filtering on the dataset. To do this we use the Python Natural Language Toolkit (NLTK): https://www.nltk.org/

In [6]:
import nltk
from nltk.corpus import stopwords
import string
import unicodedata
# Fetch the NLTK data packages this notebook asks for. When a package is
# already present, nltk only reports it as up-to-date.
nltk.download('stopwords')  # word lists read later through stopwords.words('english')
nltk.download('punkt_tab')  # presumably tokenizer data for word_tokenize -- TODO confirm it is still needed
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/jikael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jikael/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /home/jikael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import pandas as pd
import numpy as np

# Seed for the placeholder topic assignment below, so that a
# "Restart Kernel -> Run All" reproduces the same topics (and therefore the
# same tf-idf rankings further down).
TOPIC_SEED = 0

# Load the tab-separated article table.
df = pd.read_csv('../data/vance.tsv', sep='\t')

# Flag rows whose description is a real string; anything else (e.g. a missing
# value, which pandas reads as NaN) cannot be tokenized later.
df['valid'] = df['description'].map(lambda x: isinstance(x, str))

# .copy() gives an independent frame, so the column assignments that follow do
# not write into a slice of the unfiltered frame (SettingWithCopyWarning).
df = df[df['valid']].copy()

# Assign every row a random topic id in 0..8 (upper bound 9 is exclusive).
# NOTE(review): the frame displayed later lists 'topic' ahead of 'valid', which
# suggests the file already has a 'topic' column that this overwrites -- confirm
# that the random placeholder is intended.
rng = np.random.default_rng(TOPIC_SEED)
df['topic'] = rng.integers(0, 9, size=len(df))

In [8]:
# Show the description of the second row (position 1) as a sample.
df.iloc[1]['description']

"Sen. JD Vance, who says running as Trump's VP pick has cost him friends, is wrapping up his by asking voters to not let political differences come between them."

In [9]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer  # used below because the other tokenizer struggles with words like "don't"

tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))

example_sent = """This is a sample sentence,
                  showing off the stop words filtration. Don't. Trump's"""

word_tokens = tokenizer.tokenize(example_sent)

# Compare each token, lower-cased, against the stop-word set and keep only the
# tokens that are not in it; the kept tokens retain their original casing.
filtered_sentence = ' '.join(w for w in word_tokens if w.lower() not in stop_words)

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.', "Don't", '.', "Trump's"]
sample sentence , showing stop words filtration . . Trump's


In [23]:
import re
def filter_stop_words(title, stop_words):
    """Reduce ``title`` to lower-case ASCII words with the stop words removed.

    Accented letters are folded to their base letter first (``"Hernández"``
    becomes ``"hernandez"``), so such words are kept whole instead of being
    split at the accented character. The text is then split on every
    character that is not ``a``-``z``, which also discards digits and
    punctuation. Letters without a decomposition (e.g. ``"ø"``) still act as
    separators.

    Args:
        title: Text to clean; must be a ``str``.
        stop_words: Container of lower-case words to drop (membership is
            tested with ``in``).

    Returns:
        The remaining words joined by single spaces. Words of length one
        (such as the ``"s"`` left over from ``"trump's"``) are dropped too.
    """
    # NFKD separates a base letter from its accent; dropping the combining
    # marks then leaves plain ASCII letters where a decomposition exists.
    decomposed = unicodedata.normalize('NFKD', title)
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # After lower-casing, everything outside a-z is a separator. The pieces
    # therefore contain only letters and need no further punctuation stripping.
    words = re.split('[^a-z]', folded.lower())
    return ' '.join(word for word in words if len(word) > 1 and word not in stop_words)

In [10]:
def filter_sentence(sentence, stop_words, tokenizer):
    """Tokenize ``sentence`` and drop short tokens, ellipses and stop words.

    Args:
        sentence: Text to filter.
        stop_words: Container of lower-case stop words.
        tokenizer: Object with a ``tokenize(text)`` method returning tokens.

    Returns:
        The surviving tokens, in their original casing, joined by spaces.
    """
    kept = []
    for token in tokenizer.tokenize(sentence):
        # Single-character tokens (which covers one-character punctuation and
        # symbols) and the literal '...' token are discarded.
        if len(token) <= 1 or token == '...':
            continue
        # Stop words are matched case-insensitively.
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [24]:
def pre_process_descriptions(df):
    """Add a ``description_processed`` column to ``df`` in place.

    Every value of ``df['description']`` is passed through
    ``filter_stop_words`` together with NLTK's English stop-word list, and the
    result is stored in ``df['description_processed']``.

    Args:
        df: DataFrame with a ``description`` column of strings. It is modified
            in place.

    Returns:
        None.
    """
    # Build the set once so each membership test inside filter_stop_words is cheap.
    stop_words = set(stopwords.words('english'))
    df['description_processed'] = df['description'].map(lambda x: filter_stop_words(x, stop_words))

In [25]:
pre_process_descriptions(df)

In [26]:
df

Unnamed: 0.1,Unnamed: 0,source,author,title,description,url,publishedAt,topic,negative/neutral/positive,valid,description_processed
0,2,Biztoc.com,breakingthenews.net,"JD Vance slams Harris: 'Tomorrow, you're fired'",Republican vice presidential candidate JD Vanc...,https://biztoc.com/x/c4c538f75498736e,2024-11-04T22:53:56Z,7,,True,republican vice presidential candidate jd vanc...
1,3,Biztoc.com,nbcnews.com,'Feels kind of weird': Vance reflects and look...,"Sen. JD Vance, who says running as Trump's VP ...",https://biztoc.com/x/339813fc9b9bc9be,2024-11-04T22:53:24Z,3,,True,sen jd vance says running trump vp pick cost f...
2,6,NBC News,Henry J. Gomez and Alec Hernández,'Feels kind of weird': Vance reflects and look...,"Sen. JD Vance, who says running as Trump's VP ...",https://www.nbcnews.com/politics/2024-election...,2024-11-04T22:15:42Z,6,,True,sen jd vance says running trump vp pick cost f...
3,7,Bleeding Cool News,Ray Flook,Last Week Tonight: HBO Releases John Oliver Se...,"For us, Sunday night's edition of HBO's Last W...",https://bleedingcool.com/tv/last-week-tonight-...,2024-11-04T15:07:08Z,5,,True,us sunday night edition hbo last week tonight ...
4,10,New York Post,Emily Crane,JD Vance says Trump is ‘fired up’ over death o...,Republican vice presidential candidate JD Vanc...,https://nypost.com/2024/11/04/us-news/jd-vance...,2024-11-04T15:06:04Z,8,,True,republican vice presidential candidate jd vanc...
...,...,...,...,...,...,...,...,...,...,...,...
507,65,Slashdot.org,feedfeeder,Jake Tapper and JD Vance spar over John Kelly....,Jake Tapper and JD Vance spar over John Kelly....,https://slashdot.org/firehose.pl?op=view&amp;i...,2024-10-27T17:13:15Z,7,,True,jake tapper jd vance spar john kelly watch ful...
508,66,Slashdot.org,feedfeeder,"Vance, Tapper spar over ex-Trump administratio...","Vance, Tapper spar over ex-Trump administratio...",https://slashdot.org/firehose.pl?op=view&amp;i...,2024-10-27T17:52:36Z,4,,True,vance tapper spar ex trump administration offi...
509,67,New York Post,Carson Swick,JD Vance slams Biden-Harris school lunch polic...,Sen. JD Vance (R-Ohio) kept politics mostly ou...,https://nypost.com/2024/10/26/us-news/jd-vance...,2024-10-27T02:35:21Z,4,,True,sen jd vance ohio kept politics mostly appeara...
510,68,CNN,Kit Maher,Vance insists Trump’s ‘enemy from within’ comm...,Republican vice presidential nominee JD Vance ...,https://www.cnn.com/2024/10/27/politics/jd-van...,2024-10-27T13:00:49Z,2,,True,republican vice presidential nominee jd vance ...


In [27]:
def get_corpus_by_category(df):
    """Concatenate the processed descriptions of each topic into one string.

    Args:
        df: DataFrame with ``topic`` and ``description_processed`` columns.

    Returns:
        dict mapping each distinct topic value, in order of first appearance,
        to the space-joined ``description_processed`` texts of its rows.
    """
    topics = df['topic']
    processed = df['description_processed']
    return {
        topic: ' '.join(processed[topics == topic])
        for topic in topics.unique()
    }

In [28]:
corpus = get_corpus_by_category(df)

In [29]:
corpus

{7: 'republican vice presidential candidate jd vance rallied supporters atlanta taking aim vice president kamala harris criticized harris response president joe biden recent remarks final weeks election become race women votes met new jd vance sarcastic defender nation run childless cat ladies replaced man softer side recently vance described bein sen jd vance oh criticized vice president kamala harris bragging economy united states despite recent data showing private sector jobs lost post jd vance slams kamala harris bragging economy los disgusting president joe biden called supporters former president donald trump garbage campaign event vice president kamala harris likely tip iceberg according sen jd vance said breitbart ne campaign former president trump jd vance joining republican national committee georgia gop filing lawsuits sunday state federal court challenging several georgia counties allegedly illegally remaining open chip somodevilla getty images istock rebecca zisser bi pre

Now we can compute TF-IDF. Note that this uses the raw count of each word as its term frequency, rather than a normalized frequency.

In [30]:
corpus

{7: 'republican vice presidential candidate jd vance rallied supporters atlanta taking aim vice president kamala harris criticized harris response president joe biden recent remarks final weeks election become race women votes met new jd vance sarcastic defender nation run childless cat ladies replaced man softer side recently vance described bein sen jd vance oh criticized vice president kamala harris bragging economy united states despite recent data showing private sector jobs lost post jd vance slams kamala harris bragging economy los disgusting president joe biden called supporters former president donald trump garbage campaign event vice president kamala harris likely tip iceberg according sen jd vance said breitbart ne campaign former president trump jd vance joining republican national committee georgia gop filing lawsuits sunday state federal court challenging several georgia counties allegedly illegally remaining open chip somodevilla getty images istock rebecca zisser bi pre

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Two views of the same text: one concatenated string per topic, and one
# string per article.
corpus_by_category = get_corpus_by_category(df)
corpus_by_document = df['description_processed'].to_list()

category_vectorizer = CountVectorizer()
document_vectorizer = CountVectorizer()

# Dense word-count matrices (rows: categories / documents, columns: vocabulary).
counts_by_category = category_vectorizer.fit_transform(corpus_by_category.values()).toarray()
counts_by_document = document_vectorizer.fit_transform(corpus_by_document).toarray()

# Document frequency: in how many documents each word occurs at least once.
N = len(corpus_by_document)
document_counts = (counts_by_document > 0).sum(axis=0)

# idf = log(N / document frequency), computed over individual documents.
idf_scores = np.log(N / document_counts)
idf_scores.shape

(2918,)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
def get_top_n_tfidf(df, n):
    """Return the ``n`` highest tf-idf words of every topic.

    Each topic's concatenated text (see ``get_corpus_by_category``) is treated
    as one document and scored with a default ``TfidfVectorizer``.

    Args:
        df: DataFrame with ``topic`` and ``description_processed`` columns.
        n: Number of words to keep per topic. If it exceeds the vocabulary
            size, all words are returned.

    Returns:
        dict mapping each topic to a list of ``[word, score]`` pairs ordered
        from highest to lowest score.
    """
    tfidf = TfidfVectorizer()
    corpus_by_category = get_corpus_by_category(df)
    categories = list(corpus_by_category.keys())

    # Convert to a dense array once; rows follow the order of `categories`.
    X = tfidf.fit_transform(corpus_by_category.values()).toarray()
    print(X.shape)
    feature_names = tfidf.get_feature_names_out()
    print(feature_names)

    out_dict = dict()
    for category, row in zip(categories, X):
        # argsort is ascending, so reverse it before taking the first n.
        top_indices = np.argsort(row)[::-1][:n]
        out_dict[category] = [[feature_names[j], row[j]] for j in top_indices]

    return out_dict

In [32]:
d = get_top_n_tfidf(df, 10)

(9, 2918)
['aaron' 'abc' 'abducting' ... 'zisser' 'zoomed' 'zuckerberg']


In [18]:
d

{7: [['vance', 0.5273383751176066],
  ['jd', 0.4254661890153417],
  ['trump', 0.3116090398422221],
  ['donald', 0.1737819645273931],
  ['president', 0.16179700145653841],
  ['said', 0.13782707531482902],
  ['vice', 0.13183459377940165],
  ['presidential', 0.10786466763769227],
  ['post', 0.10187218610226492],
  ['republican', 0.09587970456683757]],
 3: [['vance', 0.5088096145775014],
  ['jd', 0.417950754831519],
  ['trump', 0.30892012313634015],
  ['sen', 0.17263183351736655],
  ['donald', 0.17263183351736655],
  ['presidential', 0.14537417559357182],
  ['vice', 0.1362882896189736],
  ['republican', 0.12720240364437535],
  ['harris', 0.11811651766977711],
  ['former', 0.11811651766977711]],
 6: [['vance', 0.5283863642838238],
  ['jd', 0.40362847271680985],
  ['trump', 0.3375801771813319],
  ['donald', 0.16879008859066594],
  ['president', 0.15411268958278196],
  ['former', 0.13943529057489795],
  ['vice', 0.13209659107095595],
  ['presidential', 0.12475789156701396],
  ['republican', 0

In [145]:
def compute_n_largest_idf_scores(df, n):
    """Return the ``n`` words with the largest tf-idf score for every topic.

    Term frequency is the raw count of a word in a topic's concatenated text.
    Inverse document frequency is ``log(N / df_w)``, where ``N`` is the number
    of individual descriptions and ``df_w`` is the number of descriptions that
    contain the word.

    Args:
        df: DataFrame with ``topic`` and ``description_processed`` columns.
        n: Number of words to return per topic (``0`` yields empty arrays).

    Returns:
        dict mapping each topic to an array of its top words, best first.
    """
    # Texts grouped per topic and per individual document.
    corpus_by_category = get_corpus_by_category(df)
    corpus_by_document = df['description_processed'].to_list()

    # Fit the vocabulary once, on the documents, and reuse it for the topic
    # texts. This guarantees that the count columns, the idf vector and the
    # feature names all refer to the same words in the same order.
    vectorizer = CountVectorizer()
    counts_by_document = vectorizer.fit_transform(corpus_by_document).toarray()
    counts_by_category = vectorizer.transform(list(corpus_by_category.values())).toarray()
    feature_names = vectorizer.get_feature_names_out()

    # Every vocabulary word occurs in at least one document, so the document
    # counts are never zero and the division below is safe.
    N = len(corpus_by_document)
    document_counts = (counts_by_document > 0).sum(axis=0)
    idf_scores = np.log(N / document_counts)
    tf_idf_scores = counts_by_category * idf_scores

    top_words_by_category = dict()
    for i, category in enumerate(corpus_by_category.keys()):
        scores = tf_idf_scores[i, :]
        # argsort is ascending: reverse first, then slice, so n == 0 gives an
        # empty selection instead of the whole vocabulary.
        top_scoring_indices = scores.argsort()[::-1][:n]
        top_words_by_category[category] = feature_names[top_scoring_indices]

    return top_words_by_category

In [149]:
# Print the ten top-scoring words of every topic, each followed by a blank line.
top_words_by_topic = compute_n_largest_idf_scores(df, 10)
for top_words in top_words_by_topic.values():
    print(top_words, '\n')

['president' 'trump' 'joe' 'former' 'donald' 'kamala' 'tapper' 'harris'
 'rogan' 'vice'] 

['post' 'trump' 'presidential' 'republican' 'donald' 'election' 'vice'
 'said' 'new' 'campaign'] 

['trump' 'chinese' 'presidential' 'donald' 'daily' 'raddatz' 'vice' 'sen'
 'hackers' 'news'] 

['trump' 'election' 'president' 'donald' 'harris' 'back' '2024' 'podcast'
 'mcconnell' 'wilson'] 

['presidential' 'interview' 'kamala' 'vice' 'republican' 'harris'
 'nominee' 'joe' 'candidate' 'trump'] 

['president' 'newsnation' 'trump' 'donald' 'ohio' 'town' 'hall' 'running'
 'former' 'republican'] 

['president' 'interview' 'trump' 'former' 'harris' 'kamala' 'donald'
 'said' 'sen' 'mate'] 

['nominee' 'president' 'former' 'presidential' 'vice' 'republican' 'sen'
 'targeted' 'donald' 'news'] 

['said' 'presidential' 'republican' 'nominee' 'joe' 'former' 'president'
 'trump' 'gop' 'vice'] 

