In [3]:
import pandas as pd
from htrc_features import Volume
from collections import Counter, defaultdict
import numpy as np
import re

In [4]:
# Location of the article corpus (a relative path, so it depends on the working directory).
ARTICLES_CSV = '../articles.csv'

# Load the corpus; later cells read each article's body from its 'text' column.
article_df = pd.read_csv(ARTICLES_CSV)

In [5]:
# Tokens excluded from the word ranking. All entries are lowercase, matching the
# lowercased, whitespace-split tokens they are compared against.
STOPS = [
    # pronouns
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    # interrogatives and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # forms of be / have / do
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    # prepositions
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs of time, place and manner
    "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    # quantifiers
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    # negation and degree
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    # contraction fragments and auxiliaries
    "s", "t", "can", "will", "just", "don", "should", "now",
    # extra entries appended to the general-purpose list above
    "ve", "ll", "also", "!", "it's", "user", "-",
]

In [6]:
# Corpus-wide token statistics, gathered in a single pass over the articles:
#   word_freq          -> total number of occurrences of each token across all articles
#   word_article_count -> number of distinct articles each token appears in
# Both are Counters (dict subclasses), so plain dict-style lookups keep working.
word_article_count = Counter()
word_freq = Counter()

# Tokenisation is intentionally simple: lowercase, then split on whitespace. Punctuation
# therefore stays attached to words (e.g. "data," and "data" are counted as different tokens).
# Missing texts (NaN) are treated as empty articles instead of raising on `.lower()`.
for text in article_df['text'].fillna('').astype(str):
    tokens = text.lower().split()

    # Every occurrence counts towards the overall frequency.
    word_freq.update(tokens)

    # De-duplicate so that one article adds at most 1 to a token's article count.
    word_article_count.update(set(tokens))

# One row per distinct token, in order of first appearance in the corpus.
word_freq_df = pd.DataFrame(list(word_freq.items()), columns=['Word', 'Frequency'])

# Number of articles each word is mentioned in.
word_freq_df['Articles Mentioned'] = [word_article_count[word] for word in word_freq_df['Word']]

In [7]:
# TF-IDF scoring helper
def calculate_tfidf(word_freq, total_articles, articles_mentioned):
    """Return the TF-IDF weight ``word_freq * (ln(total_articles / articles_mentioned) + 1)``.

    Args:
        word_freq: Number of occurrences of the word (the term-frequency factor).
        total_articles: Number of articles in the corpus.
        articles_mentioned: Number of articles that contain the word.

    Returns:
        The frequency scaled by the inverse-document-frequency factor. The ``+ 1``
        means a word that appears in every article keeps a weight equal to its
        frequency rather than dropping to zero.
    """
    document_ratio = total_articles / articles_mentioned
    inverse_doc_freq = 1 + np.log(document_ratio)
    return word_freq * inverse_doc_freq

# Score every word in one vectorised call. calculate_tfidf only uses arithmetic and
# np.log, which work element-wise on pandas Series, so whole columns can be passed
# directly instead of invoking a Python lambda once per row.
word_freq_df['TF-IDF'] = calculate_tfidf(
    word_freq_df['Frequency'],
    len(article_df),
    word_freq_df['Articles Mentioned'],
)

# Remove rows with stop words (exact match against the lowercase entries in STOPS)
word_freq_df = word_freq_df[~word_freq_df['Word'].isin(STOPS)]

# Remove rows whose token contains no ASCII letter at all (pure digits / punctuation)
word_freq_df = word_freq_df[word_freq_df['Word'].str.contains(r'[a-zA-Z]', regex=True)]

# Sort by TF-IDF, highest first, for display
word_freq_df = word_freq_df.sort_values(by='TF-IDF', ascending=False)

In [8]:
# Number of top-ranked words to preview; the frame is already sorted by descending TF-IDF.
TOP_N_WORDS = 20
word_freq_df.head(TOP_N_WORDS)

Unnamed: 0,Word,Frequency,Articles Mentioned,TF-IDF
640,learning,2026,256,2582.958514
1049,data,1966,259,2483.559072
1776,neural,1500,206,2238.310142
1997,network,1288,184,2067.429558
188,machine,1481,237,2002.345751
80,one,1654,297,1862.984209
1406,model,897,143,1665.942755
71,would,1230,250,1597.305075
473,like,1413,298,1586.784084
1545,deep,991,187,1574.673545
